====== Converting MS Word files to Dokuwiki format ======
Known problems:
* Not language-safe: only works with cp1252 encoding.
* [[WP>Antiword]] only supports Word formats up to 2003 (catdoc may be preferable).
====== Word -> text: antiword ======
See [[WP>antiword]].
====== Text -> Dokuwiki: interlacing footnotes ======
#!/usr/bin/ruby
# doc2doku (the wrapper script below runs this file as interlace-footnotes.rb)
# Takes raw antiword output and converts it, to some extent,
# to DokuWiki format (including interlacing footnotes into the text).
# Matches a run of 23 dashes followed by a newline. The script starts
# collecting footnotes after the first line that matches, and stops printing
# when it reaches that line.
DELIMITER = /\-{23}\n/

# Typographic punctuation (labelled cp1252 by the original author) mapped to
# the plain ASCII text that replaces it in the output.
patterns = {
  "—" => "---", # em dash
  "–" => "--",  # en dash
  "‘" => "'",   # left single quote
  "’" => "'",   # right single quote
  "“" => '"',   # left double quote
  "”" => '"',   # right double quote
  "…" => "..."  # ellipsis
}
# functions
# Line-classification predicates added to String for use by the output loop.
# The regexp-based predicates return the match index (truthy) or nil.
class String
  # Truthy when a line starts with "=" and ends with "=".
  def title?
    /^=.*=$/ =~ self
  end

  # Truthy when a line starts with ">".
  def quote?
    /^>/ =~ self
  end

  # true when the string contains nothing but whitespace.
  def blank?
    strip.empty?
  end

  # Truthy when a line starts with "|" or "^".
  def table?
    /^[|^]/ =~ self
  end
end
# Find footnotes.
#
# Reads the file named by the first command-line argument into `input`
# (one String per line). Everything after the first line matching DELIMITER
# is scanned for footnote definitions of the form "[N] text"; these are
# collected into `footnotes`, indexed by N. `footnotes` stays false when the
# input has no DELIMITER line, which later code uses to detect "no footnotes".
inputname = ARGV[0]
abort "Usage: #{$0} file" unless inputname
input = File.readlines(inputname)

footnotes = false
current = nil # number of the footnote most recently started
input.each do |line|
  if footnotes
    if line =~ /\[(\d+)\]\s(.*)/
      # New footnote. The trailing newline is not required, so a final line
      # without one is still recognised ("." never matches "\n").
      current = $1.to_i
      footnotes[current] = $2
    elsif current && !line.strip.empty?
      # Continuation line: append it to the footnote it belongs to. The
      # original tested $2 here, which is always nil after a failed match,
      # so continuation text was silently dropped.
      footnotes[current] += " #{line.strip}"
    end
  elsif line =~ DELIMITER
    footnotes = []
  end
end
# Insert footnotes in text.
#
# Replaces every "[N]" marker (N of one to three digits) whose number has a
# collected footnote with the DokuWiki inline form "((footnote text))".
# Markers without a matching footnote are left untouched.
#
# Each line is processed in a single pass with the block form of gsub!, so:
# * a footnote whose text contains its own marker cannot cause an endless
#   loop (the inserted text is not rescanned);
# * an unknown marker early in a line no longer prevents later markers on
#   the same line from being replaced;
# * backslashes in footnote text are inserted literally instead of being
#   interpreted as replacement back-references.
if footnotes
  input.each do |line|
    line.gsub!(/\[(\d{1,3})\]/) do |marker|
      note = footnotes[$1.to_i]
      note ? "((#{note}))" : marker
    end
  end
else
  # Report on stderr so the notice does not end up in the converted output
  # when stdout is redirected to a file.
  warn "No footnotes found."
end
# Normalise typographic punctuation.
#
# Replaces every occurrence of each key of `patterns` with its plain-text
# value, modifying the lines of `input` in place. gsub! with a String pattern
# matches literally and handles all occurrences in one pass, replacing the
# original include?/sub!/redo loop that rescanned the line per replacement.
input.each do |line|
  patterns.each { |broken, fixed| line.gsub!(broken, fixed) }
end
# Quotes and titles.
#
# Rewrites lines of `input` in place based on their leading whitespace:
# * four or more whitespace characters -> quote line  ("> text")
# * exactly three                      -> heading     ("======text======")
# All other lines are kept as they are.
input.map! do |line|
  if line.strip.empty?
    # \s also matches the trailing newline, so without this guard a
    # whitespace-only line ("  \n", "   \n") would be turned into an empty
    # heading or an empty quote.
    line
  elsif line =~ /^\s{4}(.*)$/
    "> #{$1}\n"
  elsif line =~ /^\s{3}(.*)$/
    "======#{$1}======\n"
  else
    line
  end
end
# Output everything.
#
# Prints each line of `input` up to (not including) the first line matching
# DELIMITER, and separates consecutive lines with an empty line so each
# becomes its own paragraph. No separator is printed when:
# * the current or the following line is blank -- we never need two;
# * both the current and the following line are table rows;
# * both the current and the following line are quotes.
input.each_with_index do |line, index|
  # Stop at the footnote section. This loop is the last thing the script
  # does, so leaving it ends the run just as the original `exit` did.
  break if line =~ DELIMITER

  puts line

  subsequent = input[index + 1]
  next unless subsequent
  next if line.blank? || subsequent.blank?
  next if line.table? && subsequent.table?
  next if line.quote? && subsequent.quote?

  puts ""
end
====== Wrapper ======
#!/usr/bin/ruby
# Wrapper: converts a Word file to DokuWiki markup in two steps.
#   1. antiword dumps FILE as text into NAME.tmp
#   2. interlace-footnotes.rb converts NAME.tmp into NAME.wiki
# NAME is FILE with a trailing ".doc" removed (FILE itself otherwise).
#
# Usage: wrapper FILE
require 'shellwords'

unless ARGV[0]
  puts "Usage: #{$0} file"
  exit
end

file = ARGV[0]
# \z anchors at the true end of the string; names without ".doc" are unchanged.
stripped = file.sub(/\.doc\z/, '')
text_file = "#{stripped}.tmp"
wiki_file = "#{stripped}.wiki"

# .doc -> text: antiword
# File names are shell-escaped so that spaces and shell metacharacters in
# them cannot break or alter the command line.
# NOTE(review): the mapping passed here is cp1251, while the notes on this
# page and the replacement table in the converter mention cp1252 -- confirm
# which one is intended.
command = "antiword -w 0 -m cp1251 #{Shellwords.escape(file)} >#{Shellwords.escape(text_file)}"
puts "Executing #{command}"
# Stop here on failure instead of feeding an empty or partial dump to step 2.
abort "antiword failed; aborting." unless system(command)

# text -> dokuwiki: interlace-footnotes.rb
command = "ruby interlace-footnotes.rb #{Shellwords.escape(text_file)} >#{Shellwords.escape(wiki_file)}"
puts "Executing #{command}"
abort "interlace-footnotes.rb failed." unless system(command)