User Tools

Site Tools


independent:doc2doku

This is an old revision of the document!


Converting MS Word files to Dokuwiki format

Known problems:

  • Not language-safe: only works with cp1252 encoding.

Word -> text: antiword

See antiword.

Text -> Dokuwiki: interlacing footnotes

interlace-footnotes.rb
#!/usr/bin/ruby
 
# doc2doku
# takes in raw antiword output and converts, to some extent, to dokuwiki format (including interlacing footnotes)
 
# A run of 23 dashes followed by a newline. The footnote-collecting loop starts
# gathering footnotes after the first line matching this, and the output loop
# stops printing when it reaches it.
DELIMITER = /\-{23}\n/

# Typographic characters (written as cp1252 punctuation in the original
# script) mapped to the plain-ASCII text that replaces them.
patterns = {
  "—" => "---", # em dash
  "–" => "--",  # en dash
  "‘" => "'",   # left single quote
  "’" => "'",   # right single quote
  "“" => '"',   # left double quote
  "”" => '"',   # right double quote
  "…" => "..."  # ellipsis
}
 
# functions
 
# Line-classification predicates added to String. The regex-based ones return
# the match offset (truthy) or nil; blank? returns true or false.
class String
  # Line begins and ends with "=" (the shape this script gives to titles).
  def title?
    /^=.*=$/ =~ self
  end

  # Line begins with ">" (the shape this script gives to quotes).
  def quote?
    /^>/ =~ self
  end

  # Nothing is left once surrounding whitespace is stripped.
  def blank?
    strip.length.zero?
  end

  # Line begins with "|" or "^".
  def table?
    /^[|^]/ =~ self
  end
end
 
# find footnotes
 
# Read the input file named on the command line into `input` (an array of
# lines, each keeping its line terminator).
inputname = ARGV[0]
abort "Usage: #{$0} file" unless inputname

input = File.readlines(inputname)

# Collect the footnotes listed after the DELIMITER line.
#
# `footnotes` stays false until a line matching DELIMITER is seen; from then on
# it is an Array indexed by footnote number (slots without a footnote are nil).
# Later passes rely on both states: false means "no footnote section found".
footnotes = false
current = nil # number of the footnote most recently started
input.each do |line|
  if footnotes
    # chomp first so a final line without a trailing newline is still
    # recognised, and so "[n]" with no text after it is not.
    if line.chomp =~ /\[(\d+)\]\s(.*)/
      # new footnote
      current = $1.to_i
      footnotes[current] = $2
    elsif current && !line.strip.empty?
      # Non-blank line that does not start a new footnote: treat it as a
      # continuation of the footnote currently being read.
      footnotes[current] += " #{line.strip}"
    end
  elsif line =~ DELIMITER
    footnotes = []
  end
end
 
# insert footnotes in text
 
# Replace every "[n]" marker (n of one to three digits) that has a collected
# footnote with the inline form "((footnote text))". Lines are modified in
# place. Markers whose number has no footnote are left untouched.
if footnotes
  input.each do |line|
    # The block form of gsub! inserts the footnote text literally (no
    # backslash-escape interpretation), visits every marker on the line even
    # when an earlier one has no footnote, and never rescans inserted text, so
    # footnotes that mention "[n]" themselves cannot cause an endless loop.
    line.gsub!(/\[(\d{1,3})\]/) do |marker|
      text = footnotes[$1.to_i]
      text ? "((#{text}))" : marker
    end
  end
else
  # Standard output carries the converted document, so the diagnostic goes to
  # standard error instead.
  warn "No footnotes found."
end
 
# replace typographic punctuation (dashes, curly quotes, ellipsis) with ASCII equivalents
 
# For every line, swap each occurrence of every key of `patterns` for its
# mapped replacement, editing the line in place.
input.each do |line|
  patterns.each_pair do |broken, fixed|
    line.gsub!(broken, fixed) if line.include?(broken)
  end
end
 
# quotes and titles
 
# Turn indentation into markup, replacing the affected entries of `input`:
#   four or more leading whitespace characters -> quote line ("> text")
#   exactly three                              -> title ("======text======")
input.each_with_index do |line, index|
  # \s also matches the line's own newline, so a whitespace-only line would
  # otherwise satisfy the patterns below and turn into an empty quote or an
  # empty title. Leave such lines as they are.
  next if line.strip.empty?

  if line =~ /^\s{4}(.*)$/
    input[index] = "> #{$1}\n"
  elsif line =~ /^\s{3}(.*)$/
    input[index] = "======#{$1}======\n"
  end
end
 
# output everything
 
# Print the converted lines, adding a blank separator line between consecutive
# lines where needed, and stop at the first line matching DELIMITER.
input.each_with_index do |line, index|
  # The delimiter and everything after it is not printed; the program ends here.
  exit if line =~ DELIMITER

  puts line

  subsequent = input[index + 1]
  next unless subsequent # last line: nothing to separate from

  # No separator is needed when:
  #   the current or the subsequent line is blank -- we never need two
  #   both the current and the subsequent line are table rows
  #   both the current and the subsequent line are quotes
  next if line.blank? || subsequent.blank?
  next if line.table? && subsequent.table?
  next if line.quote? && subsequent.quote?

  puts ""
end

Wrapper

doc2doku.rb
#!/usr/bin/ruby
 
# Wrapper: runs antiword on the given file, then interlace-footnotes.rb on the
# result, producing "<name>.tmp" and "<name>.wiki" next to the input, where
# <name> is the input path with a trailing ".doc" removed if present.
require 'shellwords'

unless ARGV[0]
  puts "Usage: #{$0} file"
  exit
end

file = ARGV[0]
stripped = file =~ /^(.*)\.doc$/ ? $1 : file

tmp = "#{stripped}.tmp"
wiki = "#{stripped}.wiki"

# .doc -> text: antiword
# Paths are shell-escaped so names containing spaces or shell metacharacters
# are passed through as single, literal arguments.
# NOTE(review): the mapping is cp1251 although the page text and the
# replacement table in interlace-footnotes.rb mention cp1252 -- confirm which
# one is intended.

command = "antiword -w 0 -m cp1251 #{Shellwords.escape(file)} >#{Shellwords.escape(tmp)}"
puts "Executing #{command}"
puts `#{command}`

# text -> dokuwiki: interlace-footnotes.rb

command = "ruby interlace-footnotes.rb #{Shellwords.escape(tmp)} >#{Shellwords.escape(wiki)}"
puts "Executing #{command}"
puts `#{command}`
independent/doc2doku.1300400404.txt.gz · Last modified: 2019/11/07 17:58 (external edit)