Sha256: d57e2521379a0cf7a84af6f38f0b3449b45675e9002f8de21c9cbd85e2abc3bf

Contents?: true

Size: 1.25 KB

Versions: 6

Compression:

Stored size: 1.25 KB

Contents

#Word 97-2003

# Declares a plain-text extractor for the :doc and :dot extensions using the
# PlainTextExtractor DSL (the DSL methods themselves are defined elsewhere).
PlainTextExtractor.new do
  every(:doc, :dot)
  as("application/msword")
  aka("Microsoft Office Word document")

  # Maps an extraction command to the platform it is used on.
  # NOTE(review): "some other command" looks like a placeholder for Windows —
  # confirm whether a real command is expected here.
  with(
    "antiword SOURCE"    => :on_linux_and_mac_os,
    "some other command" => :on_windows
  )

  # Sample documents paired with a phrase expected in their extracted text.
  which_should_for_example_extract(
    'district heating',
    :from => 'Types of malfunction in DH substations.doc'
  )
  or_extract(
    'Basic Word template for Picolena specs',
    :from => 'office2003-word-template.dot'
  )
end

#Word 2007

require 'zip/zip'
# Declares a plain-text extractor for the :docx and :dotx extensions using the
# PlainTextExtractor DSL (the DSL methods themselves are defined elsewhere).
# Extraction is done in Ruby: the file is opened as a zip archive and the text
# runs of "word/document.xml" are joined with newlines.
PlainTextExtractor.new {
  every :docx, :dotx
  as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  aka "Microsoft Office 2007 Word document"
  with {|source|
    Zip::ZipFile.open(source){|zipfile|
      # Splitting on "<" yields one fragment per tag, shaped like
      # "tagname attributes>following text".
      # Keep only real <w:t> text runs: the tag name must be followed by
      # whitespace (attributes) or ">". A bare /^w:t/ prefix match would also
      # pick up w:tbl, w:tr, w:tc, w:tab, ... and inject blank lines.
      zipfile.read("word/document.xml").split(/</).grep(/\Aw:t[\s>]/).map{|fragment|
        # Drop everything up to and including the tag's closing ">".
        fragment.sub(/\A[^>]+>/,'')
      }.join("\n")
    }
  }
  which_should_for_example_extract 'Can this office 2007 document be indexed\?', :from => 'office2007-word.docx'
  or_extract 'Basic Word 2007 template for Picolena specs', :from => 'office2007-word-template.dotx'
}

## Microsoft Word to text conversion:
##   Program: antiword
##   Version tested: 0.37
##   Installation: Ubuntu antiword package
##   Home page: http://www.winfield.demon.nl/

## MS OOXML Word to text conversion:
## Ruby code written by Eric DUMINIL

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
picolena-0.1.4 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
picolena-0.1.5 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
picolena-0.1.6 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
picolena-0.1.7 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
picolena-0.1.8 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
picolena-0.2.0 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb