Sha256: d74fae95b2978c89d82336e38fa66bc9adc7dc01abfb906f8b2afaa9078038da

Contents?: true

Size: 1.3 KB

Versions: 1

Compression:

Stored size: 1.3 KB

Contents

#Word 97-2003

# Registers the extractor for the :doc and :dot file types
# (MIME type application/msword).
PlainTextExtractor.new do
  every :doc, :dot
  as "application/msword"
  aka "Microsoft Office Word document"

  # Maps each extraction command to the platform it applies to.
  # NOTE(review): "SOURCE" is presumably substituted with the file path by
  # the framework, and "some other command" looks like a placeholder for
  # Windows -- confirm against PlainTextExtractor.
  extract_content_with(
    "antiword SOURCE"    => :on_linux_and_mac_os,
    "some other command" => :on_windows
  )

  # Sample files paired with text that extraction is expected to yield.
  which_should_for_example_extract 'district heating',
                                   :from => 'Types of malfunction in DH substations.doc'
  or_extract 'Basic Word template for Picolena specs',
             :from => 'office2003-word-template.dot'
end

#Word 2007

require 'zip/zip'
# Registers the extractor for the :docx and :dotx file types.
PlainTextExtractor.new {
  every :docx, :dotx
  as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  aka "Microsoft Office 2007 Word document"
  # The source file is opened as a zip archive and the text is pulled out of
  # its "word/document.xml" entry. The block receives the path of the file
  # and returns the extracted text, one text run per line.
  extract_content_with {|source|
    Zip::ZipFile.open(source){|zipfile|
      # Cut the XML at every tag opening and keep only the pieces that start
      # with a real <w:t> / <w:t ...> text-run tag. The tag name has to be
      # followed by whitespace or '>' so that other tags whose names merely
      # begin with "w:t" (w:tab, w:tbl, w:tr, w:tc, ...) are not mistaken for
      # text runs and do not add empty lines to the result.
      zipfile.read("word/document.xml").split(/</).grep(/^w:t[\s>]/).collect{|l|
        # Drop the tag itself (everything up to the first '>'), keeping the text.
        l.sub(/^[^>]+>/,'')
      }.join("\n")
    }
  }
  # Sample files paired with text that extraction is expected to yield.
  which_should_for_example_extract 'Can this office 2007 document be indexed\?', :from => 'office2007-word.docx'
  or_extract 'Basic Word 2007 template for Picolena specs', :from => 'office2007-word-template.dotx'
}

## Microsoft Word to text conversion:
##   Program: antiword
##   Version tested: 0.37
##   Installation: Ubuntu antiword package
##   Home page: http://www.winfield.demon.nl/

## MS OOXML Word to text conversion:
## Ruby code written by Eric DUMINIL

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
picolena-0.2.2 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb