require 'plain_text_extractor_DSL' # PlainTextExtractor is the class responsible for extracting plain text contents from # different documents filetypes (.doc, .html, .pdf, .od?), as defined in # lib/plain_text_extractors/*.rb class PlainTextExtractor include PlainTextExtractorDSL class< content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb) Picolena::UseLanguageRecognition, # Is a language guesser already installed? PlainTextExtractor.language_guesser, # Language recognition is too unreliable for small files. content.size > 500].all? language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser| lang_guesser.write content lang_guesser.close_write output=lang_guesser.read if output=~/^([01]\.\d+)\t(\w+)\t(\w+)/ then score, lang, encoding = $1.to_f, $2, $3 # Language recognition isn't reliable if score is too low. lang unless score<0.9 end } {:content => content, :language => language} end private # destination method can be used by some conversion command that cannot output to stdout (example?) # a file containing plain text result will first be written by command, and then be read by extract_content. def destination require 'tmpdir' @@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}") end # Replaces generic command with specific source and destination (if specified) files def specific_command command.sub('SOURCE','"'<