require 'shellwords'

# Extractor definition for HTML documents: plain text is produced by running
# the external `html2text` program on the file and, when the file's detected
# encoding is not UTF-8, piping that output through `iconv` to convert it to
# UTF-8.
PlainText.extract do
  # File extensions, MIME type and human-readable name given for this format.
  from :html, :htm
  as "text/html"
  aka "HyperText Markup Language document"

  # source is interpolated into a shell command line, so it is presumably the
  # path of the file to convert -- confirm against the PlainText DSL.
  # The block's value is the captured standard output of the command.
  with do |source|
    # NOTE(review): File.encoding is not part of Ruby's standard library; it
    # is assumed to be a project extension returning an encoding name string.
    encoding = File.encoding(source)

    # Escape every interpolated value so that file names containing quotes,
    # `$`, backticks or other shell metacharacters cannot break or hijack
    # the command line.
    html2text = "html2text -nobs #{Shellwords.escape(source)}"

    # Strip non-word characters so spellings such as "UTF-8", "utf_8" -- no,
    # only forms that reduce to "utf8" (e.g. "UTF-8", "utf8") -- are treated
    # as already being UTF-8; an empty encoding name is treated the same way.
    if encoding.empty? || encoding.gsub(/[^\w]/, '').downcase == "utf8"
      %x{#{html2text}}
    else
      %x{#{html2text} | iconv -f #{Shellwords.escape(encoding)} -t utf8}
    end
  end

  # External programs invoked by the block above.
  which_requires 'html2text', 'iconv'

  # Sample documents paired with text expected to be extracted from them.
  which_should_for_example_extract 'zentrum für angewandte forschung an fachhochschulen nachhaltige energietechnik Baden-Württemberg', :from => 'zafh.net.html'
  or_extract 'Málaga', :from => '7.html'
  or_extract 'le monde', :from => 'lemonde.htm'
end