# for URI::regexp require 'uri' require 'feedparser/html2text-parser' # This class provides various converters class String # is this text HTML ? search for tags. used by String#text2html def html? return (self =~ /
/i) || (self =~ /<\/p>/i) || (self =~ / \1
/i) || (self =~ /
/i) || (self =~ /<\/a>/i) || (self =~ /
")
# uris
text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/,
'\1')
end
# Handle broken (relative) src/href URLs, e.g. in <a> and <img> tags, by resolving them against the feed link
if feed and feed.link
text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
begin
first, url, last = $1, $3, $4
if (url =~ /^\s*\w+:\/\//) or (url =~ /^\s*\w+:\w/)
m
elsif url =~ /^\//
(first + feed.link.split(/\//)[0..2].join('/') + url + last)
else
t = feed.link.split(/\//)
if t.length == 3 # http://toto with no trailing /
(first + feed.link + '/' + url + last)
else
if feed.link =~ /\/$/
(first + feed.link + url + last)
else
(first + t[0...-1].join('/') + '/' + url + last)
end
end
end
rescue
m
end
end
end
text
end
# Strip leading and trailing whitespace from the receiver, in place.
# Returns the receiver itself.
def rmWhiteSpace!
  # Both patterns can match the empty string, so each gsub! always finds a
  # match and never returns nil.
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m, '')
  self
end
# Convert a text declared to be in inputenc to a text in UTF-8.
#
# The declared encoding is not trusted:
# * if inputenc is 'utf-8' (any letter case), the receiver is returned as is;
# * otherwise, if the receiver's bytes already form well-formed UTF-8, the
#   receiver is returned as is;
# * otherwise every byte is taken as one Unicode code point and re-encoded
#   as UTF-8 (i.e. the input is read as if it were ISO-8859-1, whatever
#   inputenc says).
#
# inputenc may be nil (unknown encoding); it is then handled like any
# non-UTF-8 declaration.
#
# Returns either the receiver itself or a new UTF-8 string.
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  raw_bytes = unpack('C*')

  # It is said it is not UTF-8. Ensure it is REALLY not UTF-8.
  begin
    # unpack('U*') raises on malformed UTF-8. The round trip is compared
    # byte-wise: String#== is false for equal bytes when the two strings
    # carry different encoding tags and contain non-ASCII bytes, which
    # would make valid UTF-8 data fall through and get double-encoded.
    return self if unpack('U*').pack('U*').unpack('C*') == raw_bytes
  rescue StandardError
    # not well-formed UTF-8: fall through to the byte-wise conversion
  end

  begin
    raw_bytes.pack('U*')
  rescue StandardError
    self # failsafe solution, but a dirty one
  end
end
end