# HtmlLogParser class, a subclass of BasicParser.
# Used for parse()ing HTML logs.
require 'balance_tags_c'
module Pidgin2Adium
class HtmlLogParser < BasicParser
  # NOTE(review): the HTML literals in this class (<b>, <br/>, <span ...>,
  # <em>, &lt;/&gt;, &apos;) were restored after markup had been stripped
  # from the file, using the surrounding comments and the surviving regex
  # fragments as a guide. Confirm them against real Pidgin HTML logs.

  # Sets up the patterns used to recognise lines of an HTML log.
  #
  # src_path and user_aliases are handed to BasicParser#initialize
  # unchanged.
  def initialize(src_path, user_aliases)
    super(src_path, user_aliases)

    # Matches a parenthesised timestamp, either extended
    # ("(2008-11-17 14:12:33)") or time only ("(14:12:33)"), with an
    # optional " AM"/" PM" suffix. The text inside the parentheses is the
    # single capture group. Kept as a String so it can be interpolated
    # into the regexes below and in #cleanup.
    @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'

    # @line_regex matches a chat line in an HTML log file.
    # Captures (0-based, as returned by MatchData#captures):
    # 0: timestamp, extended or not
    # 1: screen name or alias, if alias set
    # 2: the auto-reply marker ("&lt;AUTO-REPLY&gt;") or nil
    # 3: message body
    # The ":" is optional to allow for strings like
    # "(17:12:21) <b>***Gabe B-W</b> is confused<br/>"
    # NOTE(review): the marker is accepted both entity-escaped and with
    # literal angle brackets; presumably logs only contain the escaped form.
    @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?((?:&lt;|<)AUTO-REPLY(?:&gt;|>))?:?<\/b> ?(.+)<br ?\/>/o

    # @line_regex_status matches a status line.
    # Captures (0-based):
    # 0: timestamp
    # 1: status message
    @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
  end

  # Returns a cleaned string. _text_ is modified in place and is also the
  # return value.
  #
  # Removes the following tags from _text_:
  # * html
  # * body
  # * font
  # * a with no innertext, e.g. <a href="blah"></a>
  # And removes the following style declarations:
  # * color: #000000 (just turns text black)
  # * font-family
  # * font-size
  # * background
  # Every other declaration (e.g. font-weight) is kept. A <span> left
  # with no declarations is removed, but the text inside it is preserved.
  # <em> is changed to <span style="font-style: italic;">.
  def cleanup(text)
    # Sometimes this is in there. I don't know why.
    text.gsub!(%r{(?:&lt;|<)/FONT HSPACE='\d'>}, '')
    # We can remove <font> safely since Pidgin and Adium both show bold
    # using <span style="font-weight: bold;"> except Pidgin uses single
    # quotes while Adium uses double quotes.
    text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
    text.tr!("\r", '')
    # Remove empty lines. Collapsing whole runs (rather than pairs) makes
    # sure three or more consecutive newlines do not leave a blank line.
    text.gsub!(/\n{2,}/, "\n")
    # Remove newlines that end the file, since they screw up the
    # newline -> <br/> conversion
    text.gsub!(/\n\Z/, '')
    # Replace newlines with "<br/>" unless they end a chat line, i.e.
    # unless the next thing is a timestamp.
    # This must go after we remove <font> tags.
    text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')
    # These empty links are sometimes appended to every line in a chat,
    # for some weird reason. Remove them. [^>] keeps the match inside a
    # single opening tag so a preceding non-empty link is never swallowed.
    text.gsub!(%r{<a href=(['"])[^>]*?\1>\s*</a>}, '')
    # Replace single quotes inside tags with double quotes so we can
    # easily change single quotes to entities.
    # For spans, removes a space after the final declaration if it exists.
    text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
    text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
    # NOTE(review): entity spelling reconstructed; confirm &apos; is what
    # the output format expects.
    text.gsub!("'", '&apos;')

    text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do
      # Grab both captures first: the helper below runs its own regex
      # matches, which would overwrite the last-match data.
      style = Regexp.last_match(1)
      innertext = Regexp.last_match(2)
      # Remove empty spans.
      next '' if innertext.empty?

      # TODO: replace double quotes with "&quot;", but only outside tags;
      # may still be tags inside spans
      kept = filter_style_declarations(style)
      if kept.empty?
        innertext
      else
        "<span style=\"#{kept.join('; ')};\">#{innertext}</span>"
      end
    end

    # Pidgin uses <em>, Adium uses <span>. The closing tags are only
    # rewritten when an opening <em> was found (gsub! returns nil when
    # nothing was replaced).
    if text.gsub!('<em>', '<span style="font-style: italic;">')
      text.gsub!('</em>', '</span>')
    end
    text
  end

  private

  # Splits a style attribute value into its declarations and returns the
  # ones worth keeping, as an Array of Strings.
  def filter_style_declarations(style)
    style.split(/; ?/).map do |decl|
      if decl.start_with?('color')
        if decl.include?('color: #000000')
          nil
        elsif decl =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
          # Regarding the bit with the ">", sometimes this happens:
          # <span style="color: #000000>today;">today was busy</span>
          # Then decl = "color: #000000>today"
          # Or it can end in ">;", with no text before the semicolon.
          # So keep the color but remove the ">" and anything following it.
          Regexp.last_match(1)
        else
          decl
        end
      elsif decl =~ /\A(?:font-family|font-size|background)/
        nil
      else
        # Anything else (notably font-weight) is preserved untouched.
        decl
      end
    end.compact
  end
end # END HtmlLogParser class
end