# HtmlLogParser class, a subclass of BasicParser.
# Used for parse()ing HTML logs.

require 'balance_tags_c'

module Pidgin2Adium
  # Parser for HTML-formatted chat logs.
  #
  # NOTE(review): the HTML tag literals inside the patterns and replacement
  # strings below (<b>, <br/>, <span ...>, <a ...>, <em>) were restored from
  # the surrounding comments and the capture-group usage. Confirm them
  # against real log files before relying on them.
  class HtmlLogParser < BasicParser
    # Style declarations that carry no useful formatting and are dropped
    # from <span style="..."> attributes by #cleanup.
    DROPPED_STYLE_PREFIXES = %w[font-family font-size background].freeze

    # Builds the regular expressions used to recognise chat and status lines.
    # Both arguments are forwarded unchanged to BasicParser#initialize.
    def initialize(src_path, user_aliases)
      super(src_path, user_aliases)
      # Matches "(2008-11-17 14:12:21)", "(14:12:21)" and the AM/PM variants;
      # the single capture group is the timestamp without the parentheses.
      @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'

      # @line_regex matches a line in an HTML log file other than the first.
      # Capture groups, in order:
      #   * timestamp, extended ("2008-11-17 14:12:21") or not ("14:12:21")
      #   * screen name or alias, if alias set
      #   * the auto-reply marker, or nil
      #   * message body
      # The ":" is optional to allow for strings like
      # "(17:12:21) <b>***Gabe B-W</b> is confused<br/>".
      # The auto-reply marker is accepted both raw ("<AUTO-REPLY>") and
      # entity-encoded ("&lt;AUTO-REPLY&gt;").
      @line_regex = /#{@timestamp_rx} ?<b>(.+?) ?((?:<|&lt;)AUTO-REPLY(?:>|&gt;))?:?<\/b> ?(.+)<br ?\/>/o

      # @line_regex_status matches a status line.
      # Capture groups, in order:
      #   * timestamp
      #   * status message
      @line_regex_status = /#{@timestamp_rx} ?<b> (.+)<\/b><br ?\/>/o
    end

    # Returns a cleaned string. _text_ is modified in place and returned.
    #
    # Removes the following tags from _text_:
    # * html
    # * body
    # * font
    # * a with no innertext, e.g. <a href="blah"></a>
    # And removes the following style declarations:
    # * color: #000000 (just turns text black)
    # * font-family
    # * font-size
    # * background
    # <em> is not removed; it is changed to an italic <span>.
    # A <span> left with no style declarations is removed, but the text
    # inside it is preserved. All other declarations (e.g. font-weight) are
    # kept.
    def cleanup(text)
      # Sometimes this is in there. I don't know why.
      # The leading bracket is accepted raw or entity-encoded.
      text.gsub!(%r{(?:<|&lt;)/FONT HSPACE='\d'>}, '')
      # We can remove <font> safely since Pidgin and Adium both show bold
      # using <span style="font-weight: bold;"> except Pidgin uses single
      # quotes while Adium uses double quotes.
      text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '')

      # very important!
      text.delete!("\r")
      # Remove empty lines. squeeze! collapses a run of any length, whereas
      # replacing "\n\n" pairwise leaves blank lines in runs of three or more.
      text.squeeze!("\n")
      # Remove the newline that ends the file, since it would otherwise be
      # turned into a trailing <br/> by the conversion below.
      text.gsub!(/\n\Z/, '')
      # Replace newlines with "<br/>" unless they end a chat line (i.e. the
      # next line starts with a timestamp).
      # This must go after we remove <font> tags.
      text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')

      # These empty links are sometimes appended to every line in a chat,
      # for some weird reason. Remove them. [^>]* keeps the match inside a
      # single opening tag so a preceding, non-empty link is never swallowed.
      text.gsub!(%r{<a\b[^>]*>\s*</a>}, '')

      # Replace single quotes inside tags with double quotes so we can
      # easily change the remaining single quotes to entities.
      # For spans, removes a space after the final declaration if it exists.
      text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
      text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
      # NOTE(review): this entity step is implied by the comment above but
      # was not visible in the reviewed copy -- confirm.
      text.gsub!("'", '&apos;')

      text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do
        style = Regexp.last_match(1)
        innertext = Regexp.last_match(2)
        # Remove empty spans.
        next '' if innertext.empty?

        kept = filter_style_declarations(style)
        if kept.empty?
          innertext
        else
          "<span style=\"#{kept.join('; ')};\">#{innertext}</span>"
        end
      end

      # Pidgin uses <em>, Adium uses <span>.
      # gsub! returns nil when nothing was replaced, so the closing tags are
      # only rewritten when an opening <em> was found.
      if text.gsub!('<em>', '<span style="font-style: italic;">')
        text.gsub!('</em>', '</span>')
      end
      text
    end

    private

    # Splits a style attribute value into its declarations and returns the
    # ones worth keeping, as an Array of Strings.
    def filter_style_declarations(style)
      style.split(/; ?/).each_with_object([]) do |declaration, kept|
        if declaration.start_with?('color')
          # Black is the default text colour, so it carries no information.
          next if declaration.include?('color: #000000')

          # Sometimes this happens:
          #   <span style="color: #00ff00>today;">today was busy</span>
          # so the declaration is "color: #00ff00>today" (or ends in ">").
          # Keep just the colour and drop the ">" and anything following it.
          # A colour that is not a six-digit hex value is kept untouched.
          kept << (declaration[/color: #[0-9a-fA-F]{6}/] || declaration)
        elsif !declaration.start_with?(*DROPPED_STYLE_PREFIXES)
          # Everything else (notably font-weight) must survive.
          kept << declaration
        end
      end
    end
  end # END HtmlLogParser class
end