# =SrcFileParse
# The class SrcFileParse has two subclasses, SrcTxtFileParse and SrcHtmlFileParse.
# It parses the file passed into it and extracts the following
# from each line in the chat: time, alias, and message and/or status.
module Pidgin2Adium
  # The two subclasses of SrcFileParse,
  # SrcTxtFileParse and SrcHtmlFileParse, only differ
  # in that they have their own @line_regex, @line_regex_status,
  # and most importantly, createMsgData, which takes the
  # captures produced by scanning with @line_regex / @line_regex_status
  # and fits them into hashes.
  #
  # SrcFileParse itself never sets @line_regex or @line_regex_status, so
  # parseFile is only usable through a subclass.
  class SrcFileParse
    # Status lines that name a buddy. The screen name is in regex group 1.
    # Kept as an ordered list of [pattern, status] pairs; the first pattern
    # that matches wins.
    STATUS_PATTERNS = [
      [/(.+) logged in\.$/, 'online'],
      [/(.+) logged out\.$/, 'offline'],
      [/(.+) has signed on\.$/, 'online'],
      [/(.+) has signed off\.$/, 'offline'],
      [/(.+) has gone away\.$/, 'away'],
      [/(.+) is no longer away\.$/, 'available'],
      [/(.+) has become idle\.$/, 'idle'],
      [/(.+) is no longer idle\.$/, 'available'],
      # file transfer
      [/Starting transfer of .+ from (.+)/, 'file-transfer-start'],
      [/^Offering to send .+ to (.+)$/, 'fileTransferRequested'],
      [/(.+) is offering to send file/, 'fileTransferRequested']
    ].freeze

    # Statuses that come from my end. I totally made up these status names.
    # No alias can be extracted from these, so they are reported under the
    # alias 'System Message'.
    SYSTEM_STATUS_PATTERNS = [
      # encryption
      [/^Received message encrypted with wrong key$/, 'encrypt-error'],
      [/^Requesting key\.\.\.$/, 'encrypt-error'],
      [/^Outgoing message lost\.$/, 'encrypt-error'],
      [/^Conflicting Key Received!$/, 'encrypt-error'],
      [/^Error in decryption- asking for resend\.\.\.$/, 'encrypt-error'],
      [/^Making new key pair\.\.\.$/, 'encrypt-key-create'],
      # file transfer - these are in this list because you can't get
      # the alias out of match group 1
      [/^You canceled the transfer of .+$/, 'file-transfer-cancel'],
      [/^Transfer of file .+ complete$/, 'fileTransferCompleted'],
      # sending errors
      [/^Last outgoing message not received properly- resetting$/, 'sending-error'],
      [/^Resending\.\.\.$/, 'sending-error'],
      # connection errors
      [/^Lost connection with the remote user:Remote host closed connection\.$/, 'lost-remote-conn'],
      # direct IM stuff
      [/^Attempting to connect to .+ at .+ for Direct IM\./, 'direct-im-connect'],
      [/^Asking .+ to connect to us at .+ for Direct IM\./, 'direct-im-ask'],
      [/^Direct IM with .+ failed/, 'direct-im-failed'],
      [/^Attempting to connect to .+\.$/, 'direct-im-connect'],
      [/^Attempting to connect via proxy server\.$/, 'direct-im-proxy'],
      [/^Direct IM established$/, 'direct-im-established'],
      [/^Lost connection with the remote user:Windows socket error/, 'direct-im-lost-conn'],
      # chats
      [/^.+ entered the room\.$/, 'chat-entered-room'],
      [/^.+ left the room\.$/, 'chat-left-room']
    ].freeze

    # srcPath::      path of the log file; read by parseFile and inspected by
    #                getTimeZoneOffset.
    # destDirBase::  passed through to ChatFileGenerator in parseFile.
    # masterAlias::  passed through to ChatFileGenerator in parseFile.
    # userTZ::       stored as @userTZ.
    # userTZOffset:: fallback offset returned by getTimeZoneOffset when the
    #                file name carries none.
    def initialize(srcPath, destDirBase, masterAlias, userTZ, userTZOffset)
      @srcPath = srcPath
      # these two are to pass to chatFG in parseFile
      @destDirBase = destDirBase
      @masterAlias = masterAlias
      @userTZ = userTZ
      @userTZOffset = userTZOffset
      # Automagically does grouping for you. Will be inserted in @line_regex{,_status}
      @timestamp_regex_str = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: .{1,2})?)\)'
      # the first line is special: it tells us
      # 1) who we're talking to
      # 2) what time/date
      # 3) what SN we used
      # 4) what protocol (AIM, jabber...)
      #
      # Deliberately no /s modifier: in Ruby /s selects the Windows-31J
      # encoding (it is not "dot matches newline"), which makes matching a
      # UTF-8 header containing non-ASCII characters raise
      # Encoding::CompatibilityError.
      @first_line_regex = /Conversation with (.*?) at (.*?) on (.*?) \((.*?)\)/
    end

    # Takes the body of a status line of a chat and returns [username, status]
    # as a 2-element array.
    #
    # Example:
    #   Pass in "Generic Screenname228 has signed off." and it returns
    #   ["Generic Screenname228", "offline"]
    #
    # Lines that only match SYSTEM_STATUS_PATTERNS are returned with the alias
    # 'System Message'. A line that matches nothing at all yields
    # ['System Message', nil].
    def getAliasAndStatus(str)
      STATUS_PATTERNS.each do |pattern, status|
        found = pattern.match(str)
        return [found[1], status] if found
      end

      # not one of the regular statuses, try my statuses.
      _pattern, system_status = SYSTEM_STATUS_PATTERNS.find { |pattern, _status| pattern.match(str) }
      ['System Message', system_status]
    end

    # Returns the time zone offset embedded in the file name (e.g. the "-0500"
    # in "...-0500EST.txt"), or @userTZOffset when the name has none.
    def getTimeZoneOffset()
      tz_match = /([-+]\d+)[A-Z]{3}\.(txt|html?)/.match(@srcPath)
      tz_match ? tz_match[1] : @userTZOffset
    end

    # parseFile slurps up @srcPath into one big string and runs it through
    # cleanup when the parser provides that method (SrcHtmlFileParse does).
    # It then uses regexes to break up the string, uses createMsgData
    # to turn the captured groups into data hashes, and feeds each one to
    # ChatFileGenerator#appendLine.
    #
    # Returns the ChatFileGenerator object, or false (after logging through
    # Pidgin2Adium.logMsg) when the first line of the chat cannot be found.
    def parseFile()
      fileContent = File.read(@srcPath) # one big string
      fileContent = cleanup(fileContent) if respond_to?(:cleanup)

      # Deal with first line.
      first_line_match = @first_line_regex.match(fileContent)
      if first_line_match.nil?
        Pidgin2Adium.logMsg("Parsing of #{@srcPath} failed (could not find first line).", true)
        return false
      end

      otherPersonsSN, chatTimePidgin_start, rawSN, service = first_line_match.captures
      # mySN is standardized to avoid "AIM.name" and "AIM.na me" folders:
      # lower-cased with *every* space removed.
      mySN = rawSN.downcase.delete(' ')

      chatFG = ChatFileGenerator.new(service,
                                     mySN,
                                     otherPersonsSN,
                                     chatTimePidgin_start,
                                     getTimeZoneOffset(),
                                     @masterAlias,
                                     @destDirBase)

      # An empty chat window that got saved produces no matches, in which
      # case chatFG is returned without any lines appended.
      fileContent.scan(Regexp.union(@line_regex, @line_regex_status)).each do |line|
        chatFG.appendLine(createMsgData(line))
      end
      chatFG
    end
  end

  # Parser for plain-text logs.
  class SrcTxtFileParse < SrcFileParse
    def initialize(srcPath, destDirBase, masterAlias, userTZ, userTZOffset)
      super(srcPath, destDirBase, masterAlias, userTZ, userTZOffset)
      # @line_regex matches a line in a text log file other than the first
      # @line_regex captures:
      # 0: timestamp
      # 1: screen name
      # 2: "<AUTO-REPLY>" or nil
      # 3: message
      #
      # The auto-reply group must contain the literal marker: an empty
      # optional group always captures "" and would flag every message as an
      # auto-reply. This mirrors the group used by SrcHtmlFileParse.
      @line_regex = /#{@timestamp_regex_str} (.*?) ?(<AUTO-REPLY>)?: (.*)$/
      # @line_regex_status matches a status line
      # @line_regex_status captures:
      # 0: timestamp
      # 1: message
      @line_regex_status = /#{@timestamp_regex_str} ([^:]+?)[\r\n]{1,2}/
    end

    # createMsgData takes one 6-element capture array (from scanning with the
    # union of @line_regex and @line_regex_status) and returns a hash
    # with the following keys: time, alias, status, body and auto-reply.
    # Keys that do not apply to the line stay nil.
    def createMsgData(matchObj)
      msg_data_hash = { 'time' => nil,
                        'alias' => nil,
                        'status' => nil,
                        'body' => nil,
                        'auto-reply' => nil }
      if matchObj[4..5] == [nil, nil]
        # regular message
        # ["10:58:29", "BuddyName", nil, "hello!\r", nil, nil]
        time, sender, auto_reply, body = matchObj[0..3]
        msg_data_hash['time'] = time
        msg_data_hash['alias'] = sender
        msg_data_hash['auto-reply'] = !auto_reply.nil?
        # strip() to remove "\r" from end
        msg_data_hash['body'] = body.strip
      elsif matchObj[0..3] == [nil, nil, nil, nil]
        # status message
        # [nil, nil, nil, nil, "22:58:00", "BuddyName logged in."]
        time, status_text = matchObj[4..5]
        sender, status = getAliasAndStatus(status_text)
        msg_data_hash['time'] = time
        msg_data_hash['alias'] = sender
        msg_data_hash['status'] = status
      end
      msg_data_hash
    end
  end

  # Parser for HTML logs.
  class SrcHtmlFileParse < SrcFileParse
    def initialize(srcPath, destDirBase, masterAlias, userTZ, userTZOffset)
      super(srcPath, destDirBase, masterAlias, userTZ, userTZOffset)
      # @line_regex matches a line in an HTML log file other than the first
      # time matches on either "2008-11-17 14:12" or "14:12"
      # @line_regex captures:
      # 0: timestamp, extended or not
      # 1: alias
      # 2: "<AUTO-REPLY>" or nil
      # 3: message body
      #
      # NOTE(review): the pattern ends with a literal line break (the closing
      # slash is on the next line), so a match requires a newline right after
      # the body. It also expects a closing </b> but no opening <b>, so any
      # opening tag would land in the alias capture -- confirm both against
      # real logs.
      @line_regex = /#{@timestamp_regex_str} ?(.*?) ?(<AUTO-REPLY>)?:?<\/b> ?(.*)
/ #(?:[\n\r]{1,2}<(?:font|\/body))/s
      # @line_regex_status matches a status line
      # @line_regex_status captures:
      # 0: timestamp
      # 1: status message
      @line_regex_status = /#{@timestamp_regex_str} ? (.*?)<\/b>/
    end

    # createMsgData takes one 6-element capture array (from scanning with the
    # union of @line_regex and @line_regex_status) and returns a hash
    # with the following keys: time, alias, auto-reply, body and status.
    # Keys that do not apply to the line stay nil; auto-reply is true only
    # for auto-reply messages.
    def createMsgData(matchObj)
      msg_data_hash = { 'time' => nil,
                        'alias' => nil,
                        'auto-reply' => nil,
                        'body' => nil,
                        'status' => nil }
      # the Regexp.union leaves nil where one of the regexes didn't match.
      # ie
      # the first one matches:  ['foo', 'bar', 'baz', 'bash', nil, nil]
      # second one matches:     [nil, nil, nil, nil, 'bim', 'bam']
      if matchObj[0..3] == [nil, nil, nil, nil]
        # This is a status message.
        # The last two entries look like:
        # ["11:27:53", "Generic Screenname228 logged in."]
        time, status_text = matchObj[4..5]
        sender, status = getAliasAndStatus(status_text)
        msg_data_hash['time'] = time
        msg_data_hash['alias'] = sender
        msg_data_hash['status'] = status
      elsif matchObj[4..5] == [nil, nil]
        # Either a regular message line or an auto-reply/away message.
        time, sender, auto_reply, body = matchObj[0..3]
        msg_data_hash['time'] = time
        msg_data_hash['alias'] = sender
        msg_data_hash['body'] = body
        # an auto-reply message
        msg_data_hash['auto-reply'] = true unless auto_reply.nil?
      end
      msg_data_hash
    end

    # Strips html/body/font tags, removes links whose text is empty, removes
    # empty spans, and unwraps spans that carry either no color information
    # or color information that just turns the text black.
    # The argument is left untouched; the cleaned markup is returned as a
    # new string.
    def cleanup(text)
      color_regex = /.*(color: ?#[[:alnum:]]{6}; ?).*/
      # For some reason, Hpricot doesn't work well with
      # elem.swap(elem.innerHTML) when the elements are nested
      # (eg doc.search('font') only returns the outside tags,
      # not "font font") and also it appears that it doesn't reinterpret
      # the doc when outside tags are swapped with their innerHTML (so
      # when tags are replaced with their innerHTML, then
      # a search for tags in the new HTML fails).
      # Long story short, we use gsub (non-destructively, so the caller's
      # string is not modified).
      stripped = text.gsub(/<\/?(html|body|font).*?>/, '')
      doc = Hpricot(stripped)
      # These empty links sometimes are appended to every line in a chat,
      # for some weird reason. Remove them.
      doc.search("a[text()='']").remove
      doc.search('span').each do |span|
        if span.empty?
          Hpricot::Elements[span].remove
        elsif span[:style] =~ color_regex
          # No need to check for span.attributes.key?('style') above.
          # Reduce the style to its color declaration, then drop black:
          # remove black-text spans after other processing because
          # the processing can reduce spans to that.
          span[:style] = span[:style].gsub(color_regex, '\1').
                                      gsub(/color: ?#000000; ?/, '')
          # Remove span but keep its contents
          span.swap(span.innerHTML) if span[:style] == ''
        else
          span.swap(span.innerHTML)
        end
      end
      doc.to_html
    end
  end
end # end module