# Contains the class BasicParser and its subclasses, HtmlLogParser and
# TextLogParser, which parse the file passed into them and return a LogFile
# object.
#
# Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
# using these classes directly.
require 'parsedate'
require 'time' # for Time.zone_offset
require 'balance_tags_c'
require 'pidgin2adium/log_file'
module Pidgin2Adium
# Marker exception with no behavior of its own. BasicParser#pre_parse raises
# it when the first line of a log does not match the expected
# "Conversation with ... at ... on ... (...)" header.
class InvalidFirstLineError < StandardError
end
# BasicParser is a base class. Its subclasses are TextLogParser and
# HtmlLogParser.
#
# Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
# using this class directly.
class BasicParser
include Pidgin2Adium
# Reads the log at +src_path+ and prepares everything that parse needs.
#
# src_path::     path of the Pidgin log file to read.
# user_aliases:: comma-separated String of the aliases the local user goes by.
#
# When the first line of the log cannot be parsed, an error is reported,
# @first_line_is_valid is set to false and the rest of the setup is skipped.
def initialize(src_path, user_aliases)
  @src_path = src_path
  alias_list = user_aliases.split(',')
  # Whitespace is removed for easy matching later on.
  @user_aliases = alias_list.map { |x| x.downcase.gsub(/\s+/, '') }.uniq
  # @user_alias is set each time get_sender_by_alias is called. It is a
  # non-normalized alias.
  # Set an initial value just in case the first message doesn't give
  # us an alias.
  @user_alias = alias_list[0]

  @tz_offset = get_time_zone_offset()

  # The block form closes the file even when reading raises.
  File.open(@src_path, 'r') do |file|
    # gets (unlike readline) returns nil at EOF, so an empty file is
    # reported below as an invalid first line instead of raising EOFError.
    @first_line = file.gets || ''
    @file_content = file.read
  end

  # Time regexes must be set before pre_parse().
  # "4/18/2007 11:02:00 AM" => %w{4, 18, 2007, 11, 02, 00, AM}
  # ONLY used (if at all) in first line of chat ("Conversation with...at...")
  @time_regex_first_line = %r{^(\d{1,2})/(\d{1,2})/(\d{4}) (\d{1,2}):(\d{2}):(\d{2}) ([AP]M)$}
  # "2007-04-17 12:33:13" => %w{2007, 04, 17, 12, 33, 13}
  @time_regex = /^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})$/
  # sometimes a line in a chat doesn't have a full timestamp
  # "04:22:05 AM" => %w{04 22 05 AM}
  @minimal_time_regex = /^(\d{1,2}):(\d{2}):(\d{2})( [AP]M)?$/

  # Whether or not the first line is parseable.
  @first_line_is_valid = true
  begin
    # @basic_time_info is for files that only have the full timestamp at
    # the top; we can use it to fill in the minimal per-line timestamps.
    # It has only 3 elements (year, month, dayofmonth) because you should
    # be able to fill everything else in. If you can't, something's wrong.
    #
    # @adium_chat_time_start is when the chat started, in Adium's format.
    @service,
      @user_SN,
      @partner_SN,
      @basic_time_info,
      @adium_chat_time_start = pre_parse()
  rescue InvalidFirstLineError
    @first_line_is_valid = false
    error("Failed to parse, invalid first line: #{@src_path}")
    return # stop processing
  end

  # @status_map, @lib_purple_events, and @event_map are used in
  # create_status_or_event_msg
  @status_map = {
    /(.+) logged in\.$/ => 'online',
    /(.+) logged out\.$/ => 'offline',
    /(.+) has signed on\.$/ => 'online',
    /(.+) has signed off\.$/ => 'offline',
    /(.+) has gone away\.$/ => 'away',
    /(.+) is no longer away\.$/ => 'available',
    /(.+) has become idle\.$/ => 'idle',
    /(.+) is no longer idle\.$/ => 'available'
  }

  # lib_purple_events are all of event_type libPurple
  @lib_purple_events = [
    # file transfer
    /Starting transfer of .+ from (.+)/,
    /^Offering to send .+ to (.+)$/,
    /(.+) is offering to send file/,
    /^Transfer of file .+ complete$/,
    # The alternation is grouped so that "Error " applies to all three
    # verbs; ungrouped, any line containing "writing" would match.
    /Error (?:reading|writing|accessing) .+: .+/,
    /You cancell?ed the transfer of/,
    /File transfer cancelled/,
    /(.+?) cancell?ed the transfer of/,
    /(.+?) cancelled the file transfer/,
    # Direct IM - actual (dis)connect events are their own types
    /^Attempting to connect to (.+) at .+ for Direct IM\./,
    /^Asking (.+) to connect to us at .+ for Direct IM\./,
    /^Attempting to connect via proxy server\.$/,
    /^Direct IM with (.+) failed/,
    # encryption
    /Received message encrypted with wrong key/,
    /^Requesting key\.\.\.$/,
    /^Outgoing message lost\.$/,
    /^Conflicting Key Received!$/,
    /^Error in decryption- asking for resend\.\.\.$/,
    /^Making new key pair\.\.\.$/,
    # sending errors
    /^Last outgoing message not received properly- resetting$/,
    /Resending\.\.\./,
    # connection errors
    /Lost connection with the remote user:.+/,
    # chats
    /^.+ entered the room\.$/,
    /^.+ left the room\.$/
  ]

  # non-libpurple events
  # Each key maps to an event_type string. The keys will be matched against
  # a line of chat and the partner's alias will be in regex group 1, IF the
  # alias is matched.
  @event_map = {
    # .+ is not an alias, it's a proxy server so no grouping
    /^Attempting to connect to .+\.$/ => 'direct-im-connect',
    # NB: pidgin doesn't track when Direct IM is disconnected, AFAIK
    /^Direct IM established$/ => 'directIMConnected',
    /Unable to send message/ => 'chat-error',
    /You missed .+ messages from (.+) because they were too large/ => 'chat-error',
    /User information not available/ => 'chat-error'
  }

  @ignore_events = [
    # Adium ignores SN/alias changes.
    # NOTE(review): this pattern requires a newline right after the final
    # period. That looks like residue of text lost from the pattern
    # (possibly a line-break tag) -- confirm against the upstream source.
    /^.+? is now known as .+?\.\n$/
  ]
end
# Parses the log body that was read by the constructor.
#
# Returns a LogFile built from the parsed lines, or false when
# * the first line of the log was invalid,
# * a line matches neither @line_regex nor @line_regex_status, or
# * create_status_or_event_msg reports an error (returns false).
def parse
  return false unless @first_line_is_valid

  @file_content = cleanup(@file_content).split("\n")

  @file_content.map! do |line|
    # "next" returns nil which is removed by compact. Empty strings are
    # skipped as well as whitespace-only ones.
    next if line =~ /\A\s*\z/

    if line =~ @line_regex
      create_msg($~.captures)
    elsif line =~ @line_regex_status
      msg = create_status_or_event_msg($~.captures)
      # Error occurred while parsing
      return false if msg == false
      # The block's value must be the message itself (or nil for an ignored
      # event). Ending the branch on the "return ... if" modifier would
      # evaluate to nil and drop every status and event from the log.
      msg
    else
      error "Could not parse line:"
      p line
      return false
    end
  end

  @file_content.compact!
  return LogFile.new(@file_content, @service, @user_SN, @partner_SN, @adium_chat_time_start)
end
# Prevent parse from being called directly from BasicParser, since
# it uses subclassing magic.
protected :parse
#################
private
#################
# Returns the time zone offset to append to Adium timestamps, as a String
# such as "-0500".
#
# The offset is taken from @src_path when the file name carries one
# (a signed number followed by a three-letter zone and a .txt/.htm/.html
# extension); otherwise the local UTC offset of the current time is used.
def get_time_zone_offset()
  # We must have a tz_offset or else the Adium Chat Log viewer
  # doesn't read the date correctly and then:
  # 1) the log has an empty start date column in the viewer
  # 2) The timestamps are all the same for the whole log
  tz_match = /([-\+]\d+)[A-Z]{3}\.(?:txt|htm|html)/.match(@src_path)
  return tz_match[1] if tz_match

  # Time#utc_offset is always an Integer number of seconds, so there is no
  # zone-name lookup that could come back nil. Working from the absolute
  # value keeps the minutes of half-hour zones and avoids the floor
  # rounding that integer division applies to negative offsets.
  seconds = Time.now.utc_offset
  sign = seconds < 0 ? '-' : '+'
  hours, remainder = seconds.abs.divmod(3600)
  return sprintf('%s%02d%02d', sign, hours, remainder / 60)
end
#--
# Adium time format: YYYY-MM-DD\THH:MM:SS[+-]TZ_HRS like:
# 2008-10-05T22:26:20-0800
# HOWEVER:
# If it's the first line, then return it like this (note periods):
# 2008-10-05T22.26.20-0800
# because it will be used in the filename.
#++
# Converts a pidgin datestamp to an Adium one.
#
# time::          the timestamp String. It may be a full "YYYY-MM-DD HH:MM:SS"
#                 stamp, a "M/D/YYYY H:MM:SS AM" stamp (first line only) or
#                 a bare "HH:MM:SS" with an optional " AM"/" PM" suffix; the
#                 date of a bare stamp is taken from @basic_time_info.
# is_first_line:: true when +time+ comes from the first line of the log.
#
# Returns the Adium-formatted String with @tz_offset appended.
def create_adium_time(time, is_first_line = false)
  meridiem = nil
  if (m = @time_regex.match(time))
    year, month, day, hour, min, sec = m.captures.map { |part| part.to_i }
  elsif is_first_line && (m = @time_regex_first_line.match(time))
    # Captures are ordered month, day, year, hour, min, sec, AM/PM.
    month, day, year, hour, min, sec = m.captures[0, 6].map { |part| part.to_i }
    meridiem = m[7]
  elsif (m = @minimal_time_regex.match(time))
    # "04:22:05" => %w{04 22 05}
    hour, min, sec = m.captures[0, 3].map { |part| part.to_i }
    year, month, day = @basic_time_info
    # The optional group captures the separating space too (" PM"), so it
    # has to be stripped before it can be compared.
    meridiem = m[4] && m[4].strip
  else
    error("You have found an odd timestamp. Please report it to the developer.")
    log_msg("The timestamp: #{time}")
    log_msg("Continuing...")
    year,month,day,hour,min,sec = ParseDate.parsedate(time)
  end

  # 12-hour to 24-hour clock: 12 AM is hour 0, 12 PM stays 12.
  if meridiem == 'PM' && hour != 12
    hour += 12
  elsif meridiem == 'AM' && hour == 12
    hour = 0
  end

  # Periods instead of colons on the first line, because that value is
  # used in a file name.
  separator = is_first_line ? '.' : ':'
  stamp = Time.local(year, month, day, hour, min, sec).strftime("%Y-%m-%dT%H#{separator}%M#{separator}%S")
  return stamp + @tz_offset.to_s
end
# Extracts the required data from the first line of the file. Run by
# initialize.
#
# The first line is special. It tells us (in order of regex groups):
# 1) who we're talking to
# 2) what time/date
# 3) what SN we used
# 4) what protocol (AIM, icq, jabber...)
#
# Returns [service, user_SN, partner_SN, basic_time_info,
# adium_chat_time_start]. basic_time_info is [year, month, day], or nil
# when the start time matches neither @time_regex nor
# @time_regex_first_line.
#
# Raises InvalidFirstLineError when @first_line is not a
# "Conversation with ..." header.
def pre_parse
  first_line_match = /Conversation with (.+?) at (.+?) on (.+?) \((.+?)\)/.match(@first_line)
  raise InvalidFirstLineError if first_line_match.nil?

  partner_SN, pidgin_chat_time_start, raw_user_SN, service = first_line_match.captures
  # The user's SN is normalized to avoid "AIM.name" and "AIM.na me" folders
  user_SN = raw_user_SN.downcase.tr(' ', '')

  # "then" is accepted by every Ruby version; the "when x: y" colon form
  # is a syntax error from Ruby 1.9 on.
  basic_time_info = case pidgin_chat_time_start
                    when @time_regex then [$1.to_i, $2.to_i, $3.to_i]
                    when @time_regex_first_line then [$3.to_i, $1.to_i, $2.to_i]
                    end

  adium_chat_time_start = create_adium_time(pidgin_chat_time_start, true)
  return [service,
          user_SN,
          partner_SN,
          basic_time_info,
          adium_chat_time_start]
end
# Maps an alias found on a chat line to a screen name.
#
# A leading "***" is dropped first, and the rest is compared (lowercased,
# whitespace removed) against @user_aliases. On a hit, @user_alias is
# updated to the un-normalized alias and @user_SN is returned; otherwise
# @partner_SN is returned and @user_alias is left alone.
def get_sender_by_alias(alias_name)
  display_alias = alias_name.sub(/^\*{3}/, '')
  normalized = display_alias.downcase.gsub(/\s+/, '')
  return @partner_SN unless @user_aliases.include?(normalized)

  # Remember which of the user's aliases is currently in use.
  @user_alias = display_alias
  @user_SN
end
#--
# create_msg receives the captures produced by matching a line against
# @line_regex: [timestamp, alias, auto-reply marker or nil, body]. Both
# TextLogParser and HtmlLogParser put their data at these indexes, so the
# method is shared between them.
#++
# Builds an AutoReplyMessage when the auto-reply marker is present and an
# XMLMessage otherwise.
def create_msg(matches)
  time = create_adium_time(matches[0])
  buddy_alias = matches[1]
  sender = get_sender_by_alias(buddy_alias)
  auto_reply_marker = matches[2]
  body = matches[3]

  message_class = auto_reply_marker ? AutoReplyMessage : XMLMessage
  message_class.new(sender, time, buddy_alias, body)
end
#--
# create_status_or_event_msg takes an array of +MatchData+ captures from
# matching against @line_regex_status and returns an Event or Status.
# Returns nil if it's a message that should be ignored, or false if an
# error occurred.
#++
# matches:: [time, text], e.g. ["22:58:00", "BuddyName logged in."]
#
# The text is tried, in this order, against @ignore_events (=> nil),
# @status_map (=> StatusMessage), @lib_purple_events (=> Event of type
# 'libpurpleEvent') and @event_map (=> Event of the mapped type). When
# nothing matches, an error is reported and false is returned.
def create_status_or_event_msg(matches)
  time = create_adium_time(matches[0])
  str = matches[1]

  # Return nil, which will get compact'ed out
  return nil if @ignore_events.detect { |ignored| str =~ ignored }

  status_regex, status = @status_map.detect { |candidate, _type| str =~ candidate }
  if status_regex
    # Status message
    buddy_alias = status_regex.match(str)[1]
    sender = get_sender_by_alias(buddy_alias)
    return StatusMessage.new(sender, time, buddy_alias, status)
  end

  # Test for event
  event_regex = @lib_purple_events.detect { |candidate| str =~ candidate }
  if event_regex
    event_type = 'libpurpleEvent'
  else
    # Not a libpurple event, try the others. detect returns the matching
    # [regex, event_type] pair itself; the $1/$2 globals only hold capture
    # groups of the last match and must not be used for this.
    event_regex, event_type = @event_map.detect { |candidate, _type| str =~ candidate }
    unless event_regex
      error(sprintf("Error parsing status or event message, no status or event found: %p", str))
      return false
    end
  end

  event_match = event_regex.match(str)
  if event_match.size == 1
    # No capture group, so no alias - this means it's the user
    buddy_alias = @user_alias
    sender = @user_SN
  else
    buddy_alias = event_match[1]
    sender = get_sender_by_alias(buddy_alias)
  end
  return Event.new(sender, time, buddy_alias, str, event_type)
end
end
# Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead of
# using this class directly.
class TextLogParser < BasicParser
# Sets up the patterns used by parse for plain-text logs, after the common
# setup done by BasicParser#initialize.
def initialize(src_path, user_aliases)
  super(src_path, user_aliases)
  @timestamp_rx = '\((\d{1,2}:\d{1,2}:\d{1,2})\)'

  # @line_regex matches a line in a TXT log file other than the first
  # @line_regex matchdata:
  # 0: timestamp
  # 1: screen name or alias, if alias set
  # 2: the auto-reply marker, or nil
  # 3: message body
  #
  # Group 2 must be nil for ordinary messages: create_msg treats any
  # non-nil value (even "") as an auto-reply, and an empty optional group
  # "()?" always captures "".
  # NOTE(review): the marker text is taken from HtmlLogParser's
  # "<AUTO-REPLY>"; the entity-escaped spelling is accepted too because
  # cleanup runs before the lines are matched -- confirm against real
  # text logs.
  @line_regex = /#{@timestamp_rx} (.*?) ?(<AUTO-REPLY>|&lt;AUTO-REPLY&gt;)?: (.*)/o

  # @line_regex_status matches a status line
  # @line_regex_status matchdata:
  # 0: timestamp
  # 1: status message
  @line_regex_status = /#{@timestamp_rx} ([^:]+)/o
end
public :parse
#################
private
#################
# Prepares the raw text of a log for line-by-line parsing. _text_ is
# modified in place and returned.
#
# * carriage returns are removed
# * the five XML special characters are replaced by their entities
# * newlines inside a message (those not followed by a timestamp or the
#   end of the text) are replaced by a line-break tag, so that every
#   remaining line is one chat line
def cleanup(text)
  text.tr!("\r", '')

  # Escape entities since this will be in XML. This happens before the
  # line-break tags are inserted so that the tags themselves stay intact.
  text.gsub!('&', '&amp;') # escape '&' first
  text.gsub!('<', '&lt;')
  text.gsub!('>', '&gt;')
  text.gsub!('"', '&quot;')
  text.gsub!("'", '&apos;')

  # Replace newlines with "<br/>" unless they end a chat line.
  # NOTE(review): the exact replacement tag is reconstructed as "<br/>" --
  # confirm against the upstream source.
  text.gsub!(/\n(?!#{@timestamp_rx}|\Z)/, '<br/>')

  return text
end
end
# Please use Pidgin2Adium.parse or Pidgin2Adium.parse_and_generate instead
# of using this class directly.
class HtmlLogParser < BasicParser
# Sets up the patterns used by parse for HTML logs, after the common setup
# done by BasicParser#initialize.
def initialize(src_path, user_aliases)
  super(src_path, user_aliases)
  @timestamp_rx = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'

  # @line_regex matches a line in an HTML log file other than the
  # first. The time matches on either "2008-11-17 14:12" or "14:12".
  # @line_regex match obj:
  # 0: timestamp, extended or not
  # 1: screen name or alias, if alias set
  # 2: "<AUTO-REPLY>" or nil
  # 3: message body
  # The ":" is optional to allow for strings like
  # "(17:12:21) ***Gabe B-W is confused"
  #
  # NOTE(review): both patterns below end in a newline character, which a
  # line produced by split("\n") in parse cannot contain. It is presumably
  # residue of markup that was lost from the patterns -- confirm against
  # the upstream source before relying on them.
  @line_regex = /#{@timestamp_rx} ?(.+?) ?(<AUTO-REPLY>)?:?<\/b> ?(.+)\n/o

  # @line_regex_status matches a status line
  # @line_regex_status match obj:
  # 0: timestamp
  # 1: status message
  @line_regex_status = /#{@timestamp_rx} ? (.+)<\/b>\n/o
end
public :parse
#################
private
#################
# Returns a cleaned string. _text_ is modified in place by the gsub!/tr!
# calls below.
# Removes the following tags from _text_:
# * html
# * body
# * font
# * a with no innertext, e.g.
# And removes the following style declarations:
# * color: #000000 (just turns text black)
# * font-family
# * font-size
# * background
# * em (really it's changed to )
# Since each has only one style declaration, spans with these
# declarations are removed (but the text inside them is preserved).
#
# NOTE(review): several literals and comments in this method look
# truncated: empty patterns and replacements such as // and '', comment
# sentences with missing words or split across lines, and a =begin with no
# visible =end. Presumably markup was stripped from this copy of the file;
# confirm against the upstream source before relying on or editing the
# code below.
def cleanup(text)
# Sometimes this is in there. I don't know why.
text.gsub!(%r{</FONT HSPACE='\d'>}, '')
# We can remove safely since Pidgin and Adium both show bold
# using except Pidgin uses single
# quotes while Adium uses double quotes.
text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!
text.tr!("\r", '')
# Remove empty lines
text.gsub!("\n\n", "\n")
# Remove newlines that end the file, since they screw up the
# newline ->
conversion
text.gsub!(/\n\Z/, '')
# Replace newlines with "
" unless they end a chat line.
# This must go after we remove tags.
text.gsub!(/\n(?!#{@timestamp_rx})/, '
')
# These empty links are sometimes appended to every line in a chat,
# for some weird reason. Remove them.
text.gsub!(%r{\s*?}, '')
# Replace single quotes inside tags with double quotes so we can
# easily change single quotes to entities.
# For spans, removes a space after the final declaration if it exists.
text.gsub!(//, '')
text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
# NOTE(review): no =end is visible for the =begin on the next line --
# confirm where the disabled section is meant to stop.
=begin
text.gsub!(//, '')
text.gsub!(/(.*?)}) do |s|
text.gsub!(%r{(.*?)}) do |s|
# Remove empty spans.
next if $2 == ''
# style = style declaration
# innertext = text inside
style, innertext = $1, $2
# TODO: replace double quotes with """, but only outside tags; may still be tags inside spans
# innertext.gsub!("")
styleparts = style.split(/; ?/)
styleparts.map! do |p|
if p[0,5] == 'color'
if p.include?('color: #000000')
next
elsif p =~ /(color: #[0-9a-fA-F]{6})(>.*)?/
# Regarding the bit with the ">", sometimes this happens:
# today was busy
# Then p = "color: #000000>today"
# Or it can end in ">;", with no text before the semicolon.
# So keep the color but remove the ">" and anything following it.
next($1)
end
else
# don't remove font-weight
case p
when /^font-family/: next
when /^font-size/: next
when /^background/: next
end
end
end.compact!
unless styleparts.empty?
style = styleparts.join('; ')
innertext = "#{innertext}"
end
innertext
end
# Pidgin uses , Adium uses
if text.gsub!('', '')
text.gsub!('', '')
end
return text
end
end
# Base holder for one line of the chat. It only stores the three values
# every line has; the subclasses (XMLMessage, AutoReplyMessage,
# StatusMessage, Event) each add their own to_s, which prints the line in
# the form used for an Adium log file. Message itself defines no to_s.
class Message
  # sender::      the sender's screen name
  # time::        when the message was sent, in Adium format (e.g.
  #               "2008-10-05T22:26:20-0800")
  # buddy_alias:: the alias that goes with the line (NOT a screen name)
  attr_accessor :sender, :time, :buddy_alias

  def initialize(sender, time, buddy_alias)
    @sender, @time, @buddy_alias = sender, time, buddy_alias
  end
end
# Basic message with body text (as opposed to pure status messages, which
# have no body).
class XMLMessage < Message
  attr_accessor :body

  # body:: the text of the message. It is normalized (see normalize_body!)
  #        before the styled body used by to_s is built from it.
  def initialize(sender, time, buddy_alias, body)
    super(sender, time, buddy_alias)
    @body = body
    # Normalize first: building @styled_body from the raw body would leave
    # the escaping, tag balancing and action markers out of to_s.
    normalize_body!()
    # NOTE(review): the wrapper is just the body followed by a newline; it
    # looks like styling markup was lost from this format -- confirm.
    @styled_body = "%s\n" % @body
  end

  # Returns the message as one line for the log file.
  # NOTE(review): the format has a single "%s" although four values are
  # passed, so only @sender is printed; presumably the surrounding markup
  # was lost from this string -- confirm against the upstream source.
  def to_s
    return sprintf('%s' << "\n",
                   @sender, @time, @buddy_alias, @styled_body)
  end

  #################
  private
  #################

  # Balances mismatched tags, normalizes body style, and fixes actions
  # so they are in Adium style (Pidgin uses "***Buddy waves at you", Adium
  # uses "*Buddy waves at you*").
  def normalize_body!
    normalize_body_entities!()
    # Fix mismatched tags. Yes, it's faster to do it per-message
    # than all at once.
    @body = Pidgin2Adium.balance_tags_c(@body)
    if @buddy_alias[0,3] == '***'
      # "***" is what pidgin sets as the alias for a /me action.
      # A new string is assigned instead of slicing in place, so the
      # caller's alias string is not modified.
      @buddy_alias = @buddy_alias[3..-1]
      @body = '*' << @body << '*'
    end
  end

  # Escapes entities: converts '&' to '&amp;' unless it already starts one
  # of the entities lt, gt, amp, quot or apos (including the closing ';').
  def normalize_body_entities!
    @body.gsub!(/&(?!(?:lt|gt|amp|quot|apos);)/, '&amp;')
  end
end
# An XMLMessage that was sent automatically (an auto reply).
class AutoReplyMessage < XMLMessage
  # Returns the message as one line for the log file.
  # NOTE(review): the format holds one "%s" for four values -- confirm
  # that no markup was lost from it.
  def to_s
    line_format = '%s' << "\n"
    sprintf(line_format, @sender, @time, @buddy_alias, @styled_body)
  end
end
# A status change, e.g. "Blahblah has gone away."
class StatusMessage < Message
  attr_accessor :status

  # status:: the status String for this line (the parser passes values of
  #          its @status_map, such as 'away')
  def initialize(sender, time, buddy_alias, status)
    super(sender, time, buddy_alias)
    @status = status
  end

  # Returns the status as one line for the log file.
  # NOTE(review): the format is empty apart from the newline although four
  # values are passed -- confirm that no markup was lost from it.
  def to_s
    line_format = '' << "\n"
    sprintf(line_format, @status, @sender, @time, @buddy_alias)
  end
end
# Adium has Events, Pidgin does not: Pidgin mostly shows them as system
# messages. Examples are sending a file, starting a Direct IM connection,
# or an error in chat.
class Event < XMLMessage
  attr_accessor :event_type

  # event_type:: the Adium event type String for this line
  def initialize(sender, time, buddy_alias, body, event_type)
    super(sender, time, buddy_alias, body)
    @event_type = event_type
  end

  # Returns the event for the log file.
  # NOTE(review): the format holds one "%s" for five values -- confirm
  # that no markup was lost from it.
  def to_s
    line_format = '%s'
    sprintf(line_format, @event_type, @sender, @time, @buddy_alias, @styled_body)
  end
end
end # end module