#!ruby -Ku
#-*- encoding: utf-8 -*-

require 'time'
require 'uri'

class Timelog4r
  # Helpers that turn timelog.jp HTML fragments into plain Ruby hashes.
  #
  # Elements handed to these methods must respond to +at+, +search+, +attr+,
  # +inner_text+ and +has_attribute?+ (the interface used throughout this
  # module). parse_timeline additionally needs Mechanize to be loaded by the
  # including code and reads the includer's @agent.
  module HTML_Parser
    # Raised internally when a mandatory element is missing from the markup.
    class ParseError < StandardError; end

    # Maps a permission code string to a symbol.
    #
    # @param string [String] '0', '1' or '2'
    # @return [Symbol] :public, :friends_only, :private, or :unknown for
    #   anything else
    def permission_to_sym(string)
      case string
      when '0' then :public
      when '1' then :friends_only
      when '2' then :private
      else :unknown
      end
    end

    # Not implemented yet; returns nil.
    def tags_to_a(tag_string)
    end

    # NOTE(review): unfinished. It only names the selector it would probably
    # use, and therefore currently returns that String (always truthy).
    def is_group?(entry_element)
      group_element = 'span[@class="time"]/a[@class="name"]'
    end

    # Not implemented yet; returns nil.
    def has_parent_entry?(entry_element)
    end

    # Not implemented yet; returns nil.
    def has_child_entry?(entry_element)
    end

    # Not implemented yet; returns nil.
    def parse_children_entries(entries_element)
    end

    # Extracts one memo entry from its element.
    #
    # @param state_element [#at, #search] element wrapping a single entry
    # @return [Hash, nil] a hash with the keys :memo_text, :memo_id,
    #   :permission, :modified, :permalink, :author, :in_group, :star and
    #   :res_count, plus :reply_to and :tag when present. Returns nil (after
    #   printing the error with +p+) when a mandatory element is missing.
    def parse_entry(state_element)
      permalink_element  = 'span[@class="time"]/a[3]'
      memo_element       = 'h3'
      author_element     = 'span[@class="time"]/a[@class="name"]'
      group_element      = 'h3/a[@class="name"]'
      permission_element = 'img[@class="icn_left"]'
      star_element       = 'img[@src="http://img.timelog.jp/star.gif"]'
      res_count_element  = 'img[@src="http://img.timelog.jp/comment.gif"]'
      tag_list_element   = 'span[@class="tag"]'
      reply_to_element   = 'h3/a'

      result = {}

      memo = find_required(state_element, memo_element, 'not found memo_text.')
      result[:memo_text] = memo.inner_text

      # The permalink anchor supplies the id, the timestamp and the URL.
      permalink = find_required(state_element, permalink_element, 'not found memo_id.')
      href = permalink.attr('href')
      # The first seven characters of the href are dropped to get the id.
      result[:memo_id] = href[7..-1]

      icon = find_required(state_element, permission_element, 'not found permission.')
      icon_match = icon.attr('src').match(/icon_(public|friend)\.gif/)
      result[:permission] = icon_match ? icon_match[1].to_sym : nil

      # Non-greedy gap: a greedy ".+" would swallow the leading hour digits
      # and turn "10:30" into "0:30". The markup carries no year, so the
      # current one is assumed.
      stamp = permalink.inner_text.match(/(\d+\/\d+).+?(\d+:\d+)/)
      result[:modified] =
        stamp ? Time.parse("#{Time.now.year}/#{stamp[1]} #{stamp[2]}") : nil

      result[:permalink] = URI.parse('http://timelog.jp/' + href)

      author = find_required(state_element, author_element, 'not found author.')
      result[:author] = parse_author(author)

      group = find_required(state_element, group_element, 'not found group.')
      result[:in_group] = parse_group(group)

      # Anchors in the heading that carry neither a class nor a target
      # attribute are treated as reply links; the last one wins.
      reply_links = state_element.search(reply_to_element).reject do |link|
        link.has_attribute?('class') || link.has_attribute?('target')
      end
      unless reply_links.empty?
        link = reply_links.last
        result[:reply_to] = {
          :author => {
            :user_id     => account_from_href(link.attr('href')),
            :screen_name => link.inner_text
          }
        }
      end

      # result[:todo] not supported.

      tags = state_element.at(tag_list_element)
      result[:tag] = parse_tag_list(tags) if tags

      # Kept as a one-element array wrapping a hash because that is the shape
      # this method has always returned.
      star = state_element.at(star_element)
      result[:star] = [{ :count => star ? first_integer(star.attr('alt')) : 0 }]

      res = find_required(state_element, res_count_element, 'not found res count.')
      result[:res_count] = first_integer(res.inner_text)

      # Strip the decorations from the memo text: the group name (matched
      # literally), a trailing " > name ..." reply marker, and a trailing
      # " [tags]" block.
      memo_text = result[:memo_text].sub(result[:in_group][:name], '')
      memo_text = memo_text.sub(/(\s>\s\w+).+$/, '')
      memo_text = memo_text.gsub(/(\s\[.+\])$/, '')
      result[:memo_text] = memo_text.strip

      result
    rescue ParseError => e
      p e
      nil
    end

    # Builds the author hash from an author anchor.
    #
    # @param author_element [#attr, #inner_text]
    # @return [Hash] :user_id (subdomain of the href, or nil) and :name (the
    #   anchor text without its first and last character)
    def parse_author(author_element)
      {
        :user_id => account_from_href(author_element.attr('href')),
        :name    => author_element.inner_text[1..-2]
      }
    end

    # Builds the group hash from a group anchor.
    #
    # @param group_element [#attr, #inner_text]
    # @return [Hash] :group_id (subdomain of the href, or nil) and :name
    def parse_group(group_element)
      {
        :group_id => account_from_href(group_element.attr('href')),
        :name     => group_element.inner_text
      }
    end

    # Parses a timeline page.
    #
    # @param timeline_element passed as the body argument of
    #   Mechanize::Page.new (presumably the raw HTML of the page; confirm
    #   against the caller)
    # @return [Hash, nil] :title, :link, :modified and :entries, or nil when
    #   the timeline or its entries cannot be found. :entries may contain nil
    #   for entries that parse_entry could not parse.
    def parse_timeline(timeline_element)
      document = Mechanize::Page.new(
        URI.parse('http://timelog.jp/home/'),
        { 'content-type' => 'text/html' },
        timeline_element,
        '200',
        @agent
      )
      timeline_selector = 'ul#timeline'
      entry_selector    = 'div#list_1/li'

      result = {
        :title    => document.at('title').inner_text,
        :link     => URI.parse('http://timelog.jp/'),
        :modified => Time.now
      }

      begin
        timeline = document.search(timeline_selector)
        raise ParseError, 'not found timeline element.' if timeline.empty?

        entries = timeline.search(entry_selector)
        raise ParseError, 'not found entry elements.' if entries.empty?

        result[:entries] = entries.map { |entry| parse_entry(entry) }
      rescue ParseError => e
        p e
        return nil
      end

      result
    end

    # Not implemented yet; returns nil.
    def parse_tag(tag_element)
    end

    # Collects the text of every anchor inside a tag list element.
    #
    # @param tag_list_element [#search]
    # @return [Array<String>]
    def parse_tag_list(tag_list_element)
      tag_list_element.search('a').map { |tag| tag.inner_text }
    end

    # Not implemented yet; returns nil.
    def parse_user(user_element)
    end

    # Not implemented yet; returns nil.
    def parse_user_list(user_list_element)
    end

    # Not implemented yet; returns nil.
    def parse_profile(profile_element)
    end

    private

    # Returns the first match of +selector+ under +element+, or raises
    # ParseError with +message+ when there is none.
    def find_required(element, selector, message)
      found = element.at(selector)
      raise ParseError, message unless found
      found
    end

    # Returns the "xxx" part of "//xxx.timelog.jp" in +href+, or nil.
    def account_from_href(href)
      matched = href.to_s.match(/\/\/(.+)\.timelog\.jp/)
      matched ? matched[1] : nil
    end

    # Returns the first run of digits in +text+ as an Integer, or 0.
    def first_integer(text)
      text.to_s[/\d+/].to_i
    end
  end
end