lib/imw/parsers/html_parser/matchers.rb in imw-0.1.0 vs lib/imw/parsers/html_parser/matchers.rb in imw-0.1.1

- old
+ new

@@ -1,306 +1,289 @@ - - - -# -# h2. lib/imw/parsers/html_parser/matcher.rb -- utility classes for html parser -# -# == About -# -# This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt> -# abstract class and some concrete subclasses which perform specific -# kinds of matches against HTML documents using the -# Hpricot[https://code.whytheluckystiff.net/hpricot/] library. -# -# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org) -# Copyright:: Copyright (c) 2008 infochimps.org -# License:: GPL 3.0 -# Website:: http://infinitemonkeywrench.org/ -# -# puts "#{File.basename(__FILE__)}: Something clever" # at bottom - require 'imw/utils/extensions/hpricot' module IMW - module HTMLParserMatcher + module Parsers + module HtmlMatchers - # An abstract class from which to subclass specific HTML matchers. - # - # A subclass is initialized with a +selector+ and an optional - # +matcher+. The +selector+ is an HTML path specification used to - # collect elements from the document. If initialized with a - # +matcher+, the +matcher+ is used to return match information - # from the elements; else the inner HTML is returned. Subclasses - # decide how the +selector+ will collect elements. - class Matcher - - attr_accessor :selector - attr_accessor :matcher - attr_accessor :options - - def initialize selector, matcher=nil, options={} - self.selector = selector - self.matcher = matcher - self.options = options - end + # An abstract class from which to subclass specific HTML matchers. + # + # A subclass is initialized with a +selector+ and an optional + # +matcher+. The +selector+ is an HTML path specification used to + # collect elements from the document. If initialized with a + # +matcher+, the +matcher+ is used to return match information + # from the elements; else the inner HTML is returned. Subclasses + # decide how the +selector+ will collect elements. + class Matcher + + attr_accessor :selector + attr_accessor :matcher + attr_accessor :options + + def initialize selector, matcher=nil, options={} + self.selector = selector + self.matcher = matcher + self.options = options + end - def match doc - raise "Abstract class #{self.class}" + def match doc + raise "Abstract class #{self.class}" + end + end - - end - # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> - # for matching against the first element of a document matching a - # selector. - class MatchFirstElement < Matcher - # Grab the first element from +doc+ matching the +selector+ this - # class was initialized with. If initialized with a +matcher+, - # then return the +matcher+'s match against the first element, - # else just return the inner HTML of the first element. - # - # m = MatchFirstElement.new('span#bio/a.homepage') - # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>') - # # => 'My Homepage' - def match doc - doc = Hpricot(doc) if doc.is_a?(String) - el = doc.at(selector) or return nil - if matcher - matcher.match(el) - else - options[:html] ? el.inner_html : el.inner_text.strip + # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> + # for matching against the first element of a document matching a + # selector. + class MatchFirstElement < Matcher + # Grab the first element from +doc+ matching the +selector+ this + # class was initialized with. If initialized with a +matcher+, + # then return the +matcher+'s match against the first element, + # else just return the inner HTML of the first element. + # + # m = MatchFirstElement.new('span#bio/a.homepage') + # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>') + # # => 'My Homepage' + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + el = doc.at(selector) or return nil + if matcher + matcher.match(el) + else + options[:html] ? el : el.inner_text.strip + end end end - end - # FIXME is there really a need for this separate class? why can't - # MatchFirstElement.match accept a block? - class MatchProc < MatchFirstElement - attr_accessor :proc - attr_accessor :options - def initialize selector, proc, matcher=nil, options={} - super selector, matcher - self.options = options - self.proc = proc - end - def match doc - val = super doc - val ? self.proc.call(val) : self.proc.call(doc) - end - end + # FIXME is there really a need for this separate class? why can't + # MatchFirstElement.match accept a block? + class MatchProc < MatchFirstElement + attr_accessor :proc + attr_accessor :options + def initialize selector, proc, matcher=nil, options={} + super selector, matcher + self.options = options + self.proc = proc + end + def match doc + val = super doc + val ? self.proc.call(val) : self.proc.call(doc) + end + end - # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> - # for matching each element of a document matching a selector. - class MatchArray < Matcher - # Grab each element from +doc+ matching the +selector+ this - # class was initialized with. If initialized with a +matcher+, - # then return an array consisting of the +matcher+'s match - # against each element, else just return an array consisting of - # the inner HTML of each element. - # - # m = MatchArray.new('span#bio/a.homepage') - # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span> - # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span> - # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>') - # # => ["My Homepage", "Your Homepage", "Their Homepage"] - def match doc - doc = Hpricot(doc) if doc.is_a?(String) - subdoc = (doc/selector) or return nil - if matcher - subdoc.map{|el| matcher.match(el)} - else - if options[:html] - subdoc.map{|el| el.inner_html } + # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> + # for matching each element of a document matching a selector. + class MatchArray < Matcher + # Grab each element from +doc+ matching the +selector+ this + # class was initialized with. If initialized with a +matcher+, + # then return an array consisting of the +matcher+'s match + # against each element, else just return an array consisting of + # the inner HTML of each element. + # + # m = MatchArray.new('span#bio/a.homepage') + # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span> + # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span> + # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>') + # # => ["My Homepage", "Your Homepage", "Their Homepage"] + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + subdoc = (doc/selector) or return nil + if matcher + subdoc.map{|el| matcher.match(el)} else - subdoc.map{|el| el.inner_text.strip } + if options[:html] + subdoc.map{|el| el } + else + subdoc.map{|el| el.inner_text.strip } + end end end end - end - # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> - # for matching an attribute of the first element of a document - # matching a selector. - class MatchAttribute < Matcher + # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> + # for matching an attribute of the first element of a document + # matching a selector. + class MatchAttribute < Matcher - attr_accessor :attribute + attr_accessor :attribute - # Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>, - # <tt>IMW::HTMLParserMatcher::MatchAttribute</tt> is initialized - # with three arguments: the +selector+ which collects elements - # from an HTML document, an +attribute+ to extract, and - # (optionally) a +matcher+ to perform the matching. - def initialize selector, attribute, matcher=nil - super selector, matcher - self.attribute = attribute.to_s + # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>, + # <tt>IMW::Parsers::HtmlMatchers::MatchAttribute</tt> is initialized + # with three arguments: the +selector+ which collects elements + # from an HTML document, an +attribute+ to extract, and + # (optionally) a +matcher+ to perform the matching. + def initialize selector, attribute, matcher=nil + super selector, matcher + self.attribute = attribute.to_s + end + + # Grab the first element from +doc+ matching the +selector+ this + # class was initialized with. If initialized with a +matcher+, + # then return the +matcher+'s match against the value of the + # +attribute+ this class was initialized with, else just return + # the value of the +attribute+. + # + # m = MatchAttribute.new('span#bio/a.homepage', 'href') + # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>') + # # => 'http://foo.bar' + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + val = doc.path_attr(selector, attribute) + matcher ? matcher.match(val) : val + end end - - # Grab the first element from +doc+ matching the +selector+ this - # class was initialized with. If initialized with a +matcher+, - # then return the +matcher+'s match against the value of the - # +attribute+ this class was initialized with, else just return - # the value of the +attribute+. - # - # m = MatchAttribute.new('span#bio/a.homepage', 'href') - # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>') - # # => 'http://foo.bar' - def match doc - doc = Hpricot(doc) if doc.is_a?(String) - val = doc.path_attr(selector, attribute) - matcher ? matcher.match(val) : val - end - end - # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> - # for using a regular expression to match against text in an HTML - # document. - class MatchRegexp < Matcher - - attr_accessor :re - attr_accessor :options + # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> + # for using a regular expression to match against text in an HTML + # document. + class MatchRegexp < Matcher + + attr_accessor :re + attr_accessor :options - # Use the regular expression +re+ to return captures from the - # elements collected by +selector+ (treated as text) used on an - # HTML document (if +selector+ is +nil+ then match against the - # full text of the document). If the keyword argument - # <tt>:capture</tt> is specified then return the corresponding - # group (indexing is that of regular expressions; "1" is the - # first capture), else return an array of all captures. If - # +matcher+, then use it on the capture(s) before returning. - # - # FIXME Shouldn't the matcher come BEFORE the regexp capture, - # not after? - def initialize selector, re, matcher=nil, options={} - super selector, matcher - self.options = options - self.re = re - end + # Use the regular expression +re+ to return captures from the + # elements collected by +selector+ (treated as text) used on an + # HTML document (if +selector+ is +nil+ then match against the + # full text of the document). If the keyword argument + # <tt>:capture</tt> is specified then return the corresponding + # group (indexing is that of regular expressions; "1" is the + # first capture), else return an array of all captures. If + # +matcher+, then use it on the capture(s) before returning. + # + # FIXME Shouldn't the matcher come BEFORE the regexp capture, + # not after? + def initialize selector, re, matcher=nil, options={} + super selector, matcher + self.options = options + self.re = re + end - # Grab the first element from +doc+ matching the +selector+ this - # object was initialized with. Use the +re+ and the (optional) - # capture group this object was initialized with to capture a - # string (or array of strings if no capture group was specified) - # from the collected element (treated as text). If initialized - # with a +matcher+, then return the +matcher+'s match against - # the value of the capture(s), else just return the capture(s). - # - # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 ) - # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>') - # # => "John Chimpo" - def match doc - doc = Hpricot(doc) if doc.is_a?(String) - el = selector ? doc.contents_of(selector) : doc - m = re.match(el.to_s) - val = case - when m.nil? then nil - when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing - else m.captures - end - # pass to matcher, if any - matcher ? matcher.match(val) : val + # Grab the first element from +doc+ matching the +selector+ this + # object was initialized with. Use the +re+ and the (optional) + # capture group this object was initialized with to capture a + # string (or array of strings if no capture group was specified) + # from the collected element (treated as text). If initialized + # with a +matcher+, then return the +matcher+'s match against + # the value of the capture(s), else just return the capture(s). + # + # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 ) + # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>') + # # => "John Chimpo" + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + el = selector ? doc.contents_of(selector) : doc + m = re.match(el.to_s) + val = case + when m.nil? then nil + when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing + else m.captures + end + # pass to matcher, if any + matcher ? matcher.match(val) : val + end end - end - - class MatchRegexpRepeatedly < Matcher - attr_accessor :re - def initialize selector, re, matcher=nil - super selector, matcher - self.re = re + + class MatchRegexpRepeatedly < Matcher + attr_accessor :re + def initialize selector, re, matcher=nil + super selector, matcher + self.re = re + end + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + # apply selector, if any + el = selector ? doc.contents_of(selector) : doc + return unless el + # get all matches + val = el.to_s.scan(re) + # if there's only one capture group, flatten the array + val = val.flatten if val.first && val.first.length == 1 + # pass to matcher, if any + matcher ? matcher.match(val) : val + end end - def match doc - doc = Hpricot(doc) if doc.is_a?(String) - # apply selector, if any - el = selector ? doc.contents_of(selector) : doc - return unless el - # get all matches - val = el.to_s.scan(re) - # if there's only one capture group, flatten the array - val = val.flatten if val.first && val.first.length == 1 - # pass to matcher, if any - matcher ? matcher.match(val) : val - end - end - - # Class for building a hash of values by using appropriate - # matchers against an HTML document. - class MatchHash + + # Class for building a hash of values by using appropriate + # matchers against an HTML document. + class MatchHash - attr_accessor :match_hash + attr_accessor :match_hash - # The +match_hash+ must be a +Hash+ of symbols matched to HTML - # matchers (subclasses of - # <tt>IMW::HTMLParserMatcher::Matcher</tt>). - def initialize match_hash - # Kludge? maybe. - raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash) - self.match_hash = match_hash + # The +match_hash+ must be a +Hash+ of symbols matched to HTML + # matchers (subclasses of + # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>). + def initialize match_hash + # Kludge? maybe. + raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash) + self.match_hash = match_hash + end + + # Use the +match_hash+ this +MatchHash+ was initialized with to + # select elements from +doc+ and extract information from them: + # + # m = MatchHash.new({ + # :name => MatchFirstElement.new('li/span.customer'), + # :order_status => MatchAttribute.new('li/ul[@status]','status'), + # :products => MatchArray.new('li/ul/li') + # }) + # m.match('<li><span class="customer">John Chimpo</span> + # <ul status="shipped"> + # <li>bananas</li> + # <li>mangos</li> + # <li>banangos</li> + # </ul></li>') + # # => { + # :name => "John Chimpo", + # :order_status => "shipped", + # :products => ["bananas", "mangos", "banangos"] + # } + def match doc + doc = Hpricot(doc) if doc.is_a?(String) + hsh = { } + match_hash.each do |attr, m| + val = m.match(doc) + case attr + when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val + else hsh[attr] = val end + end + self.class.scrub!(hsh) + end + + # kill off keys with nil values + def self.scrub! hsh + hsh # .reject{|k,v| v.nil? } + end end - # Use the +match_hash+ this +MatchHash+ was initialized with to - # select elements from +doc+ and extract information from them: # - # m = MatchHash.new({ - # :name => MatchFirstElement.new('li/span.customer'), - # :order_status => MatchAttribute.new('li/ul[@status]','status'), - # :products => MatchArray.new('li/ul/li') - # }) - # m.match('<li><span class="customer">John Chimpo</span> - # <ul status="shipped"> - # <li>bananas</li> - # <li>mangos</li> - # <li>banangos</li> - # </ul></li>') - # # => { - # :name => "John Chimpo", - # :order_status => "shipped", - # :products => ["bananas", "mangos", "banangos"] - # } - def match doc - doc = Hpricot(doc) if doc.is_a?(String) + # construct the downstream part of a hash matcher + # + def self.build_match_hash spec_hash hsh = { } - match_hash.each do |attr, m| - val = m.match(doc) - case attr - when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val - else hsh[attr] = val end + spec_hash.each do |attr, spec| + hsh[attr] = build_parse_tree(spec) end - self.class.scrub!(hsh) + hsh end - - # kill off keys with nil values - def self.scrub! hsh - hsh # .reject{|k,v| v.nil? } - end - end - # - # construct the downstream part of a hash matcher - # - def self.build_match_hash spec_hash - hsh = { } - spec_hash.each do |attr, spec| - hsh[attr] = build_parse_tree(spec) - end - hsh - end - - # - # recursively build a tree of matchers - # - def self.build_parse_tree spec - case spec - when nil then nil - when Matcher then spec - when Hash then MatchHash.new(build_match_hash(spec)) - when Array then - return nil if spec.empty? - raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2) - MatchArray.new(spec[0].to_s, build_parse_tree(spec[1])) - when String then MatchFirstElement.new(spec) - when Proc then MatchProc.new(nil, spec) - when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1) - else raise "Don't know how to parse #{spec.inspect}" + # + # recursively build a tree of matchers + # + def self.build_parse_tree spec + case spec + when nil then nil + when Matcher then spec + when Hash then MatchHash.new(build_match_hash(spec)) + when Array then + return nil if spec.empty? + raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2) + MatchArray.new(spec[0].to_s, build_parse_tree(spec[1])) + when String then MatchFirstElement.new(spec) + when Proc then MatchProc.new(nil, spec) + when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1) + when Symbol then MatchAttribute.new(nil, spec, nil) + else raise "Don't know how to parse #{spec.inspect}" + end end end end end