lib/imw/parsers/html_parser.rb in imw-0.1.0 vs lib/imw/parsers/html_parser.rb in imw-0.1.1

- old
+ new

@@ -189,194 +189,199 @@ # # puts "#{File.basename(__FILE__)}: Something clever" # at bottom require 'imw/parsers/html_parser/matchers' -class IMW::HTMLParser +module IMW + module Parsers + class HtmlParser - include IMW::HTMLParserMatcher + include IMW::Parsers::HtmlMatchers - attr_accessor :parse_tree + attr_accessor :parse_tree - # - # Parse Tree - # - def initialize arg_spec=nil - spec = arg_spec || self.class.parser_spec - self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec) - end + # + # Parse Tree + # + def initialize arg_spec=nil + spec = arg_spec || self.class.parser_spec + self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec) + end - # - # See IMW::HTMLParser for syntax - # - # - def self.parser_spec - raise "Override this to create your own parser spec" - end + # + # See IMW::HtmlParser for syntax + # + # + def self.parser_spec + raise "Override this to create your own parser spec" + end - # - # Walk - # - def parse doc - self.parse_tree.match(doc) - end + # + # Walk + # + def parse doc + self.parse_tree.match(doc) + end - # one("hpricot_path") first match to hpricot_path - # one("hpricot_path", /spec/) applies spec to first match to hpricot_path - # - def self.one selector, matcher - MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher)) - end - # match the +attr+ attribute of the first element given by +selector+ - def self.attr selector, attr, matcher=nil - MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher)) - end - # shorthand for +attr(foo, 'href')+ - def self.href selector, matcher=nil - self.attr(selector, 'href', matcher) - end - # shorthand for +attr(foo, 'src')+ - def self.src selector, matcher=nil - self.attr(selector, 'src', matcher) - end + # one("hpricot_path") first match to hpricot_path + # one("hpricot_path", /spec/) applies spec to first match to hpricot_path + # + def self.one selector, matcher + MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher)) + end + # match the +attr+ attribute of the first element given by +selector+ + def self.attr selector, attr, matcher=nil + MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher)) + end + # shorthand for +attr(foo, 'href')+ + def self.href selector, matcher=nil + self.attr(selector, 'href', matcher) + end + # shorthand for +attr(foo, 'src')+ + def self.src selector, matcher=nil + self.attr(selector, 'src', matcher) + end - def self.proc selector, proc, matcher=nil - MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher)) - end + def self.proc selector, proc, matcher=nil + MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher)) + end - # strip ","s (!! thus disrespecting locale !!!) - # and convert to int - def self.to_num selector, matcher=nil - proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher - end - def self.to_json selector, matcher=nil - proc selector, lambda{|v| v.to_json if v }, matcher - end + # strip ","s (!! thus disrespecting locale !!!) + # and convert to int + def self.to_num selector, matcher=nil + proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher + end + def self.to_json selector, matcher=nil + proc selector, lambda{|v| v.to_json if v }, matcher + end - def self.strip selector, matcher=nil - proc selector, lambda{|v| v.strip }, matcher - end + def self.strip selector, matcher=nil + proc selector, lambda{|v| v.strip }, matcher + end - def self.re_group selector, re - MatchRegexp.new(selector, re) - end - def self.re selector, re - MatchRegexp.new(selector, re, nil, :capture => 1) - end - def self.re_all selector, re, matcher=nil - MatchRegexpRepeatedly.new(selector, re) - end + def self.re_group selector, re + MatchRegexp.new(selector, re) + end + def self.re selector, re + MatchRegexp.new(selector, re, nil, :capture => 1) + end + def self.re_all selector, re, matcher=nil + MatchRegexpRepeatedly.new(selector, re) + end - # def self.plain_text selector, matcher=nil - # proc selector, lambda{|el| el.inner_text if el }, matcher - # end + # def self.plain_text selector, matcher=nil + # proc selector, lambda{|el| el.inner_text if el }, matcher + # end - # attr_accessor :mapping - # - # # - # # Feed me a hash and I'll semantify HTML - # # - # # The hash should magically adhere to the too-complicated, - # # ever evolving goatrope that works for the below - # # - # # - # def initialize mapping - # self.mapping = mapping - # end - # - # # - # # take a document subtree, - # # and a mapping of hpricot paths to that subtree's data mapping - # # recursively extract that datamapping - # # - # def extract_tree hdoc, content, sub_mapping - # data = { } - # sub_mapping.each do |selector, target| - # data[selector] = [] - # sub_contents = content/selector - # sub_contents.each do |sub_content| - # sub_data = {} - # extract_node hdoc, sub_content, sub_data, selector, target - # data[selector] << sub_data - # end - # end - # data - # # end - # # if selector.is_a?(String) - # # conts = (content) - # # else - # # conts = [content] - # # end - # # conts[0..0].each do |content| - # # extract_node hdoc, content, data, selector, target - # # end - # # end - # data - # end - # - # # - # # insert the extracted element into the data mapping - # # - # def extract_node hdoc, content, data, selector, target - # classification = classify_node(selector, target) - # result = \ - # case classification - # when :subtree - # target.each do |sub_selector, sub_target| - # extract_node hdoc, content, data, sub_selector, sub_target - # end - # - # when :sub_attribute - # k, v = selector.to_a[0] - # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k)) - # val = subcontent.attributes[v.to_s] if subcontent - # data[target] = val unless val.blank? - # - # when :attribute then - # val = content.attributes[selector.to_s] - # data[target] = val unless val.blank? - # - # when :flatten_list - # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector) - # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html } - # - # when :inner_html - # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector)) - # data[target] = subcontent.inner_html.strip if subcontent - # - # else - # raise "classify_node shouldn't ever return #{classification}" - # end - # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute) - # # puts '' if classification == :subtree - # end - # - # def classify_node selector, target - # case - # when target.is_a?(Hash) then :subtree - # when selector.is_a?(Hash) && (selector.length == 1) then - # k, v = selector.to_a[0] - # case v - # when Symbol then :sub_attribute - # end - # when selector.is_a?(Symbol) then :attribute - # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list - # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html - # else - # raise "Can't classify mapping: " + [selector, target].join(" - ") - # end - # end - # - # # use #mapping to parse file - # def parse link - # begin hdoc = Hpricot(link.contents) - # rescue; warn "can't hpricot #{link.to_s}" ; return false; end - # raw_taggings = extract_tree hdoc, hdoc, self.mapping - # end - # - # # use #mapping to parse file - # def parse_file filename - # begin hdoc = Hpricot(File.open(filename)) - # rescue; warn "can't hpricot #{filename}" ; return false; end - # raw_taggings = extract_tree hdoc, hdoc, self.mapping - # end + # attr_accessor :mapping + # + # # + # # Feed me a hash and I'll semantify HTML + # # + # # The hash should magically adhere to the too-complicated, + # # ever evolving goatrope that works for the below + # # + # # + # def initialize mapping + # self.mapping = mapping + # end + # + # # + # # take a document subtree, + # # and a mapping of hpricot paths to that subtree's data mapping + # # recursively extract that datamapping + # # + # def extract_tree hdoc, content, sub_mapping + # data = { } + # sub_mapping.each do |selector, target| + # data[selector] = [] + # sub_contents = content/selector + # sub_contents.each do |sub_content| + # sub_data = {} + # extract_node hdoc, sub_content, sub_data, selector, target + # data[selector] << sub_data + # end + # end + # data + # # end + # # if selector.is_a?(String) + # # conts = (content) + # # else + # # conts = [content] + # # end + # # conts[0..0].each do |content| + # # extract_node hdoc, content, data, selector, target + # # end + # # end + # data + # end + # + # # + # # insert the extracted element into the data mapping + # # + # def extract_node hdoc, content, data, selector, target + # classification = classify_node(selector, target) + # result = \ + # case classification + # when :subtree + # target.each do |sub_selector, sub_target| + # extract_node hdoc, content, data, sub_selector, sub_target + # end + # + # when :sub_attribute + # k, v = selector.to_a[0] + # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k)) + # val = subcontent.attributes[v.to_s] if subcontent + # data[target] = val unless val.blank? + # + # when :attribute then + # val = content.attributes[selector.to_s] + # data[target] = val unless val.blank? + # + # when :flatten_list + # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector) + # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html } + # + # when :inner_html + # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector)) + # data[target] = subcontent.inner_html.strip if subcontent + # + # else + # raise "classify_node shouldn't ever return #{classification}" + # end + # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute) + # # puts '' if classification == :subtree + # end + # + # def classify_node selector, target + # case + # when target.is_a?(Hash) then :subtree + # when selector.is_a?(Hash) && (selector.length == 1) then + # k, v = selector.to_a[0] + # case v + # when Symbol then :sub_attribute + # end + # when selector.is_a?(Symbol) then :attribute + # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list + # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html + # else + # raise "Can't classify mapping: " + [selector, target].join(" - ") + # end + # end + # + # # use #mapping to parse file + # def parse link + # begin hdoc = Hpricot(link.contents) + # rescue; warn "can't hpricot #{link.to_s}" ; return false; end + # raw_taggings = extract_tree hdoc, hdoc, self.mapping + # end + # + # # use #mapping to parse file + # def parse_file filename + # begin hdoc = Hpricot(File.open(filename)) + # rescue; warn "can't hpricot #{filename}" ; return false; end + # raw_taggings = extract_tree hdoc, hdoc, self.mapping + # end + end + end end +