lib/imw/parsers/html_parser.rb in imw-0.1.0 vs lib/imw/parsers/html_parser.rb in imw-0.1.1
- old
+ new
@@ -189,194 +189,199 @@
#
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
require 'imw/parsers/html_parser/matchers'
-class IMW::HTMLParser
+module IMW
+ module Parsers
+ class HtmlParser
- include IMW::HTMLParserMatcher
+ include IMW::Parsers::HtmlMatchers
- attr_accessor :parse_tree
+ attr_accessor :parse_tree
- #
- # Parse Tree
- #
- def initialize arg_spec=nil
- spec = arg_spec || self.class.parser_spec
- self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
- end
+ #
+ # Parse Tree
+ #
+ # Build the match tree this parser applies.
+ # arg_spec:: parser spec to compile; when nil or false, the class-level
+ #            +parser_spec+ is compiled instead.
+ def initialize arg_spec=nil
+   chosen_spec = arg_spec ? arg_spec : self.class.parser_spec
+   self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(chosen_spec)
+ end
- #
- # See IMW::HTMLParser for syntax
- #
- #
- def self.parser_spec
- raise "Override this to create your own parser spec"
- end
+ #
+ # See IMW::Parsers::HtmlParser for syntax
+ #
+ #
+ # Class-level hook supplying the default spec that #initialize compiles
+ # when it is not handed one. This base implementation always raises
+ # (a RuntimeError, since +raise+ is given only a message), so a class
+ # relying on the default spec must redefine this method.
+ def self.parser_spec
+ raise "Override this to create your own parser spec"
+ end
- #
- # Walk
- #
- def parse doc
- self.parse_tree.match(doc)
- end
+ #
+ # Walk
+ #
+ # Apply the compiled parse tree to +doc+ and return whatever the
+ # tree's +match+ returns.
+ def parse doc
+   tree = parse_tree
+   tree.match(doc)
+ end
- # one("hpricot_path") first match to hpricot_path
- # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
- #
- def self.one selector, matcher
- MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
- end
- # match the +attr+ attribute of the first element given by +selector+
- def self.attr selector, attr, matcher=nil
- MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
- end
- # shorthand for +attr(foo, 'href')+
- def self.href selector, matcher=nil
- self.attr(selector, 'href', matcher)
- end
- # shorthand for +attr(foo, 'src')+
- def self.src selector, matcher=nil
- self.attr(selector, 'src', matcher)
- end
+ # one("hpricot_path") first match to hpricot_path
+ # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
+ #
+ # Build a MatchFirstElement for +selector+.
+ # selector:: hpricot path locating the element
+ # matcher::  optional sub-spec compiled with build_parse_tree; defaults
+ #            to nil so the one-argument form one("hpricot_path") is
+ #            callable, mirroring the optional matcher of +attr+ and +proc+.
+ def self.one selector, matcher=nil
+   MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
+ end
+ # match the +attr+ attribute of the first element given by +selector+
+ # Build a MatchAttribute for the +attr+ attribute under +selector+.
+ # matcher:: optional sub-spec; it is compiled with build_parse_tree and
+ #           the resulting tree is handed to the MatchAttribute.
+ def self.attr selector, attr, matcher=nil
+   sub_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(matcher)
+   MatchAttribute.new(selector, attr, sub_tree)
+ end
+ # shorthand for +attr(foo, 'href')+
+ # selector:: passed through to +attr+ as the element selector
+ # matcher::  optional; passed through to +attr+ unchanged
+ def self.href selector, matcher=nil
+ self.attr(selector, 'href', matcher)
+ end
+ # shorthand for +attr(foo, 'src')+
+ # selector:: passed through to +attr+ as the element selector
+ # matcher::  optional; passed through to +attr+ unchanged
+ def self.src selector, matcher=nil
+ self.attr(selector, 'src', matcher)
+ end
- def self.proc selector, proc, matcher=nil
- MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
- end
+ # Build a MatchProc for +selector+ carrying the callable +proc+ and the
+ # parse tree compiled from the optional +matcher+.
+ # Note: inside this class's singleton methods a bare +proc+ call reaches
+ # this method rather than Kernel#proc.
+ def self.proc selector, proc, matcher=nil
+   sub_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(matcher)
+   MatchProc.new(selector, proc, sub_tree)
+ end
- # strip ","s (!! thus disrespecting locale !!!)
- # and convert to int
- def self.to_num selector, matcher=nil
- proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
- end
- def self.to_json selector, matcher=nil
- proc selector, lambda{|v| v.to_json if v }, matcher
- end
+ # strip ","s (!! thus disrespecting locale !!!)
+ # and convert to int
+ # Wrap +selector+ in a proc matcher whose callable converts the value it
+ # is handed to an Integer: every "," is removed (locale-unaware) before
+ # +to_i+. A nil or false value yields nil.
+ def self.to_num selector, matcher=nil
+   comma_free_int = lambda do |num|
+     num ? num.to_s.delete(',').to_i : nil
+   end
+   proc selector, comma_free_int, matcher
+ end
+ # Wrap +selector+ in a proc matcher whose callable returns +to_json+ of
+ # the value it is handed, or nil when that value is nil or false.
+ def self.to_json selector, matcher=nil
+   serializer = lambda { |v| v ? v.to_json : nil }
+   proc selector, serializer, matcher
+ end
- def self.strip selector, matcher=nil
- proc selector, lambda{|v| v.strip }, matcher
- end
+ # Wrap +selector+ in a proc matcher whose callable strips surrounding
+ # whitespace from the value it is handed. A nil or false value yields nil
+ # instead of raising NoMethodError, matching the guards used by the
+ # +to_num+ and +to_json+ callables.
+ def self.strip selector, matcher=nil
+   proc selector, lambda{|v| v.strip if v }, matcher
+ end
- def self.re_group selector, re
- MatchRegexp.new(selector, re)
- end
- def self.re selector, re
- MatchRegexp.new(selector, re, nil, :capture => 1)
- end
- def self.re_all selector, re, matcher=nil
- MatchRegexpRepeatedly.new(selector, re)
- end
+ # Build a MatchRegexp for +selector+ and the regexp +re+, passing no
+ # further arguments (compare +re+, which also passes :capture => 1).
+ def self.re_group selector, re
+ MatchRegexp.new(selector, re)
+ end
+ # Build a MatchRegexp for +selector+ and the regexp +re+, passing nil as
+ # the third argument and the option :capture => 1.
+ # NOTE(review): presumably this selects the first capture group; confirm
+ # against MatchRegexp.
+ def self.re selector, re
+ MatchRegexp.new(selector, re, nil, :capture => 1)
+ end
+ # Build a MatchRegexpRepeatedly for +selector+ and the regexp +re+.
+ # NOTE(review): +matcher+ is accepted but never used here; confirm
+ # whether it should be forwarded to MatchRegexpRepeatedly.
+ def self.re_all selector, re, matcher=nil
+ MatchRegexpRepeatedly.new(selector, re)
+ end
- # def self.plain_text selector, matcher=nil
- # proc selector, lambda{|el| el.inner_text if el }, matcher
- # end
+ # def self.plain_text selector, matcher=nil
+ # proc selector, lambda{|el| el.inner_text if el }, matcher
+ # end
- # attr_accessor :mapping
- #
- # #
- # # Feed me a hash and I'll semantify HTML
- # #
- # # The hash should magically adhere to the too-complicated,
- # # ever evolving goatrope that works for the below
- # #
- # #
- # def initialize mapping
- # self.mapping = mapping
- # end
- #
- # #
- # # take a document subtree,
- # # and a mapping of hpricot paths to that subtree's data mapping
- # # recursively extract that datamapping
- # #
- # def extract_tree hdoc, content, sub_mapping
- # data = { }
- # sub_mapping.each do |selector, target|
- # data[selector] = []
- # sub_contents = content/selector
- # sub_contents.each do |sub_content|
- # sub_data = {}
- # extract_node hdoc, sub_content, sub_data, selector, target
- # data[selector] << sub_data
- # end
- # end
- # data
- # # end
- # # if selector.is_a?(String)
- # # conts = (content)
- # # else
- # # conts = [content]
- # # end
- # # conts[0..0].each do |content|
- # # extract_node hdoc, content, data, selector, target
- # # end
- # # end
- # data
- # end
- #
- # #
- # # insert the extracted element into the data mapping
- # #
- # def extract_node hdoc, content, data, selector, target
- # classification = classify_node(selector, target)
- # result = \
- # case classification
- # when :subtree
- # target.each do |sub_selector, sub_target|
- # extract_node hdoc, content, data, sub_selector, sub_target
- # end
- #
- # when :sub_attribute
- # k, v = selector.to_a[0]
- # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
- # val = subcontent.attributes[v.to_s] if subcontent
- # data[target] = val unless val.blank?
- #
- # when :attribute then
- # val = content.attributes[selector.to_s]
- # data[target] = val unless val.blank?
- #
- # when :flatten_list
- # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
- # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
- #
- # when :inner_html
- # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
- # data[target] = subcontent.inner_html.strip if subcontent
- #
- # else
- # raise "classify_node shouldn't ever return #{classification}"
- # end
- # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
- # # puts '' if classification == :subtree
- # end
- #
- # def classify_node selector, target
- # case
- # when target.is_a?(Hash) then :subtree
- # when selector.is_a?(Hash) && (selector.length == 1) then
- # k, v = selector.to_a[0]
- # case v
- # when Symbol then :sub_attribute
- # end
- # when selector.is_a?(Symbol) then :attribute
- # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
- # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
- # else
- # raise "Can't classify mapping: " + [selector, target].join(" - ")
- # end
- # end
- #
- # # use #mapping to parse file
- # def parse link
- # begin hdoc = Hpricot(link.contents)
- # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
- # end
- #
- # # use #mapping to parse file
- # def parse_file filename
- # begin hdoc = Hpricot(File.open(filename))
- # rescue; warn "can't hpricot #{filename}" ; return false; end
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
- # end
+ # attr_accessor :mapping
+ #
+ # #
+ # # Feed me a hash and I'll semantify HTML
+ # #
+ # # The hash should magically adhere to the too-complicated,
+ # # ever evolving goatrope that works for the below
+ # #
+ # #
+ # def initialize mapping
+ # self.mapping = mapping
+ # end
+ #
+ # #
+ # # take a document subtree,
+ # # and a mapping of hpricot paths to that subtree's data mapping
+ # # recursively extract that datamapping
+ # #
+ # def extract_tree hdoc, content, sub_mapping
+ # data = { }
+ # sub_mapping.each do |selector, target|
+ # data[selector] = []
+ # sub_contents = content/selector
+ # sub_contents.each do |sub_content|
+ # sub_data = {}
+ # extract_node hdoc, sub_content, sub_data, selector, target
+ # data[selector] << sub_data
+ # end
+ # end
+ # data
+ # # end
+ # # if selector.is_a?(String)
+ # # conts = (content)
+ # # else
+ # # conts = [content]
+ # # end
+ # # conts[0..0].each do |content|
+ # # extract_node hdoc, content, data, selector, target
+ # # end
+ # # end
+ # data
+ # end
+ #
+ # #
+ # # insert the extracted element into the data mapping
+ # #
+ # def extract_node hdoc, content, data, selector, target
+ # classification = classify_node(selector, target)
+ # result = \
+ # case classification
+ # when :subtree
+ # target.each do |sub_selector, sub_target|
+ # extract_node hdoc, content, data, sub_selector, sub_target
+ # end
+ #
+ # when :sub_attribute
+ # k, v = selector.to_a[0]
+ # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
+ # val = subcontent.attributes[v.to_s] if subcontent
+ # data[target] = val unless val.blank?
+ #
+ # when :attribute then
+ # val = content.attributes[selector.to_s]
+ # data[target] = val unless val.blank?
+ #
+ # when :flatten_list
+ # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
+ # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
+ #
+ # when :inner_html
+ # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
+ # data[target] = subcontent.inner_html.strip if subcontent
+ #
+ # else
+ # raise "classify_node shouldn't ever return #{classification}"
+ # end
+ # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
+ # # puts '' if classification == :subtree
+ # end
+ #
+ # def classify_node selector, target
+ # case
+ # when target.is_a?(Hash) then :subtree
+ # when selector.is_a?(Hash) && (selector.length == 1) then
+ # k, v = selector.to_a[0]
+ # case v
+ # when Symbol then :sub_attribute
+ # end
+ # when selector.is_a?(Symbol) then :attribute
+ # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
+ # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
+ # else
+ # raise "Can't classify mapping: " + [selector, target].join(" - ")
+ # end
+ # end
+ #
+ # # use #mapping to parse file
+ # def parse link
+ # begin hdoc = Hpricot(link.contents)
+ # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
+ # end
+ #
+ # # use #mapping to parse file
+ # def parse_file filename
+ # begin hdoc = Hpricot(File.open(filename))
+ # rescue; warn "can't hpricot #{filename}" ; return false; end
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
+ # end
+ end
+ end
end
+