module IMW
  module Recordizer
    # Declarative scraper: subclasses declare rules that map a selector to a
    # property, and +recordize+ applies every rule to a document and returns
    # the collected properties as a Hash.
    #
    # The document (and every node handed to a delegate) only has to respond
    # to +at(selector)+ and +search(selector)+; extracted nodes that respond
    # to +inner_text+ are reduced to that text.
    #
    #   class Article < IMW::Recordizer::HTMLSelectorRecordizer
    #     element  :title
    #     elements 'ul.tags li' => :tags
    #     element  'div.author' => :author do
    #       element :name
    #     end
    #   end
    #
    #   Article.recordize(doc) #=> { :title => ..., :tags => [...], :author => { :name => ... } }
    class HTMLSelectorRecordizer
      # Declares a single-valued rule and defines an accessor for it.
      #
      # See +parse_rule_declaration+ for the accepted argument forms. A block,
      # when given, is evaluated as the body of a nested recordizer class that
      # is used to process the matched node.
      #
      # Returns the property name.
      def self.element(*args, &block)
        selector, name, delegate = parse_rule_declaration(*args, &block)
        rules[name] = [selector, delegate]
        attr_accessor name
        name
      end

      # Declares a multi-valued rule: every node matched by the selector is
      # collected into an Array. Accepts the same arguments as +element+.
      #
      # Returns the stored rule, +[selector, delegate, true]+.
      def self.elements(*args, &block)
        name = element(*args, &block)
        rules[name] << true # third slot marks the rule as plural
      end

      # Builds a fresh instance and recordizes +doc+ with it.
      #
      # Returns the Hash produced by the instance-level +recordize+.
      def self.recordize(doc)
        new.recordize(doc)
      end

      # Rules declared on this class, keyed by property name. Each value is
      # +[selector, delegate]+, with a trailing +true+ for plural rules.
      #
      # Must stay public: instances reach it through +self.class.rules+.
      def self.rules
        @rules ||= {}
      end

      # Copies the parent's rules into each new subclass so that declarations
      # are inherited. The rule arrays themselves are shared, not duplicated.
      def self.inherited(subclass)
        super
        subclass.rules.update(rules)
      end

      # Normalizes the arguments of +element+ / +elements+.
      #
      # Rule declaration forms:
      #
      #   { 'selector' => :property, :with => delegate }
      #     #=> ['selector', :property, delegate]
      #
      #   :title
      #     #=> ['title', :title, nil]
      #
      # When a block is given it becomes the body of a new recordizer class
      # (subclassing the +:with+ delegate if present, otherwise
      # HTMLSelectorRecordizer itself), and that class is the delegate.
      #
      # Returns +[selector, property, delegate]+.
      # Raises ArgumentError when no property name can be determined.
      def self.parse_rule_declaration(*args, &block)
        declaration = args.dup # kept intact for the error message
        # Work on a copy so the caller's options hash is not modified.
        options = Hash === args.last ? args.pop.dup : {}
        name = args.first
        delegate = options.delete(:with)
        # After removing :with, the first remaining pair is selector => property.
        selector, property = name ? [name.to_s, name.to_sym] : options.first
        raise ArgumentError, "invalid rule declaration: #{declaration.inspect}" unless property

        # eval block in context of a new scraper subclass
        delegate = Class.new(delegate || HTMLSelectorRecordizer, &block) if block
        return selector, property, delegate
      end

      # Gives every plural property its own empty Array to collect into.
      def initialize
        self.class.rules.each do |name, (_selector, _delegate, plural)|
          send("#{name}=", []) if plural
        end
      end

      # Applies every declared rule to +doc+ and stores the results on this
      # instance: plural rules append all +doc.search+ matches, singular rules
      # assign the +doc.at+ match (nil when nothing matched).
      #
      # Returns the properties as a Hash (see +to_hash+).
      def recordize(doc)
        self.class.rules.each do |target, (selector, delegate, plural)|
          if plural
            matches = doc.search(selector).map { |node| parse_result(node, delegate) }
            send(target).concat(matches)
          else
            send("#{target}=", parse_result(doc.at(selector), delegate))
          end
        end
        to_hash
      end

      # Returns a Hash of property name (as a Symbol) to current value. Values
      # responding to +to_hash+ are converted, as are the members of Arrays.
      def to_hash
        converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
        self.class.rules.keys.each_with_object({}) do |name, hash|
          value = send(name)
          hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
        end
      end

      protected

      # Turns a matched node into a property value.
      #
      # A delegate that responds to +call+ is called with the node; any other
      # delegate receives +recordize(node)+. Without a delegate the node's
      # +inner_text+ is used when available, otherwise the node itself.
      #
      # Returns nil when +node+ is nil.
      def parse_result(node, delegate)
        return if node.nil?

        if delegate
          delegate.respond_to?(:call) ? delegate.call(node) : delegate.recordize(node)
        elsif node.respond_to?(:inner_text)
          node.inner_text
        else
          node
        end
      end
    end
  end
end