lib/scrapify/base.rb in scrapify-0.0.4 vs lib/scrapify/base.rb in scrapify-0.0.5

- old
+ new

@@ -1,7 +1,8 @@
 module Scrapify
   module Base
+    HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
     def self.included(klass)
       klass.extend ClassMethods
       klass.cattr_accessor :url, :doc, :attribute_names
       klass.instance_eval { attr_reader :attributes }
     end
@@ -26,29 +27,59 @@
       def attribute(name, options={})
         add_attribute(name)
         parser = options[:xpath] ? :xpath : :css
         selector = options[parser]
+        matcher = /#{options[:regex]}/ if options[:regex]
+        to_array = options[:array]
         meta_define "#{name}_values" do
           self.doc ||= parse_html
-          self.doc.send(parser, selector).map &:content
+          self.doc.send(parser, selector).map do |element|
+            content = element.content
+            if matcher
+              match_data = content.scan(matcher).map &:first
+              options[:array] ? match_data : match_data.first
+            else
+              content.strip
+            end
+          end
         end
       end

       def key(attribute)
         define_find_by_id attribute
         define_count attribute
       end

+      def http_cache_header
+        http_header.select do |(k, v)|
+          HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
+        end
+      end
+
       private

       def add_attribute(name)
         self.attribute_names ||= []
         self.attribute_names << name
       end

       def parse_html
-        Nokogiri::HTML(open(url))
+        Nokogiri::HTML(html_content)
+      end
+
+      def html_content
+        http_response.body
+      end
+
+      def http_response
+        @http_response ||= Net::HTTP.get_response URI(url)
+      end
+
+      def http_header
+        http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
+          hash[k] = v.first
+        end
       end

       def define_finders
         meta_define :all do
           count.times.map do |index|