lib/scrapify/base.rb in scrapify-0.0.4 vs lib/scrapify/base.rb in scrapify-0.0.5
- old
+ new
@@ -1,7 +1,8 @@
module Scrapify
module Base
+ # Response header names that http_cache_header keeps; frozen so the shared list cannot be mutated.
+ HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag).freeze
# Module hook invoked when Base is included into klass: extends klass with
# ClassMethods, declares url/doc/attribute_names through cattr_accessor, and
# adds an `attributes` reader.
def self.included(klass)
klass.extend ClassMethods
klass.cattr_accessor :url, :doc, :attribute_names
# instance_eval runs the block with klass as self, so attr_reader is called on klass itself.
klass.instance_eval { attr_reader :attributes }
end
@@ -26,29 +27,59 @@
+ # Declares a scraped attribute: records its name and registers a
+ # "<name>_values" block through meta_define.
+ #
+ # options:
+ #   :css / :xpath - selector sent to the parsed document (:xpath is used when present)
+ #   :regex        - optional pattern; when given, each element's content is scanned with it
+ #   :array        - with :regex, keep every match of an element instead of only the first
def attribute(name, options={})
add_attribute(name)
parser = options[:xpath] ? :xpath : :css
selector = options[parser]
+ matcher = /#{options[:regex]}/ if options[:regex]
+ to_array = options[:array]
meta_define "#{name}_values" do
self.doc ||= parse_html
- self.doc.send(parser, selector).map &:content
+ self.doc.send(parser, selector).map do |element|
+ content = element.content
+ if matcher
+ # scan yields arrays when the pattern has capture groups and plain strings
+ # otherwise; Array() lets both shapes answer #first with the full capture/match.
+ match_data = content.scan(matcher).map { |found| Array(found).first }
+ to_array ? match_data : match_data.first
+ else
+ content.strip
+ end
+ end
end
end
# Calls define_find_by_id and define_count for the given attribute.
# NOTE(review): presumably this marks the attribute as the record key used by
# the generated finder/count methods — confirm against those helpers.
def key(attribute)
define_find_by_id attribute
define_count attribute
end
+ # Returns the entries of http_header whose names appear in
+ # HTTP_CACHE_HEADERS_TO_RETURN, compared case-insensitively.
+ def http_cache_header
+ # Upcase the whitelist once rather than once per header pair.
+ wanted = HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase)
+ http_header.select { |(name, _value)| wanted.include?(name.upcase) }
+ end
+
private
# Appends name to attribute_names, initialising the list to an empty array on first use.
def add_attribute(name)
self.attribute_names ||= []
self.attribute_names << name
end
# Builds the Nokogiri HTML document from html_content
# (the removed line passed open(url) to Nokogiri instead).
def parse_html
- Nokogiri::HTML(open(url))
+ Nokogiri::HTML(html_content)
+ end
+
+ # Body of the memoized HTTP response, used as the HTML input for parsing.
+ def html_content
+ response = http_response
+ response.body
+ end
+
+ # Fetches url with Net::HTTP.get_response on first call and reuses the
+ # stored response on later calls.
+ def http_response
+ return @http_response if @http_response
+ @http_response = Net::HTTP.get_response(URI(url))
+ end
+
+ # Flattens the response's header hash to a single value per name by
+ # taking the first entry of each value list.
+ def http_header
+ pairs = http_response.header.to_hash.map { |name, values| [name, values.first] }
+ Hash[pairs]
end
def define_finders
meta_define :all do
count.times.map do |index|