lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.4.0 vs lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.5.0
- old
+ new
@@ -1,16 +1,18 @@
module RegexpCrawler
class Crawler
- attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model, :save_method, :headers, :encoding
def initialize(options = {})
@start_page = options[:start_page]
@continue_regexp = options[:continue_regexp]
@capture_regexp = options[:capture_regexp]
@named_captures = options[:named_captures]
@model = options[:model]
@save_method = options[:save_method]
+ @headers = options[:headers]
+ @encoding = options[:encoding]
end
def capture_regexp=(regexp)
@capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
end
@@ -27,35 +29,37 @@
@results
end
private
def parse_page(uri)
- response = Net::HTTP.get_response(uri)
+ response = Net::HTTP.start(uri.host, uri.port) do |http|
+ http.get(uri.request_uri, headers)
+ end
parse_response(response, uri)
end
def parse_response(response, uri)
+ response_body = Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first if encoding
if response.is_a? Net::HTTPSuccess
if continue_regexp
- response.body.scan(continue_regexp).each do |page|
+ response_body.scan(continue_regexp).each do |page|
page = page.first if page.is_a? Array
continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
@pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
end
end
- md = @capture_regexp.match(response.body)
+ md = @capture_regexp.match(response_body)
if md
captures = md.captures if md
result = {}
captures.each_index do |i|
result[named_captures[i].to_sym] = captures[i]
end
- url = "#{uri.scheme}://#{uri.host}#{uri.path}"
if @save_method
- ret = @save_method.call(result, url)
+ ret = @save_method.call(result, uri.to_s)
@stop = true if ret == false
else
- @results << {@model.downcase.to_sym => result, :page => url}
+ @results << {@model.downcase.to_sym => result, :page => uri.to_s}
end
end
elsif response.is_a? Net::HTTPRedirection
parse_page(URI.parse(response['location']))
else