lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.2.0 vs lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.3.0
- old
+ new
@@ -25,33 +25,36 @@
results << result if result
end
results
end
- def parse_page(uri)
- response = Net::HTTP.get_response(uri)
- parse_response(response, uri)
- end
+ private
+ def parse_page(uri)
+ response = Net::HTTP.get_response(uri)
+ parse_response(response, uri)
+ end
- def parse_response(response, uri)
- if response.is_a? Net::HTTPSuccess
- response.body.scan(continue_regexp).each do |page|
- page = page.first if page.is_a? Array
- continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
- @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
- end if continue_regexp
- md = @capture_regexp.match(response.body)
- if md
- model_result = model.new
- captures = md.captures if md
- captures.each_index do |i|
- model_result.send("#{named_captures[i]}=", captures[i])
+ def parse_response(response, uri)
+ if response.is_a? Net::HTTPSuccess
+ if continue_regexp
+ response.body.scan(continue_regexp).each do |page|
+ page = page.first if page.is_a? Array
+ continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+ @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
+ end
end
- {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+ md = @capture_regexp.match(response.body)
+ if md
+ captures = md.captures if md
+ result = {}
+ captures.each_index do |i|
+ result[named_captures[i].to_sym] = captures[i]
+ end
+ {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+ end
+ elsif response.is_a? Net::HTTPRedirection
+ parse_page(URI.parse(response['location']))
+ else
end
- elsif response.is_a? Net::HTTPRedirection
- parse_page(URI.parse(response['location']))
- else
end
end
- end
end