lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.2.0 vs lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.3.0

- old
+ new

@@ -25,33 +25,36 @@
       results << result if result
     end
     results
   end

-  def parse_page(uri)
-    response = Net::HTTP.get_response(uri)
-    parse_response(response, uri)
-  end
+  private
+    def parse_page(uri)
+      response = Net::HTTP.get_response(uri)
+      parse_response(response, uri)
+    end

-  def parse_response(response, uri)
-    if response.is_a? Net::HTTPSuccess
-      response.body.scan(continue_regexp).each do |page|
-        page = page.first if page.is_a? Array
-        continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
-        @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
-      end if continue_regexp
-      md = @capture_regexp.match(response.body)
-      if md
-        model_result = model.new
-        captures = md.captures if md
-        captures.each_index do |i|
-          model_result.send("#{named_captures[i]}=", captures[i])
-        end
-        {:model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
-      end
-    elsif response.is_a? Net::HTTPRedirection
-      parse_page(URI.parse(response['location']))
-    else
-    end
-  end
+    def parse_response(response, uri)
+      if response.is_a? Net::HTTPSuccess
+        if continue_regexp
+          response.body.scan(continue_regexp).each do |page|
+            page = page.first if page.is_a? Array
+            continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+            @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
+          end
+        end
+        md = @capture_regexp.match(response.body)
+        if md
+          captures = md.captures if md
+          result = {}
+          captures.each_index do |i|
+            result[named_captures[i].to_sym] = captures[i]
+          end
+          {@model.downcase.to_sym => result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}"}
+        end
+      elsif response.is_a? Net::HTTPRedirection
+        parse_page(URI.parse(response['location']))
+      else
+      end
+    end
 end