lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.1.0 vs lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.2.0
- old
+ new
@@ -14,13 +14,15 @@
@capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
end
# Crawl loop, shown here as a diff hunk: "-" lines are 0.1.0, "+" lines are 0.2.0.
# Drains the @pages queue (seeded from @start_page), calls parse_page on each
# entry, and returns an array of the truthy results.
def start
results = []
- @pages = [@start_page]
# 0.2.0 adds @captured_pages, a list of every URI taken off the queue, and
# seeds the queue with a parsed URI rather than the raw @start_page value.
+ @captured_pages = []
+ @pages = [URI.parse(@start_page)]
while !@pages.empty?
# 0.1.0 called URI.parse on each entry as it was dequeued; 0.2.0 dequeues the
# entry as-is and records it in @captured_pages before parsing the page.
- uri = URI.parse(@pages.shift)
+ uri = @pages.shift
+ @captured_pages << uri
# NOTE(review): the queue presumably grows while parse_page runs
# (parse_response appends to @pages) — parse_page is not visible here; confirm.
result = parse_page(uri)
# Falsy results (nil/false) are not collected.
results << result if result
end
results
end
@@ -31,11 +33,12 @@
end
def parse_response(response, uri)
if response.is_a? Net::HTTPSuccess
response.body.scan(continue_regexp).each do |page|
- url = page.start_with?(uri.scheme) ? page : "#{uri.scheme}://#{uri.host}/#{page}"
- @pages << url
+ page = page.first if page.is_a? Array
+ continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page)
+ @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
end if continue_regexp
md = @capture_regexp.match(response.body)
if md
model_result = model.new
captures = md.captures if md