lib/retriever/page.rb in rubyretriever-1.4.1 vs lib/retriever/page.rb in rubyretriever-1.4.2

- old
+ new

@@ -3,10 +3,11 @@ # using SourceString module Retriever # class Page + HASH_RE = Regexp.new(/^#/i).freeze HTTP_RE = Regexp.new(/^http/i).freeze H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'] @@ -48,9 +49,10 @@ return false unless @source @links = @source.scan(HREF_CONTENTS_RE).map do |match| # filter some malformed URLS that come in # meant to be a loose filter to catch all reasonable HREF attributes. link = match[0] + next if HASH_RE =~ link Link.new(@t.scheme, @t.host, link, @url).path end.compact.uniq end def parse_internal