lib/retriever/page.rb in rubyretriever-1.4.1 vs lib/retriever/page.rb in rubyretriever-1.4.2
- old
+ new
@@ -3,10 +3,11 @@
#
using SourceString
module Retriever
#
class Page
+ HASH_RE = Regexp.new(/^#/i).freeze
HTTP_RE = Regexp.new(/^http/i).freeze
H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
@@ -48,9 +49,10 @@
return false unless @source
@links = @source.scan(HREF_CONTENTS_RE).map do |match|
# filter some malformed URLS that come in
# meant to be a loose filter to catch all reasonable HREF attributes.
link = match[0]
+ next if HASH_RE =~ link
Link.new(@t.scheme, @t.host, link, @url).path
end.compact.uniq
end
def parse_internal