page.rb in rubyretriever-1.4.2

- old
+ new

@@ -3,10 +3,11 @@
 #
 using SourceString
 module Retriever
   #
   class Page
+    HASH_RE   = Regexp.new(/^#/i).freeze
     HTTP_RE   = Regexp.new(/^http/i).freeze
     H1_RE     = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
     H2_RE     = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
     TITLE_RE  = Regexp.new(/<title>(.*)<\/title>/i).freeze
     DESC_RE   = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
@@ -48,9 +49,10 @@
       return false unless @source
       @links = @source.scan(HREF_CONTENTS_RE).map do |match|
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
+        next if HASH_RE =~ link
         Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end
 
     def parse_internal