page.rb in rubyretriever-1.2.0

- old
+ new

@@ -1,23 +1,42 @@
+require 'addressable/uri'
+
 module Retriever
   #
   class Page
-    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
-    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
-    HTTP_RE = Regexp.new(/^http/i).freeze
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
     # Matches "http" at the start of a line, ignoring case.
+    HTTP_RE   = Regexp.new(/^http/i).freeze
     # Capture the text between an attribute-less <h1>/<h2>/<title> tag and its
     # closing tag; `.` does not cross newlines, so both tags must share a line.
+    H1_RE     = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE     = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+    TITLE_RE  = Regexp.new(/<title>(.*)<\/title>/i).freeze
     # Captures the double-quoted content="..." value of a <meta> tag whose
     # name attribute is "description" (name must come before content).
     # Written with the x flag, so whitespace and newlines in the pattern are
     # ignored. The [\"|\'] class also accepts a literal "|".
+    DESC_RE   = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                          [^>]*content=[\"]
+                          (
+                            [^\"]*
+                          )
+                          [\"]
+                          [^>]
+                          *>
+                          /ix).freeze
     # Captures the value of an href attribute: one non-whitespace character
     # followed by one or more characters from the listed URL character set.
     # The ['|"] and [\s|\W] classes also accept a literal "|".
+    HREF_CONTENTS_RE = Regexp.new(/\shref=
+                                  ['|"]
+                                  (
+                                    [^\s]
+                                    [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                                  )
+                                  ['|"]
+                                  [\s|\W]
+                                  /ix).freeze
     # Matches a dot followed by one of the listed file extensions. The pattern
     # is unanchored, so it matches anywhere in the string, not only at the end.
+    NONPAGE_EXT_RE = Regexp.new(/\.
+                                (?:css|js|png|gif|jpg|mp4|
+                                wmv|flv|mp3|wav|doc|txt|ico|xml)
+                                /ix).freeze
 
-    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
-    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
-    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
-    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
-
     attr_reader :links, :source, :t
 
     # Builds a page from its raw source text.
     #
     # source - page text; re-encoded to UTF-8 with invalid and undefined
     #          characters replaced rather than raising
     # t      - stored as-is and exposed through the `t` reader; other methods
     #          in this class call host, scheme and file_re on it
     #
     # @links starts out as nil.
     def initialize(source, t)
       @t = t
-      @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
       @links = nil
     end
 
     # receives page source as string
     # returns array of unique href links
@@ -26,23 +45,23 @@
       return false unless @source
       @links = @source.scan(HREF_CONTENTS_RE).map do |match|
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
+        Link.new(@t.scheme, @t.host, link).path
+      end.compact.uniq
     end
 
     # Selects the entries of `links` that belong to the target's host.
     # Old side: keeps links that match @t.host_re.
     # New side: keeps links whose Addressable-parsed host equals @t.host.
     def parse_internal
-      links.select { |linky| (@t.host_re =~ linky) }
+      links.select { |x| @t.host == Addressable::URI.parse(x).host }
     end
 
     # Returns the parse_internal results minus any link that matches
     # NONPAGE_EXT_RE. Old and new sides differ only in the block variable name.
     def parse_internal_visitable
-      parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
+      parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end
 
     # Selects the entries that match @t.file_re.
     # Old side: takes no arguments and filters `links`.
     # New side: filters the array passed in as `arr`, so callers must now
     # supply the collection to filter.
-    def parse_files
-      links.select { |linky| (@t.file_re =~ linky) }
+    def parse_files(arr)
+      arr.select { |x| @t.file_re =~ x }
     end
 
     # Returns the text captured by TITLE_RE from the page source, or an
     # empty string when the source contains no match.
     def title
       found = TITLE_RE.match(@source)
       return '' unless found

       found[1]
     end