page.rb in rubyretriever-1.3.0

- old
+ new

@@ -1,7 +1,8 @@
 require 'addressable/uri'
-
+#
+using SourceString
 module Retriever
   #
   class Page
     HTTP_RE   = Regexp.new(/^http/i).freeze
     H1_RE     = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
@@ -28,15 +29,16 @@
     # Matches a literal dot immediately followed by one of the listed
     # non-page file extensions. Flags: i (case-insensitive) and x (extended,
     # so the line breaks and indentation inside the literal are ignored).
     # NOTE(review): the pattern has no end anchor, so it also matches in the
     # middle of a string, e.g. ".doc" in "www.docs.com" or ".js" in ".json" -
     # confirm whether that is intended.
     NONPAGE_EXT_RE = Regexp.new(/\.
                                 (?:css|js|png|gif|jpg|mp4|
                                 wmv|flv|mp3|wav|doc|txt|ico|xml)
                                 /ix).freeze
 
-    attr_reader :links, :source, :t
+    attr_reader :links, :source, :t, :url
 
-    def initialize(source, t)
+    def initialize(url, source, t)
       # Builds a page from its raw source. The old signature was (source, t);
       # the new one adds a leading url argument, which is stored in @url.
+      @url = url
       @t = t
       # The source is re-encoded before being stored. The old line calls
       # String#encode targeting UTF-8 with invalid: :replace and
       # undef: :replace; the new line delegates to encode_utf8_and_replace,
       # which is not defined in this view - presumably supplied by the
       # SourceString refinement; confirm.
-      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
+      @source = source.encode_utf8_and_replace
       # @links starts out as nil.
       @links = nil
     end
 
     # receives page source as string
     # returns array of unique href links
@@ -57,27 +59,27 @@
 
     # Returns the entries of parse_internal that do NOT match NONPAGE_EXT_RE,
     # i.e. the internal links that are kept as visitable.
     def parse_internal_visitable
       parse_internal.reject { |link| NONPAGE_EXT_RE =~ link }
     end
 
-    def parse_files(arr)
+    def parse_files(arr = parse_internal)
       # Keeps the entries of arr that match @t.file_re. The new signature
       # makes arr optional, defaulting to the result of parse_internal.
       arr.select { |x| @t.file_re =~ x }
     end
 
     # Returns the first capture group of TITLE_RE matched against @source,
     # or '' when the pattern does not match. The new line additionally calls
     # decode_html on the captured text (decode_html is not defined in this
     # view - presumably a SourceString refinement method; confirm).
     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end
 
     # Returns the first capture group of DESC_RE matched against @source,
     # or '' when the pattern does not match. The new line additionally calls
     # decode_html on the captured text (decode_html is not defined in this
     # view - presumably a SourceString refinement method; confirm).
     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html  : ''
     end
 
     # Returns the first capture group of H1_RE matched against @source,
     # or '' when the pattern does not match. The new line additionally calls
     # decode_html on the captured text (decode_html is not defined in this
     # view - presumably a SourceString refinement method; confirm).
     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
+      H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html  : ''
     end
 
     # Returns the first capture group of H2_RE matched against @source,
     # or '' when the pattern does not match. The new line additionally calls
     # decode_html on the captured text (decode_html is not defined in this
     # view - presumably a SourceString refinement method; confirm).
     def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
+      H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html  : ''
     end
 
     # Gathers the page's SEO fields into one array, in the fixed order
     # title, desc, h1, h2.
     def parse_seo
       %i[title desc h1 h2].map { |field| send(field) }
     end