page.rb in rubyretriever-1.4.0

- old
+ new

@@ -1,5 +1,6 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
 module Retriever
   #
@@ -38,32 +39,39 @@
       @t = t
       @source = source.encode_utf8_and_replace
       @links = nil
     end
 
-    # recieves page source as string
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
       return false unless @source
       @links = @source.scan(HREF_CONTENTS_RE).map do |match|
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end
 
     def parse_internal
-      links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host }
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end
 
     def parse_internal_visitable
       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end
 
     def parse_files(arr = parse_internal)
       arr.select { |x| @t.file_re =~ x }
+    end
+
+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
     end
 
     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end