lib/retriever/page.rb in rubyretriever-1.3.0 vs lib/retriever/page.rb in rubyretriever-1.4.0

- old
+ new

@@ -1,5 +1,6 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
 module Retriever
   #
@@ -38,32 +39,39 @@
       @t = t
       @source = source.encode_utf8_and_replace
       @links = nil
     end

-    # recieves page source as string
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
       return false unless @source
       @links = @source.scan(HREF_CONTENTS_RE).map do |match|
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end

     def parse_internal
-      links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host }
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end

     def parse_internal_visitable
       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

     def parse_files(arr = parse_internal)
       arr.select { |x| @t.file_re =~ x }
+    end
+
+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
     end

     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end