lib/retriever/page.rb in rubyretriever-1.3.0 vs lib/retriever/page.rb in rubyretriever-1.4.0
- old
+ new
@@ -1,5 +1,6 @@
+require 'nokogiri'
require 'addressable/uri'
#
using SourceString
module Retriever
#
@@ -38,32 +39,39 @@
@t = t
@source = source.encode_utf8_and_replace
@links = nil
end
- # recieves page source as string
+ # receives page source as string
# returns array of unique href links
def links
# Memoised: a previously computed list is returned as-is.
return @links if @links
# NOTE: with no @source this returns false (not an empty array), and the
# result is not cached in @links.
return false unless @source
@links = @source.scan(HREF_CONTENTS_RE).map do |match|
# filter some malformed URLS that come in
# meant to be a loose filter to catch all reasonable HREF attributes.
# Each scan result is indexed with [0] to obtain the raw link text
# (presumably the first capture group of HREF_CONTENTS_RE -- TODO confirm).
link = match[0]
# The raw link is handed to Link together with the scheme and host of @t;
# the added (+) line also passes @url as a fourth argument.
- Link.new(@t.scheme, @t.host, link).path
+ Link.new(@t.scheme, @t.host, link, @url).path
# compact drops nil results of Link#path; uniq removes duplicates.
end.compact.uniq
end
# Returns the entries of `links` whose host equals @t.host.
# Each link is run through Addressable::URI.encode and then
# Addressable::URI.parse before its host is compared.
# NOTE(review): `links` returns false when @source is falsy, in which case
# the `select` call below would raise NoMethodError -- confirm callers never
# reach this method without a source.
def parse_internal
- links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host }
+ links.select do |x|
+ @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+ end
end
# Returns the entries of parse_internal that NONPAGE_EXT_RE does not match.
def parse_internal_visitable
  parse_internal.reject { |href| NONPAGE_EXT_RE =~ href }
end
# Returns the entries of arr that match @t.file_re.
# arr defaults to the result of parse_internal.
def parse_files(arr = parse_internal)
arr.select { |x| @t.file_re =~ x }
+ end
+
# Parses @source with Nokogiri::HTML and returns the text of the nodes
# matching the given CSS selector (the result of calling `text` on the
# node set returned by `css`).
# The document is parsed again on every call; nothing is cached here.
+ def parse_by_css(selector)
+ nokogiri_doc = Nokogiri::HTML(@source)
+ nokogiri_doc.css(selector).text
end
# Returns the page title taken from @source.
#
# The result is the first capture group of TITLE_RE passed through
# decode_html, or an empty string when TITLE_RE does not match (including
# when @source is nil).
def title
  # Match once and reuse the MatchData rather than scanning @source twice.
  title_match = TITLE_RE.match(@source)
  title_match ? title_match[1].decode_html : ''
end