lib/retriever/page.rb in rubyretriever-1.2.4 vs lib/retriever/page.rb in rubyretriever-1.3.0

- old
+ new

@@ -1,7 +1,8 @@
 require 'addressable/uri'
-
+#
+using SourceString
 module Retriever
   #
   class Page
     HTTP_RE = Regexp.new(/^http/i).freeze
     H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
@@ -28,15 +29,16 @@
     NONPAGE_EXT_RE = Regexp.new(/\.
       (?:css|js|png|gif|jpg|mp4|
       wmv|flv|mp3|wav|doc|txt|ico|xml)
       /ix).freeze

-    attr_reader :links, :source, :t
+    attr_reader :links, :source, :t, :url

-    def initialize(source, t)
+    def initialize(url, source, t)
+      @url = url
       @t = t
-      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
+      @source = source.encode_utf8_and_replace
       @links = nil
     end

     # recieves page source as string
     # returns array of unique href links
@@ -57,27 +59,27 @@
     def parse_internal_visitable
       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

-    def parse_files(arr)
+    def parse_files(arr = parse_internal)
       arr.select { |x| @t.file_re =~ x }
     end

     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end

     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
     end

     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
+      H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
     end

     def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
+      H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
     end

     def parse_seo
       [title, desc, h1, h2]
     end