lib/retriever/page.rb in rubyretriever-1.2.4 vs lib/retriever/page.rb in rubyretriever-1.3.0
- old
+ new
@@ -1,7 +1,8 @@
require 'addressable/uri'
-
+#
+using SourceString
module Retriever
#
class Page
# Matches text with "http" at the start of a line, case-insensitively.
HTTP_RE = /^http/i.freeze
# Captures whatever sits between an attribute-less <h1> and the last </h1>
# on the same line (greedy, case-insensitive).
H1_RE = /<h1>(.*)<\/h1>/i.freeze
@@ -28,15 +29,16 @@
# Matches a dot followed by one of the listed non-page file extensions
# (stylesheets, scripts, images, media, documents, ...), case-insensitively.
# Extended mode (x) lets the alternation wrap across lines.
NONPAGE_EXT_RE = /\.
(?:css|js|png|gif|jpg|mp4|
wmv|flv|mp3|wav|doc|txt|ico|xml)
/ix.freeze
# Read-only accessors. The removed (-) line exposed :links, :source and :t;
# the added (+) line exposes :url as well.
- attr_reader :links, :source, :t
+ attr_reader :links, :source, :t, :url
# Stores the constructor arguments and resets @links to nil.
# Removed (-) version: takes (source, t) and transcodes source with
# String#encode('UTF-8', invalid: :replace, undef: :replace).
# Added (+) version: takes (url, source, t), keeps url in @url, and calls
# source.encode_utf8_and_replace instead.
# NOTE(review): encode_utf8_and_replace is not defined in this view —
# presumably provided by the SourceString refinement activated via `using`;
# confirm it performs the same UTF-8 transcoding as the removed line.
# NOTE(review): the type of t is not shown here; parse_files reads
# @t.file_re from it.
- def initialize(source, t)
+ def initialize(url, source, t)
+ @url = url
@t = t
- @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
+ @source = source.encode_utf8_and_replace
# links start out unset
@links = nil
end
# receives page source as string
# returns array of unique href links
@@ -57,27 +59,27 @@
# Returns the entries of parse_internal that do not match NONPAGE_EXT_RE,
# i.e. those without one of the listed non-page file extensions.
def parse_internal_visitable
  parse_internal.reject { |link| NONPAGE_EXT_RE =~ link }
end
# Returns the entries of arr that match @t.file_re.
# The removed (-) signature requires arr; the added (+) signature defaults
# it to parse_internal, so the method can be called with no argument.
- def parse_files(arr)
+ def parse_files(arr = parse_internal)
arr.select { |x| @t.file_re =~ x }
end
# Returns capture group 1 of TITLE_RE matched against @source, or '' when
# the pattern does not match. The added (+) line additionally calls
# decode_html on the captured text.
# NOTE(review): decode_html is not defined in this view — presumably a String
# refinement from SourceString; confirm.
def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
end
# Returns capture group 1 of DESC_RE matched against @source, or '' when
# the pattern does not match. The added (+) line additionally calls
# decode_html on the captured text.
# NOTE(review): decode_html is not defined in this view — presumably a String
# refinement from SourceString; confirm.
def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
end
# Returns capture group 1 of H1_RE matched against @source, or '' when
# the pattern does not match. The added (+) line additionally calls
# decode_html on the captured text.
# NOTE(review): decode_html is not defined in this view — presumably a String
# refinement from SourceString; confirm.
def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
+ H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
end
# Returns capture group 1 of H2_RE matched against @source, or '' when
# the pattern does not match. The added (+) line additionally calls
# decode_html on the captured text.
# NOTE(review): decode_html is not defined in this view — presumably a String
# refinement from SourceString; confirm.
def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
+ H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
end
# Returns a four-element array with the results of title, desc, h1 and h2,
# evaluated in that order.
def parse_seo
  %i[title desc h1 h2].map { |field| send(field) }
end