lib/retriever/page.rb in rubyretriever-1.1.0 vs lib/retriever/page.rb in rubyretriever-1.2.0
- old
+ new
@@ -1,23 +1,42 @@
+require 'addressable/uri'
+
module Retriever
#
class Page
- HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
- NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
- HTTP_RE = Regexp.new(/^http/i).freeze
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+ H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+ TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+ DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+ [^>]*content=[\"]
+ (
+ [^\"]*
+ )
+ [\"]
+ [^>]
+ *>
+ /ix).freeze
+ HREF_CONTENTS_RE = Regexp.new(/\shref=
+ ['|"]
+ (
+ [^\s]
+ [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+ )
+ ['|"]
+ [\s|\W]
+ /ix).freeze
+ NONPAGE_EXT_RE = Regexp.new(/\.
+ (?:css|js|png|gif|jpg|mp4|
+ wmv|flv|mp3|wav|doc|txt|ico|xml)
+ /ix).freeze
- TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
- DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
- H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
- H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
-
attr_reader :links, :source, :t
def initialize(source, t)
@t = t
- @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+ @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
@links = nil
end
# receives page source as string
# returns array of unique href links
@@ -26,23 +45,23 @@
return false unless @source
@links = @source.scan(HREF_CONTENTS_RE).map do |match|
# filter some malformed URLS that come in
# meant to be a loose filter to catch all reasonable HREF attributes.
link = match[0]
- Link.new(@t.host, link).path
- end.uniq
+ Link.new(@t.scheme, @t.host, link).path
+ end.compact.uniq
end
def parse_internal
- links.select { |linky| (@t.host_re =~ linky) }
+ links.select { |x| @t.host == Addressable::URI.parse(x).host }
end
def parse_internal_visitable
- parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
+ parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
end
- def parse_files
- links.select { |linky| (@t.file_re =~ linky) }
+ def parse_files(arr)
+ arr.select { |x| @t.file_re =~ x }
end
def title
TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
end