lib/retriever/page.rb in rubyretriever-1.1.0 vs lib/retriever/page.rb in rubyretriever-1.2.0

- old
+ new

@@ -1,23 +1,42 @@ +require 'addressable/uri' + module Retriever # class Page - HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze - NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze - HTTP_RE = Regexp.new(/^http/i).freeze - DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze + HTTP_RE = Regexp.new(/^http/i).freeze + H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze + H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze + TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze + DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'] + [^>]*content=[\"] + ( + [^\"]* + ) + [\"] + [^>] + *> + /ix).freeze + HREF_CONTENTS_RE = Regexp.new(/\shref= + ['|"] + ( + [^\s] + [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+ + ) + ['|"] + [\s|\W] + /ix).freeze + NONPAGE_EXT_RE = Regexp.new(/\. + (?:css|js|png|gif|jpg|mp4| + wmv|flv|mp3|wav|doc|txt|ico|xml) + /ix).freeze - TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze - DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze - H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze - H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze - attr_reader :links, :source, :t def initialize(source, t) @t = t - @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace) + @source = source.encode('UTF-8', invalid: :replace, undef: :replace) @links = nil end # recieves page source as string # returns array of unique href links @@ -26,23 +45,23 @@ return false unless @source @links = @source.scan(HREF_CONTENTS_RE).map do |match| # filter some malformed URLS that come in # meant to be a loose filter to catch all reasonable HREF attributes. link = match[0] - Link.new(@t.host, link).path - end.uniq + Link.new(@t.scheme, @t.host, link).path + end.compact.uniq end def parse_internal - links.select { |linky| (@t.host_re =~ linky) } + links.select { |x| @t.host == Addressable::URI.parse(x).host } end def parse_internal_visitable - parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) } + parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) } end - def parse_files - links.select { |linky| (@t.file_re =~ linky) } + def parse_files(arr) + arr.select { |x| @t.file_re =~ x } end def title TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : '' end