lib/retriever/page.rb in rubyretriever-1.0.3 vs lib/retriever/page.rb in rubyretriever-1.1.0
- old
+ new
@@ -1,9 +1,8 @@
module Retriever
-
+ #
class Page
-
HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
HTTP_RE = Regexp.new(/^http/i).freeze
DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
@@ -12,57 +11,57 @@
H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
attr_reader :links, :source, :t
- def initialize(source,t)
+ def initialize(source, t)
@t = t
@source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
@links = nil
end
- #recieves page source as string
- #returns array of unique href links
+ # receives page source as string
+ # returns array of unique href links
def links
return @links if @links
- return false if !@source
- @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ return false unless @source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+ # filter out some malformed URLs that come in;
+ # this is meant to be a loose filter that catches all reasonable HREF attributes.
link = match[0]
Link.new(@t.host, link).path
end.uniq
end
- def parseInternal
- links.select{ |linky| (@t.host_re =~ linky) }
+ def parse_internal
+ links.select { |linky| (@t.host_re =~ linky) }
end
- def parseInternalVisitable
- parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ def parse_internal_visitable
+ parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
end
- def parseFiles
- links.select{ |linky| (@t.file_re =~ linky)}
+ def parse_files
+ links.select { |linky| (@t.file_re =~ linky) }
end
def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
end
def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
end
def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
end
def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
end
- def parseSEO
- return [title,desc,h1,h2]
+ def parse_seo
+ [title, desc, h1, h2]
end
-
end
-
end