Sha256: 5b419d343dacbdd35c7ecefbc245a366613f11e54b0b7e7dde024ae41b27ddf4

Contents?: true

Size: 1.95 KB

Versions: 11

Compression:

Stored size: 1.95 KB

Contents

module MetaInspector
  module Parsers
    # Extracts the links of a parsed document and classifies them
    # (HTTP / non-HTTP, internal / external, feed).
    #
    # The parsed document, url, scheme and host are read from the main parser
    # through delegation.
    class LinksParser < Base
      delegate [:parsed, :url, :scheme, :host] => :@main_parser

      # Matches links that start with an http:// or https:// scheme.
      # Anchored with \A (start of string) rather than ^ (start of any line),
      # so a link containing an embedded newline followed by "http://" is not
      # misclassified as an HTTP link.
      HTTP_LINK = %r{\Ahttps?://}i

      # Returns this parser itself, so its link accessors can be chained.
      def links
        self
      end

      # Returns all links found, unprocessed
      # (href values of every <a> tag, passed through +cleanup+, which is
      # not defined in this class -- presumably inherited from Base).
      def raw
        @raw ||= cleanup(parsed.search('//a/@href')).compact.uniq
      end

      # Returns all links found, unrelativized and absolutified
      # against the base url; nil results and duplicates are dropped.
      def all
        @all ||= raw.map { |link| URL.absolutify(URL.unrelativize(link, scheme), base_url) }
                    .compact.uniq
      end

      # Returns all HTTP links found
      def http
        @http ||= all.select { |link| link =~ HTTP_LINK }
      end

      # Returns all non-HTTP links found
      def non_http
        @non_http ||= all.select { |link| link !~ HTTP_LINK }
      end

      # Returns all internal HTTP links found
      # (those whose host equals the document host).
      def internal
        @internal ||= http.select { |link| URL.new(link).host == host }
      end

      # Returns all external HTTP links found
      # (those whose host differs from the document host).
      def external
        @external ||= http.select { |link| URL.new(link).host != host }
      end

      # Returns a hash with the 'internal', 'external' and 'non_http' links.
      def to_hash
        { 'internal' => internal,
          'external' => external,
          'non_http' => non_http }
      end

      # Returns the parsed document meta rss link,
      # falling back to the atom link when no rss link is found.
      # Returns nil when the document declares neither.
      def feed
        @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
      end

      # Returns the base url to absolutify relative links.
      # This can be the one set on a <base> tag,
      # or the url of the document if no <base> tag was found.
      def base_url
        base_href || url
      end

      private

      # Returns the absolutified href of the first
      # <link type="application/FORMAT+xml"> tag that actually carries an
      # href attribute, or nil when there is none.
      #
      # Links without an href are skipped instead of raising NoMethodError
      # on the missing attribute.
      def parsed_feed(format)
        feed = parsed.search("//link[@type='application/#{format}+xml']")
                     .find { |link| link.attributes['href'] }
        feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
      end

      # Returns the value of the href attribute on the <base /> tag, if exists
      #
      # Deliberately best-effort: any error (no <base> tag, no href attribute)
      # yields nil so that base_url falls back to the document url.
      def base_href
        parsed.search('base').first.attributes['href'].value rescue nil
      end
    end
  end
end

Version data entries

11 entries across 11 versions & 1 rubygems

Version Path
metainspector-4.4.2 lib/meta_inspector/parsers/links.rb
metainspector-4.4.1 lib/meta_inspector/parsers/links.rb
metainspector-4.4.0 lib/meta_inspector/parsers/links.rb
metainspector-4.3.3 lib/meta_inspector/parsers/links.rb
metainspector-4.3.2 lib/meta_inspector/parsers/links.rb
metainspector-4.3.1 lib/meta_inspector/parsers/links.rb
metainspector-4.3.0 lib/meta_inspector/parsers/links.rb
metainspector-4.2.1 lib/meta_inspector/parsers/links.rb
metainspector-4.2.0 lib/meta_inspector/parsers/links.rb
metainspector-4.1.0 lib/meta_inspector/parsers/links.rb
metainspector-4.0.0 lib/meta_inspector/parsers/links.rb