Sha256: 9df394dbd19e839db965927fc1ba32ce0ab7530139408ce5e2e34adaff0cbf40

Contents?: true

Size: 1.22 KB

Versions: 1

Compression:

Stored size: 1.22 KB

Contents

require 'nokogiri'
require 'open-uri'
module Wiki::Yggdrasil
  
  # A single Wikipedia article identified by its URI.
  #
  # The summary (the <p> elements that precede the table of contents) and the
  # article links found inside it are fetched lazily and memoized.
  class Article
    # Origin prepended to relative hrefs found inside an article.
    WIKIPEDIA_ORIGIN = 'https://en.wikipedia.org'.freeze

    # Matches any string that contains "wikipedia.org/wiki/<something>".
    ARTICLE_PATTERN = /.*wikipedia\.org\/wiki\/.+/

    # Markup that opens the table of contents; the summary is what precedes it.
    TOC_MARKER = '<div id="toc" class="toc">'.freeze

    # Matches hrefs that already carry a scheme ("https:", "mailto:", ...).
    SCHEME_PREFIX = /\A[a-z][a-z0-9+.\-]*:/i

    attr_reader :uri

    # @param uri [String] address of the article
    # @raise [ArgumentError] when +uri+ does not look like a Wikipedia article
    def initialize(uri:)
      unless self.class.is_valid_wiki_article?(uri: uri)
        raise ArgumentError, "not a Wikipedia article URI: #{uri.inspect}"
      end

      @uri         = uri
      @summary     = nil
      @child_links = nil
    end

    # Fetches the article (once) and returns the <p> elements of everything
    # that precedes the table of contents.
    #
    # @return [Nokogiri::XML::NodeSet]
    def summary
      @summary ||= begin
        # URI.parse(...).open only exists on URI classes that open-uri can
        # fetch, so - unlike Kernel#open - it never treats the string as a
        # local path or a "| command" pipe, and it still works on Ruby >= 3.0
        # where Kernel#open no longer accepts URLs. The block form closes the
        # underlying IO once the document has been parsed.
        page_html = URI.parse(uri).open { |io| Nokogiri::HTML(io).to_s }
        lead_html = page_html.split(TOC_MARKER)[0]
        Nokogiri::HTML(lead_html).css('p') ## TODO: Cleanup
      end
    end

    # URIs linked from the summary that themselves look like Wikipedia
    # articles. Computed on first use and memoized.
    #
    # @return [Array<String>]
    def child_links
      @child_links ||= format_links.select do |link|
        self.class.is_valid_wiki_article?(uri: link)
      end
    end

    # @return [Nokogiri::XML::NodeSet] every anchor found in the summary
    def scrape_all_summary_links
      summary.css('p a')
    end

    # Turns anchors into absolute URI strings.
    #
    # @param anchors [Enumerable] objects answering +['href']+; nil entries and
    #   entries whose href is nil are skipped (nil href attributes are often
    #   self refs, but possibly not always)
    # @return [Array<String>]
    def format_links(anchors: scrape_all_summary_links)
      anchors.each_with_object([]) do |anchor, uris|
        href = anchor.nil? ? nil : anchor['href']
        uris << absolute_uri(href) unless href.nil?
      end
    end

    # Is this URI a wikipedia article?
    #
    # @param uri [String, nil]
    # @return [Boolean]
    def self.is_valid_wiki_article?(uri:)
      uri =~ ARTICLE_PATTERN ? true : false
    end

    private

    # Resolves one href against the Wikipedia origin. Hrefs that already name
    # a host are kept as they are (protocol-relative ones get "https:") rather
    # than having the origin glued in front of them; everything else is
    # appended to the origin. Builds a new String, never mutating a literal.
    def absolute_uri(href)
      if SCHEME_PREFIX =~ href
        href
      elsif href.start_with?('//')
        "https:#{href}"
      else
        "#{WIKIPEDIA_ORIGIN}#{href}"
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wiki-yggdrasil-0.1.0 lib/wiki/article.rb