Sha256: b3dc4647a9e826016a7eb7de477494e316f84c22ec2287c1e8c400dfd476c39a

Contents?: true

Size: 1.09 KB

Versions: 4

Compression:

Stored size: 1.09 KB

Contents

require 'nokogiri'
require 'open-uri'

module Scraper
  module Modules
    # Helpers for scrapers whose data comes from a fetched web page.
    module Web
      # Opens +args+ (MetaData#doc passes a URL string) and returns the opened
      # IO-like object, or the block's value when a block is given.
      #
      # Prefers URI.open when open-uri provides it: Kernel#open no longer
      # opens URLs on Ruby 3.0+. URI.open itself falls back to Kernel#open for
      # arguments that are not URLs, so non-URL arguments still reach Kernel#open.
      # On older rubies without URI.open the original Kernel.open call is used.
      def self.open( *args, &block )
        if URI.respond_to?( :open )
          URI.open( *args, &block )
        else
          Kernel.open( *args, &block )
        end
      end

      # Mixin that extracts a page title and description from the document
      # found at @uri, using the selectors stored in +config+
      # (:title_selector and :description_selector).
      #
      # The including object must set @uri to an object responding to
      # scheme, host, port, default_port and request_uri (e.g. URI::HTTP).
      module MetaData
        # Declares the class-level +config+ accessor on the including class.
        # NOTE(review): cattr_accessor is not core Ruby; presumably supplied
        # by ActiveSupport -- confirm it is loaded before this is included.
        def self.included( base )
          base.cattr_accessor :config
        end

        # Stripped text of the first node matching config[:title_selector].
        # Returns '' when nothing matches (same contract as #description)
        # instead of failing with NoMethodError on nil. Memoized.
        def title
          @title ||= begin
            element = doc.search( config[:title_selector] ).first
            element ? element.content.strip : ''
          end
        end

        # Stripped text of the first node matching
        # config[:description_selector], or '' when nothing matches. Memoized.
        def description
          return @description if @description

          element = doc.search( config[:description_selector] ).first
          @description =
            if element
              # Turn line breaks (<br>, <br/>, <br />, any case) and UTF-8
              # non-breaking spaces ("\302\240") into plain spaces before the
              # markup is reduced to text, so adjacent words stay separated.
              html = element.inner_html.gsub( /<br\s*\/?>/i, ' ' ).gsub( "\302\240", ' ' )
              dom( html ).content.strip
            else
              ''
            end
        end

        protected
          # Parses an HTML string into a Nokogiri document.
          def dom( html )
            Nokogiri::HTML( html )
          end

          # Rebuilds the URL to fetch from @uri: scheme, host, port (only when
          # it differs from the scheme's default) and request_uri. Userinfo
          # and fragment are intentionally not part of the result.
          def uri
            authority = @uri.host
            if @uri.port && @uri.port != @uri.default_port
              # Without this a page served on e.g. :3000 would be requested
              # from the scheme's default port instead.
              authority += ":#{@uri.port}"
            end

            @uri.scheme + '://' + authority + @uri.request_uri
          end

          # Fetches and parses the page at #uri. Memoized. The fetched IO is
          # closed once its content has been read.
          def doc
            @doc ||= begin
              io = Modules::Web.open( uri )
              begin
                dom( io.read )
              ensure
                io.close if io.respond_to?( :close )
              end
            end
          end
      end
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
cyx-scraper-0.4.0 lib/scraper/modules/web.rb
cyx-scraper-0.4.1 lib/scraper/modules/web.rb
cyx-scraper-0.4.2 lib/scraper/modules/web.rb
cyx-scraper-0.4.3 lib/scraper/modules/web.rb