Sha256: 2e40f6aabe2b09dfe772da67dafc7a91d7e20aac22946a9caed33877f2bd723f
Contents?: true
Size: 1.42 KB
Versions: 2
Compression:
Stored size: 1.42 KB
Contents
require 'open-uri'
require 'taiwanese_news_parser/url_cleaner'
require 'memoist'

# Base class for the site-specific news parsers.
#
# It resolves a URL to the parser class that handles it, downloads the page
# and collects the parsed fields in the +article+ hash.
#
# This class calls +domain+, +parse_url_id+ and +names+ on the concrete class
# and +parse_company_name+ on the instance. Only +domain+ is defined here (and
# it raises); the others are presumably supplied by the files loaded from
# parser/*.rb -- confirm against those files.
class TaiwaneseNewsParser::Parser
  extend Memoist

  attr_accessor :url
  attr_reader :article

  # @param url [String] URL to test
  # @return [Boolean] whether +url+ contains this parser's +domain+
  def self.applicable?(url)
    url.include?(domain)
  end

  # Follows redirects for +url+ and builds the first parser from +subclasses+
  # whose +applicable?+ accepts the final URL.
  #
  # @param url [String, URI] address of the article
  # @return [TaiwaneseNewsParser::Parser, nil] parser for the redirected URL,
  #   or nil when no listed parser is applicable
  def self.applicable_parser(url)
    # URI#open (open-uri) is used instead of Kernel#open: Kernel#open no longer
    # opens URLs on Ruby >= 3.0 and would run a command for input starting
    # with "|". The block form closes the response handle when done.
    redirected_url = URI(url).open { |io| io.base_uri.to_s }

    matching = subclasses.find { |candidate| candidate.applicable?(redirected_url) }
    matching.new(redirected_url) if matching
  end

  # @param url [String] article URL; stored in +article+ together with the
  #   class-level +domain+ and the result of +parse_url_id(url)+
  def initialize(url)
    @url = url
    @article = {}
    @article[:url] = url
    @article[:web_domain] = self.class.domain
    @article[:url_id] = self.class.parse_url_id(url)
  end

  # Downloads +url+, transcodes the body from Big5 to UTF-8 (dropping invalid
  # and undefined byte sequences) and parses it with Nokogiri. The transcoded
  # text is kept in @raw. The result is memoized.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    # Same reasoning as in applicable_parser: go through URI#open, and let the
    # block form close the handle after reading.
    body = URI(url).open(&:read)
    @raw = body.encode('utf-8', 'big5', :invalid => :replace, :undef => :replace, :replace => '')
    @doc = ::Nokogiri::HTML(@raw, url)
  end
  memoize :doc

  # Strips surrounding whitespace from the text fields that are present, runs
  # +clean_url+ when the parser responds to it, and records the +reproduced?+
  # flag in +article+.
  def clean_up
    [:content, :title, :reporter_name, :company_name].each do |attr|
      @article[attr].strip! if @article[attr]
    end

    clean_url if respond_to?(:clean_url)
    @article[:reproduced] = reproduced?
  end

  # @return [Boolean] true when +parse_company_name+ is not among the names
  #   returned by the parser class's +names+
  def reproduced?
    !self.class.names.include?(parse_company_name)
  end

  # Loaded here, after the methods above are defined, exactly where the
  # original file loaded them.
  Dir[File.dirname(__FILE__) + '/parser/*.rb'].each { |file| require file }

  # @return [Array<Class>] parser classes tried, in order, by +applicable_parser+
  def self.subclasses
    [
      Udn,
      LibertyTimes,
      LibertyTimesBig5,
      LibertyTimesNews,
      ChinaTimes,
      ChinaTimesMoney,
      Cna,
      AppleDaily,
      Ettoday,
      Tvbs,
      Cts,
      NowNews
    ]
  end

  # Must be overridden by each concrete parser.
  #
  # @raise [NotImplementedError] always, in this base class
  def self.domain
    raise NotImplementedError
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
taiwanese_news_parser-0.0.3 | lib/taiwanese_news_parser/parser.rb |
taiwanese_news_parser-0.0.2 | lib/taiwanese_news_parser/parser.rb |