Sha256: 269d9a1ccf3a90d1f013731a71d2463ab7806d9be0cb039c88510d895740c93a

Contents?: true

Size: 1.33 KB

Versions: 3

Compression:

Stored size: 1.33 KB

Contents

# Parser for articles hosted on libertytimes.com.tw (自由時報).
#
# This class uses +doc+, +@article+ and +clean_up+, none of which are defined
# here -- presumably they come from TaiwaneseNewsParser::Parser (+doc+ is used
# like a Nokogiri document via +at_css+/+css+); confirm against the parent.
class TaiwaneseNewsParser::Parser::LibertyTimesBig5 < TaiwaneseNewsParser::Parser
  # Byline wrapped in tortoise-shell brackets, split on a slash. The slash may
  # be ASCII "/", full-width "／" or box-drawing "╱".
  # Capture 1 is the text before the slash, capture 2 the text after it.
  BRACKETED_BYLINE = %r{〔(.*?)[/／╱](.*?)〕}.freeze

  # Extracts whatever follows the 記者 ("reporter") prefix.
  REPORTER_PREFIX = %r{記者(.+)}.freeze

  # Unbracketed byline: 記者 followed by a name, terminated by a slash.
  INLINE_BYLINE = %r{記者(.+?)[/／╱]}.freeze

  # Parenthesised credit of the form (文/NAME), with ASCII or full-width
  # parentheses and slash. The closing parenthesis is what anchors the lazy
  # capture; without a terminator the capture would always be empty.
  AUTHOR_CREDIT = %r{[(（]文[/／╱](.*?)[)）]}.freeze

  # @return [String] the host name this parser is responsible for
  def self.domain
    'libertytimes.com.tw'
  end

  # NOTE(review): +%{...}+ builds a String, not an Array, so despite the
  # plural name this returns the single string "自由時報". Left unchanged
  # because callers are not visible from here -- confirm whether +%w{...}+
  # was intended.
  #
  # @return [String]
  def self.names
    %{自由時報}
  end

  # Tells whether +url+ looks like an article URL this parser can handle.
  #
  # @param url [String]
  # @return [MatchData, nil] truthy when the URL matches the expected
  #   /<digits>/<word>/<word>/<digits>/<name>.htm layout
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/\d+/\w+/\w+/\d+/.+\.htm})
  end

  # Fills +@article+ with title, company name, content, reporter name and
  # publication time extracted from +doc+, then runs +clean_up+.
  #
  # Example URL: http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm
  #
  # @return [Hash] the populated +@article+
  def parse
    @article[:title] = doc.at_css('#newtitle').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsContent>span:not(#newtitle)>p:not(.picture)').text

    # Must run after :content is set -- the reporter name is read from it.
    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = Time.parse(doc.at_css('#date').text)

    clean_up

    @article
  end

  # Extracts the reporter's name from the article content, trying three
  # byline formats in order: bracketed byline, inline 記者 byline, and a
  # parenthesised (文/NAME) credit.
  #
  # @return [String, nil] the reporter name, or nil when no format matches
  def parse_reporter_name
    # to_s keeps this safe if it is called before :content has been set.
    content = @article[:content].to_s

    if (byline = content.match(BRACKETED_BYLINE))
      # A bracketed byline wins even when it has no 記者 prefix; in that case
      # the result is nil and the later formats are not tried.
      byline[1][REPORTER_PREFIX, 1]
    elsif (inline = content.match(INLINE_BYLINE))
      inline[1]
    elsif (credit = content.match(AUTHOR_CREDIT))
      credit[1]
    end
  end

  # @return [String] the fixed publisher name for this site
  def parse_company_name
    '自由時報'
  end

  # Replaces +@article[:url]+ with the result of passing it through a
  # TaiwaneseNewsParser::UrlCleaner built with an empty string argument.
  # NOTE(review): the meaning of that argument is defined by UrlCleaner,
  # which is not visible here.
  #
  # @return [Object] whatever the cleaner returns (also stored in +@article+)
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Derives an article identifier from +url+: the path between the host and
  # the ".htm" extension.
  #
  # NOTE(review): only matches the exact "http://www." prefix, while
  # +applicable?+ accepts any scheme/subdomain; such URLs yield nil here.
  #
  # @param url [String]
  # @return [String, nil] the captured path, or nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{http://www\.libertytimes\.com\.tw/(.*)\.htm},1]
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
taiwanese_news_parser-0.0.3 lib/taiwanese_news_parser/parser/liberty_times_big5.rb
taiwanese_news_parser-0.0.2 lib/taiwanese_news_parser/parser/liberty_times_big5.rb
taiwanese_news_parser-0.0.1 lib/taiwanese_news_parser/parser/liberty_times_big5.rb