# Copyright (c) 2009 [Cyril David]
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 'nokogiri'
require 'open-uri'
module Scraper
class Article
# Error that Article.=~ rescues around Article.new and turns into a nil
# result. Where it is raised is not visible in this excerpt -- presumably
# by the constructor when the input cannot be handled (TODO confirm).
class Unsupported < StandardError
end
# Name-matching patterns. The code that applies them is not visible in this
# excerpt; presumably they are tested against an element's `class` and `id`
# attribute values (TODO confirm against the caller).

# Matches any string containing "comment", "meta", "footer" or "footnote".
BAD_CLASS_NAMES = /(comment|meta|footer|footnote)/

# Matches a whitespace-delimited token such as "post", "hentry",
# "entry-content" or "article-body" anywhere in a multi-token string.
# The delimiters must be written `\s` here: in a regexp literal `\\s` means a
# literal backslash followed by "s", which made values like "sidebar post"
# impossible to match.
GOOD_CLASS_NAMES = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/

# Same word list as BAD_CLASS_NAMES.
BAD_ID_NAMES = /(comment|meta|footer|footnote)/

# Matches only when the whole line is one of the accepted names.
GOOD_ID_NAMES = /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/
# Read-only accessor for @title. The usage notes in this file show it
# returning the page title as a String; where @title is assigned is not
# visible in this excerpt.
attr_reader :title
# Match-style probe: tries to build an Article from +args+.
#
# args - whatever Scraper::Article.new accepts (forwarded unchanged).
#
# Returns the newly built Scraper::Article (truthy) on success, or nil when
# the constructor raises Scraper::Article::Unsupported. Any other exception
# propagates to the caller.
#
# Side effect: on success the article is also stored in the receiver's
# @article instance variable; on failure the previous value is left as is.
def self.=~(args)
  @article = Scraper::Article.new(args)
rescue Scraper::Article::Unsupported
  nil
end
# Usage:
# ======
#
# require 'open-uri'
# @resource = URI.open("http://tinyurl.com/ys9wt")
# @article = Scraper::Article.new(@resource.read)
# @article.title
# => "Open Source Initiative OSI - The MIT License:Licensing ..."
#
# @article.text
# => "The MIT License\nCopyright (c)
[ \r\n\s]*
/
content.gsub(pattern, '
').gsub(/<\/?font[^>]*>/, '') end def get_char_count( node, char = ',' ) node.content.split(char).length end def clean_styles!( node ) node.search('*').remove_attr('style') end def kill_divs!( node ) node.search('div').each do |div| p = div.search('p').length img = div.search('img').length li = div.search('li').length a = div.search('a').length embed = div.search('embed').length if get_char_count( div ) < 10 if img > p || li > p || a > p || p == 0 || embed > 0 div.remove end end end end def clean_tags!(node, tags, min_words = 1000000) node.search(tags).each do |target| if get_char_count( target, " " ) < min_words target.remove end end end end end