lib/socialcrawler.rb in socialcrawler-0.0.0 vs lib/socialcrawler.rb in socialcrawler-0.0.2

- old
+ new

@@ -20,141 +20,157 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 module SocialCrawler
+  class SocialCrawler
-  def self._put( hash, symbol , value , log=nil)
-    log = Logger.new(STDOUT) if log.nil?
-    if not hash.has_key?( symbol)
-      hash[symbol] = value
-    else
-      hash[symbol] = "#{hash[symbol]} #{value}"
-      log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
+    def initialize
+      @map = {
+          twitter: 'twitter.com/',
+          facebook: 'facebook.com/',
+          google_plus: 'plus.google.com/'
+      }
     end
-  end
-  def self.crawl_url(url,log=nil)
-    log = Logger.new(STDOUT) if log.nil?
-    log.info( "Crawling #{url}")
-    result = Hash.new('NOT FOUND')
-    begin
-      page = Nokogiri::HTML(open(url))
-      title = page.css('title')
-      if not title.nil?
-        result[:title] = title.text.strip
+    def _put(hash, symbol, value, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      if not hash.has_key?(symbol)
+        hash[symbol] = value
+      else
+        hash[symbol] = "#{hash[symbol]} #{value}"
+        log.info("Multiple values for #{symbol} value #{hash[symbol]}")
       end
+    end
+
+    def page_to_result(page, result, log)
       links = page.css('a[href]')
       links.each do |link|
         link_url = link['href']
+        @map.each do |k, prefix|
+          if not link_url.index(prefix).nil?
+            _put(result, k, link_url, log)
+          end
+        end
+      end
+    end
-        if not link_url.index('twitter.com/').nil?
-          log.info( "twitter #{link_url} for #{url}")
-          _put(result,:twitter,link_url,log)
+    def crawl_url(url, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      log.info("Crawling #{url}")
+      result = Hash.new('NOT FOUND')
+      begin
+        page = Nokogiri::HTML(open(url))
+        title = page.css('title')
+        if not title.nil?
+          result[:title] = title.text.strip
         end
-        if not link_url.index('facebook.com/').nil?
-          log.info( "facebook #{link_url} for #{url}")
-          _put(result,:facebook,link_url,log)
+        page_to_result(page, result, log)
+        result[:url] = url
+        result[:success] = true
+        result[:message] = ''
+      rescue Exception => e
+        result[:url] = url
+        result[:success] = false
+        result[:message] = "#{e}"
+      end
+      return result
+    end
+
+    def load_status_cache(status_filename, log=nil)
+      status = Hash.new
+      if not status_filename.nil? and File.exists?(status_filename)
+        log.info("Loading previous status from #{status_filename}")
+        CSV.foreach(status_filename) do |row|
+          if row.count < 3
+            next
+          end
+          url = row[0]
+          result = row[1]
+          message = row[2]
+          status[url] = {
+              :url => url,
+              :result => result,
+              :message => message
+          }
         end
-        if not link_url.index('plus.google.com/').nil?
-          log.info( "google_plus #{link_url} for #{url}")
-          _put(result,:google_plus,link_url,log)
-        end
+        log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
       end
-      result[:url] = url
-      result[:success] = true
-      result[:message] = ''
-    rescue Exception => e
-      result[:url] = url
-      result[:success] = false
-      result[:message] = "#{e}"
+      return status
     end
-    return result
-  end
-  def self.crawl( domain_list_filename, output_list_filename, status_filename=nil , log=nil)
-    log = Logger.new(STDOUT) if log.nil?
-    log.info( "Crawler started")
-    status = Hash.new
-    if not status_filename.nil? and File.exists?(status_filename)
-      log.info( "Loading previous status from #{status_filename}")
-      CSV.foreach( status_filename ) do |row|
-        begin
+    def load_output_cache(output_list_filename, log=nil)
+      data = Hash.new()
+      log.info("Loading previous status from #{output_list_filename}")
+      if not File.exist?(output_list_filename)
+        return data
+      end
+      CSV.foreach(output_list_filename) do |row|
+        log.info("Loading #{row} #{row.count}")
+        if row.count < 5
+          next
+        end
         url = row[0]
-          result = row[1]
-          message = row[2]
-          status[url] = {
+        title= row[1]
+        twitter = row[2]
+        facebook = row[3]
+        google_plus = row[4]
+        data[url] = {
             :url => url,
-            :result => result,
-            :message => message
+            :title => title,
+            :twitter => twitter,
+            :facebook => facebook,
+            :google_plus => google_plus
         }
-        rescue Exception => e
-          log.info("Exception reading file #{e}")
-        end
+        log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
      end
-      log.info( "Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
+      return data
    end
-    data = Hash.new()
-    if File.exist?(output_list_filename)
-      log.info( "Loading previous status from #{output_list_filename}")
-      CSV.open( output_list_filename ) do |row|
-        if row.count >= 5
-          url = row[0]
-          title= row[1]
-          twitter = row[2]
-          facebook = row[3]
-          google_plus = row[4]
-          data[url] = {
-            :url => url,
-            :title => title,
-            :twitter => twitter,
-            :facebook => facebook,
-            :google_plus => google_plus
-          }
-        end
+    def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      log.info("Crawler started")
+
+      status = load_status_cache(status_filename, log)
+
+      data = load_output_cache(output_list_filename, log)
+
+      CSV.open(output_list_filename, "wb") do |output|
+        data.each do |k, v|
+          output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
+        end
+        CSV.open(status_filename, "wb") do |status_line|
+          status.each do |k, v|
+            status_line << [k, v[:success], v[:message]]
+          end
+          crawl_loop(data, domain_list_filename, log, output, status, status_line)
+        end
      end
-      log.info( "Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
    end
-    CSV.foreach( domain_list_filename ) do |row|
+    def crawl_loop(data, domain_list_filename, log, output, status, status_line)
+      CSV.foreach(domain_list_filename) do |row|
        url = row[0]
        if status.has_key?(url)
-        # already visited, skip
-      else
-        result = crawl_url(url,log)
-        if result[:success] == true
-          CSV.open( output_list_filename, "wb") do |output|
-            data.each do |k,v|
-              log.info(k)
-              log.info(v)
-              output << [k,v[:title],v[:twitter],v[:facebook],v[:google_plus]]
-            end
-            output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
-            data[url] = result
-          end
-          status[url] = {
-            :url => url,
-            :result => 'success',
-            :message => ''
-          }
-          CSV.open( status_filename, "wb" ) do |status_line|
-            status_line << [url,'success','']
-          end
-        else
-          status[url] = {
-            :url => url,
-            :result => result[:success],
-            :message => result[:message]
-          }
-          CSV.open( status_filename, "wb" ) do |status_line|
-            status_line << [url,result[:success],result[:message]]
-          end
+          next
        end
+        result = crawl_url(url, log)
+        if result[:success] == true
+          data[url] = result
+          output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
+        end
+        status[url] = {
+            :url => url,
+            :result => result[:success],
+            :message => result[:message]
+        }
+        status_line << [url, result[:success], result[:message]]
      end
    end
  end
 end
 if __FILE__ == $0
-  SocialCrawler.crawl(ARGV[0],ARGV[1],ARGV[2])
+  #:nocov:
+  SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
+  #:nocov:
 end
\ No newline at end of file
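In 0.0.2 the crawler is a class rather than a set of module-level functions, so the old call `SocialCrawler.crawl(...)` no longer exists; the changed entry point at the bottom of the file shows the new spelling. A minimal sketch of driving the new API (the CSV file names are hypothetical examples, not files shipped with the gem):

    require 'socialcrawler'
    require 'logger'

    log = Logger.new(STDOUT)
    crawler = SocialCrawler::SocialCrawler.new

    # domains.csv: one URL per row; only the first column is read.
    # output.csv collects url,title,twitter,facebook,google_plus rows.
    # status.csv records url,success,message per domain; on a rerun,
    # any URL already present in it is skipped by crawl_loop.
    crawler.crawl('domains.csv', 'output.csv', 'status.csv', log)

    # A single page can also be crawled without the CSV plumbing:
    result = crawler.crawl_url('http://example.com/', log)
    puts result[:title] if result[:success]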
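The heart of the refactor is the new `@map` instance variable: the three copy-pasted `link_url.index('...').nil?` checks from 0.0.0 collapse into one loop in `page_to_result`, so supporting another network means adding one hash entry. A standalone sketch of the same pattern, assuming only the nokogiri gem (the HTML fragment is invented test input; the real code also funnels hits through `_put`, which concatenates and logs duplicate values):

    require 'nokogiri'

    map = {
      twitter:     'twitter.com/',
      facebook:    'facebook.com/',
      google_plus: 'plus.google.com/'
    }

    html = '<a href="https://twitter.com/example">t</a>' \
           '<a href="http://facebook.com/example">f</a>'

    result = Hash.new('NOT FOUND')
    Nokogiri::HTML(html).css('a[href]').each do |link|
      map.each do |name, prefix|
        # String#index returns nil when the substring is absent, so a
        # non-nil index means the href points at that network.
        result[name] = link['href'] unless link['href'].index(prefix).nil?
      end
    end

    result[:twitter]      # => "https://twitter.com/example"
    result[:google_plus]  # => "NOT FOUND" (the Hash default)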
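The resume behavior rests on the status file being plain three-column CSV. A sketch of the round trip with Ruby's stdlib CSV, mirroring what `crawl_loop` writes and `load_status_cache` reads back (file name and rows are made up for illustration):

    require 'csv'

    # Written as url, success, message - one row per crawled domain.
    CSV.open('status.csv', 'wb') do |csv|
      csv << ['http://example.com/', true, '']
      csv << ['http://bad.example/', false, 'connection refused']
    end

    # Read back keyed by URL; rows with fewer than 3 columns are skipped.
    status = {}
    CSV.foreach('status.csv') do |row|
      next if row.count < 3
      status[row[0]] = { url: row[0], result: row[1], message: row[2] }
    end

    status.has_key?('http://example.com/')  # => true, so a rerun skips it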