lib/socialcrawler.rb in socialcrawler-0.0.3 vs lib/socialcrawler.rb in socialcrawler-0.0.4
- old
+ new
@@ -26,11 +26,16 @@
# Builds @map, the table of social networks known to this crawler.
# Each key names a network and each value is a URL fragment for that
# network (presumably matched against links found on a crawled page; the
# code that reads @map is not visible in this hunk, so verify).
# NOTE(review): the instagram and you_tube fragments do not end in '/' like
# the other entries, and instagram alone carries a 'www.' prefix. Confirm
# this is intentional.
def initialize
@map = {
twitter: 'twitter.com/',
facebook: 'facebook.com/',
- google_plus: 'plus.google.com/'
+ google_plus: 'plus.google.com/',
+ instagram: 'www.instagram.com',
+ you_tube: 'youtube.com/user',
+ pinterest: 'pinterest.com/',
+ linked_in: 'linkedin.com/',
+ flickr: 'flickr.com/'
}
end
def _put(hash, symbol, value, log=nil)
log = Logger.new(STDOUT) if log.nil?
@@ -55,11 +60,11 @@
end
def crawl_url(url, log=nil)
log = Logger.new(STDOUT) if log.nil?
log.info("Crawling #{url}")
- result = Hash.new('NOT FOUND')
+ result = Hash.new(:NOT_FOUND)
begin
page = Nokogiri::HTML(open(url))
title = page.css('title')
if not title.nil?
result[:title] = title.text.strip
@@ -79,17 +84,11 @@
# Loads the crawl status recorded by a previous run.
#
# @param status_filename [String, nil] path of the status CSV; nil or a
#   missing file yields an empty cache
# @param log [Logger, nil] destination for progress messages; when nil a
#   Logger on STDOUT is created, matching the other methods of this class
# @return [Hash] the cache as filled in by set_status_cache_data, which is
#   called once per CSV row
def load_status_cache(status_filename, log=nil)
  status = {}
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  return status if status_filename.nil? || !File.exist?(status_filename)

  # A nil log used to raise NoMethodError on the first log.info call below.
  log = Logger.new(STDOUT) if log.nil?
  log.info("Loading previous status from #{status_filename}")
  CSV.foreach(status_filename) do |row|
    set_status_cache_data(status, row)
  end
  log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
  status
end
@@ -99,20 +98,11 @@
log.info("Loading previous status from #{output_list_filename}")
if not File.exist?(output_list_filename)
return data
end
CSV.foreach(output_list_filename) do |row|
- log.info("Loading #{row} #{row.count}")
- if row.count >= 5
- data[row[0]] = {
- :url => row[0],
- :title => row[1],
- :twitter => row[2],
- :facebook => row[3],
- :google_plus => row[4]
- }
- end
+ set_output_cache_data(data, row)
log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
end
return data
end
@@ -123,17 +113,13 @@
status = load_status_cache(status_filename, log)
data = load_output_cache(output_list_filename, log)
CSV.open(output_list_filename, "wb") do |output|
- data.each do |k, v|
- output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
- end
+ write_data(data, output)
CSV.open(status_filename, "wb") do |status_line|
- status.each do |k, v|
- status_line << [k, v[:success], v[:message]]
- end
+ write_status(status, status_line)
crawl_loop(data, domain_list_filename, log, output, status, status_line)
end
end
end
@@ -149,10 +135,22 @@
end
end
private
# Writes every cached result to output, one row per entry, in the column
# order: key, title, twitter, facebook, google_plus.
# data   - collection yielding [key, value] pairs; each value is indexed
#          with the symbols listed above
# output - row sink responding to <<; the visible caller passes the writer
#          yielded by CSV.open
# NOTE(review): the networks added to @map in this same change (instagram,
# you_tube, pinterest, linked_in, flickr) are not written here. Confirm
# whether the output file is meant to include them.
+ def write_data(data, output)
+ data.each do |k, v|
+ output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
+ end
+ end
+
# Writes every cached status entry to status_line as [url, outcome, message].
#
# The status cache stores the outcome under :result (see
# set_status_cache_data), so that key is read first. :success is still
# honoured for an entry shaped like a raw crawl result. Reading only
# :success, as before, wrote an empty outcome column for every entry
# loaded from a previous run.
#
# @param status [Hash] url => entry hash
# @param status_line [#<<] row sink, e.g. the writer yielded by CSV.open
# @return [Hash] status itself (the value of Hash#each)
def write_status(status, status_line)
  status.each do |url, entry|
    outcome = entry.key?(:result) ? entry[:result] : entry[:success]
    status_line << [url, outcome, entry[:message]]
  end
end
+
def set_data(result, url, data, output)
if result[:success] == true
data[url] = result
output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
end
@@ -164,13 +162,35 @@
:result => result[:success],
:message => result[:message]
}
status_line << [url, result[:success], result[:message]]
end
+
# Adds one row of a previously written output file to the data cache.
# Rows with fewer than five columns are ignored. An accepted row is stored
# under its first column, with the first five columns kept as :url, :title,
# :twitter, :facebook and :google_plus; any further columns are dropped.
# data - cache indexed by url, mutated in place
# row  - indexable row; the visible caller passes rows from CSV.foreach
+ def set_output_cache_data(data, row)
+ if row.count >= 5
+ data[row[0]] = {
+ :url => row[0],
+ :title => row[1],
+ :twitter => row[2],
+ :facebook => row[3],
+ :google_plus => row[4]
+ }
+ end
+ end
+
# Adds one row of a previously written status file to the status cache.
# Rows with fewer than three columns are ignored. An accepted row is stored
# under its first column, with the first three columns kept as :url,
# :result and :message.
# status - cache indexed by url, mutated in place
# row    - indexable row; the visible caller passes rows from CSV.foreach
# NOTE(review): values are stored exactly as received, so :result is
# presumably the CSV text (e.g. "true") and not a boolean. Verify before
# comparing it with true/false.
+ def set_status_cache_data(status, row)
+ if row.count >= 3
+ status[row[0]] = {
+ :url => row[0],
+ :result => row[1],
+ :message => row[2]
+ }
+ end
+ end
end
end
# Command-line entry point: runs only when this file is executed directly,
# not when it is required. The first three command-line arguments are
# passed unchanged to SocialCrawler#crawl (their meaning is defined by
# crawl, whose signature is not visible here).
# The :nocov: comment pair brackets the call, presumably to exclude it from
# coverage reports. Verify against the coverage tool in use.
if __FILE__ == $0
- # :nocov:
+ #:nocov:
SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
- # :nocov:
+ #:nocov:
end
\ No newline at end of file