lib/socialcrawler.rb in socialcrawler-0.0.0 vs lib/socialcrawler.rb in socialcrawler-0.0.2
- old
+ new
@@ -20,141 +20,157 @@
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
module SocialCrawler
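+ # Crawls web pages and collects links to social profiles
+ # (Twitter, Facebook, Google+), persisting progress to CSV files.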
+ class SocialCrawler
- def self._put( hash, symbol , value , log=nil)
- log = Logger.new(STDOUT) if log.nil?
- if not hash.has_key?( symbol)
- hash[symbol] = value
- else
- hash[symbol] = "#{hash[symbol]} #{value}"
- log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
+ def initialize
+ @map = {
+ twitter: 'twitter.com/',
+ facebook: 'facebook.com/',
+ google_plus: 'plus.google.com/'
+ }
end
- end
- def self.crawl_url(url,log=nil)
- log = Logger.new(STDOUT) if log.nil?
- log.info( "Crawling #{url}")
- result = Hash.new('NOT FOUND')
- begin
- page = Nokogiri::HTML(open(url))
- title = page.css('title')
- if not title.nil?
- result[:title] = title.text.strip
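+ # Stores value under symbol; when the key is already present the
+ # values are concatenated and the duplicate is logged.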
+ def _put(hash, symbol, value, log=nil)
+ log = Logger.new(STDOUT) if log.nil?
+ if hash.key?(symbol)
+ hash[symbol] = "#{hash[symbol]} #{value}"
+ log.info("Multiple values for #{symbol} value #{hash[symbol]}")
+ else
+ hash[symbol] = value
end
+ end
+
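+ # Scans every <a href> on the page and records links that match
+ # one of the known social network fragments.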
+ def page_to_result(page, result, log)
links = page.css('a[href]')
links.each do |link|
link_url = link['href']
+ @map.each do |k, prefix|
+ if link_url.include?(prefix)
+ _put(result, k, link_url, log)
+ end
+ end
+ end
+ end
- if not link_url.index('twitter.com/').nil?
- log.info( "twitter #{link_url} for #{url}")
- _put(result,:twitter,link_url,log)
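+ # Fetches url, extracts its <title> and social links, and returns a
+ # result hash whose :success and :message fields describe the outcome.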
+ def crawl_url(url, log=nil)
+ log = Logger.new(STDOUT) if log.nil?
+ log.info("Crawling #{url}")
+ result = Hash.new('NOT FOUND')
+ begin
+ page = Nokogiri::HTML(open(url))
+ title = page.css('title')
+ unless title.empty?
+ result[:title] = title.text.strip
end
- if not link_url.index('facebook.com/').nil?
- log.info( "facebook #{link_url} for #{url}")
- _put(result,:facebook,link_url,log)
+ page_to_result(page, result, log)
+ result[:url] = url
+ result[:success] = true
+ result[:message] = ''
+ rescue StandardError => e
+ result[:url] = url
+ result[:success] = false
+ result[:message] = "#{e}"
+ end
+ return result
+ end
+
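+ # Reloads a previous status CSV (url, result, message) so that
+ # already-visited URLs can be skipped on the next run.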
+ def load_status_cache(status_filename, log=nil)
+ log = Logger.new(STDOUT) if log.nil?
+ status = Hash.new
+ if status_filename && File.exist?(status_filename)
+ log.info("Loading previous status from #{status_filename}")
+ CSV.foreach(status_filename) do |row|
+ if row.count < 3
+ next
+ end
+ url = row[0]
+ result = row[1]
+ message = row[2]
+ status[url] = {
+ :url => url,
+ :result => result,
+ :message => message
+ }
end
- if not link_url.index('plus.google.com/').nil?
- log.info( "google_plus #{link_url} for #{url}")
- _put(result,:google_plus,link_url,log)
- end
+ log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
end
- result[:url] = url
- result[:success] = true
- result[:message] = ''
- rescue Exception => e
- result[:url] = url
- result[:success] = false
- result[:message] = "#{e}"
+ return status
end
- return result
- end
- def self.crawl( domain_list_filename, output_list_filename, status_filename=nil , log=nil)
- log = Logger.new(STDOUT) if log.nil?
- log.info( "Crawler started")
- status = Hash.new
- if not status_filename.nil? and File.exists?(status_filename)
- log.info( "Loading previous status from #{status_filename}")
- CSV.foreach( status_filename ) do |row|
- begin
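+ # Reloads a previous output CSV (url, title, twitter, facebook,
+ # google_plus) into a hash keyed by URL.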
+ def load_output_cache(output_list_filename, log=nil)
+ log = Logger.new(STDOUT) if log.nil?
+ data = Hash.new
+ return data unless File.exist?(output_list_filename)
+ log.info("Loading previous output from #{output_list_filename}")
+ CSV.foreach(output_list_filename) do |row|
+ log.info("Loading #{row} #{row.count}")
+ if row.count < 5
+ next
+ end
url = row[0]
- result = row[1]
- message = row[2]
- status[url] = {
+ title = row[1]
+ twitter = row[2]
+ facebook = row[3]
+ google_plus = row[4]
+ data[url] = {
:url => url,
- :result => result,
- :message => message
+ :title => title,
+ :twitter => twitter,
+ :facebook => facebook,
+ :google_plus => google_plus
}
- rescue Exception => e
- log.info("Exception reading file #{e}")
- end
+ log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
end
- log.info( "Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
+ return data
end
- data = Hash.new()
- if File.exist?(output_list_filename)
- log.info( "Loading previous status from #{output_list_filename}")
- CSV.open( output_list_filename ) do |row|
- if row.count >= 5
- url = row[0]
- title= row[1]
- twitter = row[2]
- facebook = row[3]
- google_plus = row[4]
- data[url] = {
- :url => url,
- :title => title,
- :twitter => twitter,
- :facebook => facebook,
- :google_plus => google_plus
- }
- end
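+ # Entry point: reloads both caches, rewrites them to fresh CSV files,
+ # then crawls every domain from domain_list_filename not yet visited.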
+ def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
+ log = Logger.new(STDOUT) if log.nil?
+ log.info("Crawler started")
+
+ status = load_status_cache(status_filename, log)
+
+ data = load_output_cache(output_list_filename, log)
+
+ CSV.open(output_list_filename, "wb") do |output|
+ data.each do |k, v|
+ output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
+ end
+ CSV.open(status_filename, "wb") do |status_line|
+ status.each do |k, v|
+ status_line << [k, v[:result], v[:message]]
+ end
+ crawl_loop(data, domain_list_filename, log, output, status, status_line)
+ end
end
- log.info( "Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
end
- CSV.foreach( domain_list_filename ) do |row|
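+ # Visits each domain that has no cached status, appending successful
+ # results to the output CSV and every outcome to the status CSV.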
+ def crawl_loop(data, domain_list_filename, log, output, status, status_line)
+ CSV.foreach(domain_list_filename) do |row|
url = row[0]
if status.has_key?(url)
- # already visited, skip
- else
- result = crawl_url(url,log)
- if result[:success] == true
- CSV.open( output_list_filename, "wb") do |output|
- data.each do |k,v|
- log.info(k)
- log.info(v)
- output << [k,v[:title],v[:twitter],v[:facebook],v[:google_plus]]
- end
- output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
- data[url] = result
- end
- status[url] = {
- :url => url,
- :result => 'success',
- :message => ''
- }
- CSV.open( status_filename, "wb" ) do |status_line|
- status_line << [url,'success','']
- end
- else
- status[url] = {
- :url => url,
- :result => result[:success],
- :message => result[:message]
- }
- CSV.open( status_filename, "wb" ) do |status_line|
- status_line << [url,result[:success],result[:message]]
- end
+ next
end
+ result = crawl_url(url, log)
+ if result[:success] == true
+ data[url] = result
+ output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
+ end
+ status[url] = {
+ :url => url,
+ :result => result[:success],
+ :message => result[:message]
+ }
+ status_line << [url, result[:success], result[:message]]
end
end
end
end
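+ # Example (filenames are illustrative; domains.csv holds one URL per row):
+ #   crawler = SocialCrawler::SocialCrawler.new
+ #   crawler.crawl('domains.csv', 'output.csv', 'status.csv')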
if __FILE__ == $0
- SocialCrawler.crawl(ARGV[0],ARGV[1],ARGV[2])
+ #:nocov:
+ SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
+ #:nocov:
end
\ No newline at end of file