script/collect_twss.rb in twss-0.0.3 vs script/collect_twss.rb in twss-0.0.4

- old
+ new

@@ -1,4 +1,24 @@
-require File.expand_path('../lib/twss', File.dirname(__FILE__))
-require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
 
-TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
+# Grab the first 2000 stories from twssstories.com (10 per page)
+
+f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
+
+domain = "http://twssstories.com"
+200.times do |i|
+  url = domain + "/node?page=#{i}"
+  puts url
+  doc = Hpricot(open(url).read)
+  doc.search('div.content p') do |story|
+    # now pull out the good stuff...
+    if story.to_plain_text =~ /\"(.*)?\"/
+      f.puts $1
+    end
+  end
+  f.flush
+  sleep rand * 3.0
+end
+
+f.close