script/collect_twss.rb in twss-0.0.3 vs script/collect_twss.rb in twss-0.0.4
- old
+ new
@@ -1,4 +1,24 @@
-require File.expand_path('../lib/twss', File.dirname(__FILE__))
-require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
-TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
+# Grab the first 2000 stories from twssstories.com (10 per page)
+
+# Crawl the paginated story listing and write every quoted phrase found in a
+# story paragraph to data/twss.txt, one phrase per line.
+#
+# The block form of File.open is used so the output file is closed on every
+# exit path, including when a page fetch or parse raises part-way through.
+File.open(File.expand_path("../../data/twss.txt", __FILE__), "w") do |f|
+  domain = "http://twssstories.com"
+  200.times do |i|
+    url = domain + "/node?page=#{i}"
+    puts url
+    # NOTE(review): opening a URL through Kernel#open relies on open-uri and
+    # was removed in Ruby 3.0 (URI.open replaces it) -- confirm the Ruby
+    # version this script is expected to run on.
+    doc = Hpricot(open(url).read)
+    doc.search('div.content p') do |story|
+      # now pull out the good stuff...
+      # The capture is greedy: it spans from the first double quote to the
+      # last double quote on the same line of the paragraph text.
+      if story.to_plain_text =~ /\"(.*)?\"/
+        f.puts Regexp.last_match(1)
+      end
+    end
+    # Push buffered output out after each page.
+    f.flush
+    # Pause for a random interval of up to 3 seconds between requests.
+    sleep rand * 3.0
+  end
+end