collect_twss.rb in twss-0.0.4

- old
+ new
@@ -1,4 +1,24 @@
-require File.expand_path('../lib/twss', File.dirname(__FILE__))
-require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
 
-TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
+# Grab the first 2000 stories from twssstories.com (10 per page) and write the
+# quoted part of each story paragraph to ../data/twss.txt, one match per line.
+
+out_path = File.expand_path("../../data/twss.txt", __FILE__)
+domain = "http://twssstories.com"
+# Block form closes the file even when a fetch or parse raises part-way through.
+File.open(out_path, "w") do |f|
+  200.times do |i|
+    url = domain + "/node?page=#{i}"
+    puts url
+    # URI#read comes from open-uri; Kernel#open no longer accepts URLs on Ruby 3.0+.
+    doc = Hpricot(URI.parse(url).read)
+    doc.search('div.content p') do |story|
+      # Greedy capture: first double quote through the last one on the same line.
+      f.puts $1 if story.to_plain_text =~ /\"(.*)?\"/
+    end
+    f.flush # keep completed pages on disk if a later request fails
+    sleep rand * 3.0 # random pause of up to three seconds between requests
+  end
+end