Sha256: ff3051d5e963ae70ffce4c36518897d73568f7c189e2c94e8e7ce22d7056ddca

Contents?: true

Size: 1.62 KB

Versions: 2

Compression:

Stored size: 1.62 KB

Contents

require "open-uri" # provides URI.open, used by Twexicon::Scraper#scrape_tweets

# Scrapes the tweets shown on a user's Twitter page and sorts the pieces of
# each tweet's text into categories.
#
# Tweets live in #tweets, a Hash keyed by a 1-based counter. Each value is a
# single-entry Hash that maps the tweet's text to its category buckets:
#
#   { 1 => { "tweet text" => { :pix => [], :links => [], ..., :words => [] } } }
class Twexicon::Scraper
  # Bucket names, in the order they appear in every tweet's bucket Hash.
  CATEGORIES = %i[pix links hashtags usernames numbers acronyms shouts words].freeze

  # Ordered extraction steps: [bucket, pattern, optional clean-up lambda].
  #
  # Order matters. Every step blanks out what it matched, so later (broader)
  # patterns only ever see the text that earlier steps left behind; e.g. links
  # are removed before hashtags so that a URL fragment is not read as a hashtag.
  #
  # NOTE: inside a character class `\b` means "backspace", not "word boundary",
  # so it is deliberately not used in any of the classes below.
  EXTRACTION_STEPS = [
    # Picture short links; the dots are escaped so they only match literal dots.
    [:pix, /pic\.twitter\.com\/\w{10}/],
    [:links, /https?:\/\/[\w\.\?\=\&\-\/\#]+/],
    [:hashtags, /#\w+/],
    [:usernames, /@\w+/],
    # Digits with one optional ":" or "." separator (times, decimals). A
    # dangling separator ("5.") is trimmed from the stored value.
    [:numbers, /\d+[:.]?\d*/, ->(match) { match.sub(/[:.]\z/, "") }],
    # Dotted acronyms of two or three letters, e.g. "U.S." or "U.S.A.".
    [:acronyms, /\b[A-Z]\.[A-Z]\.(?:[A-Z]\.)?/],
    # Runs of upper-case words, or one upper-case word of 4+ letters. Each word
    # must be followed by a non-word character, so a shout at the very end of
    # the text is not captured by this step. Punctuation inside the stored
    # value is turned into spaces.
    [:shouts, /(?:[A-Z]+\W){2,}|[A-Z]{4,}\W/, ->(match) { match.gsub(/\W/, " ").strip }],
    # Undotted upper-case acronyms of two or three letters.
    [:acronyms, /\b[A-Z]{2,3}\b/],
    # Whatever is left: words, optionally joined by one apostrophe or slash.
    [:words, /\w+['\/]?\w*/]
  ].freeze

  private_constant :CATEGORIES, :EXTRACTION_STEPS

  # @return [Hash{Integer => Hash{String => Hash{Symbol => Array<String>}}}]
  #   the scraped tweets, keyed by a 1-based counter
  attr_reader :tweets

  # Scrapes the given user's page immediately. The buckets stay empty until
  # #refine_tweets is called.
  #
  # @param username [String] account name, interpolated into the page URL
  def initialize(username)
    @tweets = {}
    scrape_tweets(username)
  end

  # Fetches "https://twitter.com/<username>" and appends one entry to #tweets
  # for every element matching the ".tweet-text" CSS selector.
  #
  # URI.open is used instead of Kernel#open: opening URLs through Kernel#open
  # was deprecated in Ruby 2.7 and removed in Ruby 3.0. The block form also
  # closes the downloaded document once it has been parsed.
  #
  # @param username [String] account name, interpolated into the page URL
  # @return [Object] the collection of matched nodes that was iterated
  def scrape_tweets(username)
    page = URI.open("https://twitter.com/#{username}") { |io| Nokogiri::HTML(io) }
    page.css(".tweet-text").each do |node|
      tweets[tweets.length + 1] = { node.text => empty_buckets }
    end
  end

  # Fills every tweet's buckets by running EXTRACTION_STEPS, in order, over a
  # working copy of the tweet's text. The stored text itself is not modified.
  #
  # Matches are appended to the buckets, so calling this method more than once
  # stores the same tokens again.
  #
  # @return [Hash] the #tweets Hash
  def refine_tweets
    tweets.each do |_num, tweet|
      text, buckets = tweet.first
      remaining = text.dup # Hash keys are frozen; work on a mutable copy

      EXTRACTION_STEPS.each do |category, pattern, tidy|
        # A single pass both records each match and blanks it out of the
        # working copy (the block's value is the replacement text).
        remaining.gsub!(pattern) do |match|
          buckets[category] << (tidy ? tidy.call(match) : match)
          " "
        end
      end
    end
  end

  private

  # Builds a fresh bucket Hash with a separate empty Array for each category.
  def empty_buckets
    CATEGORIES.each_with_object({}) { |category, buckets| buckets[category] = [] }
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
twexicon-0.1.7.1 lib/twexicon/scraper.rb
twexicon-0.1.6 lib/twexicon/scraper.rb