#!/usr/bin/env ruby

# Command-line entry point for twitter_to_csv.
#
# Parses the command-line flags into an options hash and hands that hash to
# TwitterToCsv::CsvBuilder, which does the actual work. Either Twitter
# credentials (--username and --password) or --replay-from-file must be given;
# otherwise the script prints usage to STDERR and exits with status 1.

require 'rubygems'
require 'open-uri'
require 'optparse'
require 'time'

require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))

# Default set of CSV columns; replaced wholesale when --fields is given.
options = { :fields => %w[created_at text] }

# Splits a comma-separated argument into its items, ignoring whitespace
# around the commas.
split_list = lambda { |value| value.split(/\s*,\s*/) }

# Parses a time argument, reporting unparseable input as an option-parsing
# error (so it is handled with the other usage errors) instead of letting a
# bare ArgumentError escape.
parse_time = lambda do |value|
  begin
    Time.parse(value)
  rescue ArgumentError
    raise OptionParser::InvalidArgument, value
  end
end

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"

  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-u", "--username USERNAME", "Twitter username") do |username|
    options[:username] = username
  end

  opts.on("-p", "--password PASSWORD", "Twitter password") do |password|
    options[:password] = password
  end

  opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
    to_stdout = csv == "-"
    # Record whether we are appending to an existing file. This must be
    # checked before File.open(..., 'a') creates the file. STDOUT is never an
    # append target. File.exists? was removed in Ruby 3.2; use File.exist?.
    options[:csv_appending] = !to_stdout && File.exist?(csv)
    options[:csv] = to_stdout ? STDOUT : File.open(csv, 'a')
  end

  opts.on("-j", "--json FILE", "The JSON file to append to, or - for STDOUT") do |json|
    options[:json] = json == "-" ? STDOUT : File.open(json, 'a')
  end

  opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
    options[:filter] = split_list.call(filter)
  end

  opts.on("-x", "--fields FIELDS", "Fields to include in the CSV") do |fields|
    options[:fields] = split_list.call(fields)
  end

  opts.on("--date-fields FIELD_NAMES", "Break these fields into separate numerical columns for weekday, day, month, year, hour, minute, and second.") do |date_fields|
    options[:date_fields] = split_list.call(date_fields)
  end

  opts.on("-e", "--require-english", "Attempt to filter out non-English tweets.",
          "This will have both false positives and false negatives.") do |e|
    options[:require_english] = e
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on("-r", "--replay-from-file FILENAME", "Replay tweets from a JSON dump file") do |replay_file|
    options[:replay_from_file] = replay_file
  end

  opts.on("--analyze-gaps MINUTES", "Look at the stream and display gap information for gaps longer than MINUTES") do |gap_minutes|
    options[:analyze_gaps] = gap_minutes && gap_minutes.to_i
  end

  opts.on("--sample-fields NUMBER_OF_SAMPLES", "Record NUMBER_OF_SAMPLES tweets and then print out all",
          "of the field names seen. Use to find out what can be passed to --fields.") do |samples|
    options[:sample_fields] = samples && samples.to_i
  end

  opts.on("--url-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS urls from the status and include them in the CSV") do |url_columns|
    options[:url_columns] = url_columns.to_i
  end

  opts.on("--hash-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS hashtags (#foo) from the status and include them in the CSV") do |hashtag_columns|
    options[:hashtag_columns] = hashtag_columns.to_i
  end

  opts.on("--user-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS user mentions (@foo) from the status and include them in the CSV") do |user_mention_columns|
    options[:user_mention_columns] = user_mention_columns.to_i
  end

  opts.on("-s", "--compute-sentiment", "Compute an average sentiment score for each status using the AFINN-111 sentiment dictionary") do |compute_sentiment|
    options[:compute_sentiment] = compute_sentiment
  end

  opts.on("--compute-word-count", "Include a word count for each status in the output CSV") do |compute_word_count|
    options[:compute_word_count] = compute_word_count
  end

  opts.on("--normalize-source", "Return just the domain name from the Tweet source (i.e., tweetdeck, facebook)") do |normalize_source|
    options[:normalize_source] = normalize_source
  end

  opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
    options[:start_time] = parse_time.call(start_time)
  end

  opts.on("--end TIME", "Ignore tweets with a created_at later than TIME") do |end_time|
    options[:end_time] = parse_time.call(end_time)
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end

  opts.separator ""
  opts.separator "If you would like to do special retweet handling, use the following options."
  opts.separator "For these to function, you must be using --replay-from-file. The replay will be performed in reverse."

  opts.on("--retweet-mode MODE", "Determine how to handle retweets", "Options are just 'ROLLUP'") do |retweet_mode|
    options[:retweet_mode] = retweet_mode.downcase.to_sym
  end

  opts.on("--retweet-threshold COUNT", "Only consider statuses with at least COUNT retweets") do |retweet_threshold|
    options[:retweet_threshold] = retweet_threshold.to_i
  end

  opts.on("--retweet-window WINDOW", "Ignore retweets that occur beyond WINDOW days",
          "Additionally, statuses where WINDOW days have not yet passed will be ignored.") do |retweet_window|
    options[:retweet_window] = retweet_window.to_i
  end

  opts.on("--retweet-counts-at HOURS", "Output the number of retweets seen at specific times after the original tweet") do |retweet_counts_at|
    options[:retweet_counts_at] = retweet_counts_at.split(",").map(&:to_f)
  end
end

# Report malformed command lines (unknown flag, missing or invalid argument)
# the same way as the credential check below, rather than with a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

unless (options[:username] && options[:password]) || options[:replay_from_file]
  STDERR.puts "Error: Twitter username and password are required fields unless you're replaying from a file.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run