#!/usr/bin/env ruby

# Command-line entry point for twitter_to_csv.
#
# Parses the command-line flags into an options hash and hands that hash to
# TwitterToCsv::CsvBuilder, which does the actual work. Twitter API
# credentials are mandatory unless --replay-from-file is given.

require 'rubygems'
require 'open-uri'
require 'optparse'
require 'time'
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'twitter_to_csv'))

# Defaults; every flag below overwrites or appends to an entry in this hash.
options = { :fields => %w[created_at text], :bool_word_fields => [] }

# Splits a comma-separated flag value, ignoring whitespace around the commas.
split_list = lambda { |value| value.split(/\s*,\s*/) }

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options]"

  opts.separator ""
  opts.separator "These four fields are required. Please see the README to learn how to get them for your Twitter account."

  opts.on("--api-key KEY", "Twitter API key") do |api_key|
    options[:api_key] = api_key
  end

  opts.on("--api-secret SECRET", "Twitter API secret") do |api_secret|
    options[:api_secret] = api_secret
  end

  opts.on("--access-token TOKEN", "Twitter access token") do |access_token|
    options[:access_token] = access_token
  end

  opts.on("--access-token-secret SECRET", "Twitter access token secret") do |access_token_secret|
    options[:access_token_secret] = access_token_secret
  end

  opts.separator ""
  opts.separator "General settings:"

  opts.on("-c", "--csv FILE", "The CSV file to append to, or - for STDOUT") do |csv|
    to_stdout = csv == "-"
    # Record whether the target file already existed before it is opened in
    # append mode (opening creates it). STDOUT is never an existing file, so
    # the filesystem is not probed for a file literally named "-".
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    options[:csv_appending] = !to_stdout && File.exist?(csv)
    options[:csv] = to_stdout ? STDOUT : File.open(csv, 'a')
  end

  opts.on("-j", "--json FILE", "The JSON file to append to, or - for STDOUT") do |json|
    options[:json] = json == "-" ? STDOUT : File.open(json, 'a')
  end

  opts.on("-f", "--filter KEYWORDS", "Keywords to ask Twitter to filter on") do |filter|
    options[:filter] = split_list.call(filter)
  end

  opts.on("-x", "--fields FIELDS", "Fields to include in the CSV") do |fields|
    options[:fields] = split_list.call(fields)
  end

  opts.on("--date-fields FIELD_NAMES", "Break these fields into separate numerical columns for weekday, day, month, year, hour, minute, and second.") do |date_fields|
    options[:date_fields] = split_list.call(date_fields)
  end

  opts.on("-e", "--require-english [STRATEGY]", "Attempt to filter out non-English tweets. This will have both false positives and false negatives.",
          "The strategy can be either 'uld' to use the UnsupervisedLanguageDetection Ruby gem,",
          "'lang' to use Twitter's guessed 'lang' attribute, or 'both' to only remove tweets that",
          "both Twitter and ULD think are non-English. This is most conservative and is the default.") do |e|
    # The argument is optional; a bare -e arrives as nil and means 'both'.
    options[:require_english] = (e || "both").downcase.to_sym
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on("-r", "--replay-from-file FILENAME", "Replay tweets from a JSON dump file") do |replay_file|
    options[:replay_from_file] = replay_file
  end

  opts.on("--analyze-gaps MINUTES", "Look at the stream and display gap information for gaps longer than MINUTES") do |gap_minutes|
    options[:analyze_gaps] = gap_minutes && gap_minutes.to_i
  end

  opts.on("--sample-fields NUMBER_OF_SAMPLES", "Record NUMBER_OF_SAMPLES tweets and then print out all",
          "of the field names seen. Use to find out what can be passed to --fields.") do |samples|
    options[:sample_fields] = samples && samples.to_i
  end

  opts.on("--url-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS urls from the status and include them in the CSV") do |url_columns|
    options[:url_columns] = url_columns.to_i
  end

  opts.on("--hash-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS hashtags (#foo) from the status and include them in the CSV") do |hashtag_columns|
    options[:hashtag_columns] = hashtag_columns.to_i
  end

  opts.on("--user-columns NUM_COLUMNS", "Extract up to NUM_COLUMNS user mentions (@foo) from the status and include them in the CSV") do |user_mention_columns|
    options[:user_mention_columns] = user_mention_columns.to_i
  end

  opts.on("-s", "--compute-sentiment", "Compute an average sentiment score for each status using the AFINN-111 sentiment dictionary") do |compute_sentiment|
    options[:compute_sentiment] = compute_sentiment
  end

  opts.on("--compute-word-count", "Include a word count for each status in the output CSV") do |compute_word_count|
    options[:compute_word_count] = compute_word_count
  end

  opts.on("--normalize-source", "Return just the domain name from the Tweet source (i.e., tweetdeck, facebook)") do |normalize_source|
    options[:normalize_source] = normalize_source
  end

  opts.on("--remove-quotes", "This option strips all double quotes from the output to help some CSV parsers.") do |remove_quotes|
    options[:remove_quotes] = remove_quotes
  end

  opts.on("--prefix-ids", "Prefix any field ending in _id or _id_str with 'id' to force parsing as a string in some programs.") do |prefix_ids|
    options[:prefix_ids] = prefix_ids
  end

  opts.on("-w", "--bool-word-field \"NAME:WORD AND WORD AND WORD\"",
          "Create a named CSV column that is true when the word expression matches, false otherwise.",
          "Word expressions are boolean expressions where neighboring words must occur sequentially",
          "and you can use parentheses, AND, and OR to test for occurrence relationships. Examples:",
          " keyword_any:tanning booth OR tanning booths OR tanningbooth",
          " keyword_both:tanning AND booth",
          " keyword_complex:tanning AND (booth OR bed)",
          "This option can be used multiple times.") do |bool_word_field|
    # Each occurrence of the flag appends one parsed expression.
    options[:bool_word_fields] << TwitterToCsv::BoolWordFieldParser.parse(bool_word_field)
  end

  opts.on("--start TIME", "Ignore tweets with a created_at earlier than TIME") do |start_time|
    options[:start_time] = Time.parse(start_time)
  end

  opts.on("--end TIME", "Ignore tweets with a created_at later than TIME") do |end_time|
    options[:end_time] = Time.parse(end_time)
  end

  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    STDERR.puts "twitter_to_csv version #{TwitterToCsv::VERSION}"
    exit
  end

  opts.separator ""
  opts.separator "If you would like to do special retweet handling, use the following options."
  opts.separator "For these to function, you must be using --replay-from-file. The replay will be performed in reverse."

  opts.on("--retweet-mode MODE", "Determine how to handle retweets", "Options are just 'ROLLUP'") do |retweet_mode|
    options[:retweet_mode] = retweet_mode.downcase.to_sym
  end

  opts.on("--retweet-threshold COUNT", "Only consider statuses with at least COUNT retweets") do |retweet_threshold|
    options[:retweet_threshold] = retweet_threshold.to_i
  end

  opts.on("--retweet-window WINDOW", "Ignore retweets that occur beyond WINDOW days",
          "Additionally, statuses where WINDOW days have not yet passed will be ignored.") do |retweet_window|
    options[:retweet_window] = retweet_window.to_i
  end

  opts.on("--retweet-counts-at HOURS", "Output the number of retweets seen at specific times after the original tweet") do |retweet_counts_at|
    options[:retweet_counts_at] = retweet_counts_at.split(",").map(&:to_f)
  end
end

# Report malformed command lines (unknown flag, missing argument, ...) as a
# short message plus the usage text instead of an uncaught-exception backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  STDERR.puts "Error: #{e.message}\n\n"
  STDERR.puts parser
  exit 1
end

# Live streaming needs all four credentials; replaying a dump file needs none.
credential_keys = [:api_key, :api_secret, :access_token, :access_token_secret]
has_credentials = credential_keys.all? { |key| options[key] }

if !has_credentials && !options[:replay_from_file]
  STDERR.puts "Error: The four Twitter credential fields are required unless you're replaying from a file.\n\n"
  STDERR.puts parser
  exit 1
end

TwitterToCsv::CsvBuilder.new(options).run