lib/twitterscraper/query.rb: twitterscraper-ruby 0.11.0 vs 0.12.0
- old
+ new
@@ -42,18 +42,22 @@
  end
end

def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
  return nil if stop_requested?
-   Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+   unless proxies.empty?
+     proxy = proxies.sample
+     logger.info("Using proxy #{proxy}")
+   end
+   Http.get(url, headers, proxy, timeout)
rescue => e
  logger.debug "query_single_page: #{e.inspect}"
  if (retries -= 1) > 0
-     logger.info("Retrying... (Attempts left: #{retries - 1})")
+     logger.info "Retrying... (Attempts left: #{retries - 1})"
    retry
  else
-     raise
+     raise Error.new("#{e.inspect} url=#{url}")
  end
end

def parse_single_page(text, html = true)
  return [nil, nil] if text.nil? || text == ''
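
The rewritten get_single_page does two things: it only samples a proxy when the pool is non-empty (`proxy` still ends up defined as nil in the empty case, because the Ruby parser creates the local at parse time even when the branch is skipped), and it wraps the final failure in the gem's Error class with the offending URL attached. A standalone sketch of the same retry-budget pattern, with illustrative names (`flaky_request` stands in for Http.get):

    def flaky_request(url)
      raise IOError, 'connection reset' if rand < 0.5 # simulate an unreliable network
      "<html>response for #{url}</html>"
    end

    def fetch(url, retries = 3)
      flaky_request(url)
    rescue => e
      if (retries -= 1) > 0
        retry # re-runs the method body; `retries` keeps its decremented value
      else
        raise "#{e.inspect} url=#{url}" # the final failure carries the URL for debugging
      end
    end

    begin
      puts fetch('https://example.com')
    rescue => e
      puts "gave up: #{e.message}"
    end
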
@@ -69,31 +73,31 @@
  [items_html, json_resp]
end

def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-   logger.info("Querying #{query}")
+   logger.info "Querying #{query}"
  query = ERB::Util.url_encode(query)
  url = build_query_url(query, lang, pos, from_user)
  http_request = lambda do
-     logger.debug("Scraping tweets from #{url}")
+     logger.debug "Scraping tweets from #{url}"
    get_single_page(url, headers, proxies)
  end
  if cache_enabled?
    client = Cache.new
    if (response = client.read(url))
-       logger.debug('Fetching tweets from cache')
+       logger.debug 'Fetching tweets from cache'
    else
      response = http_request.call
-       client.write(url, response)
+       client.write(url, response) unless stop_requested?
    end
  else
    response = http_request.call
  end
-   return [], nil if response.nil?
+   return [], nil if response.nil? || response.empty?
  html, json_resp = parse_single_page(response, pos.nil?)
  tweets = Tweet.from_html(html)
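
Two behavioral changes land in query_single_page: empty responses are now treated like nil ones, and a response is no longer written back to the cache once a stop has been requested, so an aborted fetch cannot be cached as if it were complete. The cache branch itself is a plain read-through pattern; a minimal sketch with an in-memory Hash standing in for the gem's Cache class (all names illustrative):

    STORE = {} # stands in for Twitterscraper::Cache

    def fetch_with_cache(url, stop_requested: false)
      if (response = STORE[url])
        puts 'cache hit'
      else
        response = "body of #{url}" # stands in for the real HTTP request
        STORE[url] = response unless stop_requested # never cache an interrupted fetch
      end
      response
    end

    fetch_with_cache('https://example.com') # miss: performs the "request" and caches it
    fetch_with_cache('https://example.com') # hit: served from STORE
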
@@ -112,35 +116,35 @@
OLDEST_DATE = Date.parse('2006-03-21')

def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
  if query.nil? || query == ''
-     raise 'Please specify a search query.'
+     raise Error.new('Please specify a search query.')
  end
  if ERB::Util.url_encode(query).length >= 500
-     raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+     raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
  end
  if start_date && end_date
    if start_date == end_date
-       raise 'Please specify different values for :start_date and :end_date.'
+       raise Error.new('Please specify different values for :start_date and :end_date.')
    elsif start_date > end_date
-       raise ':start_date must occur before :end_date.'
+       raise Error.new(':start_date must occur before :end_date.')
    end
  end
  if start_date
    if start_date < OLDEST_DATE
-       raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+       raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
    end
  end
  if end_date
    today = Date.today
    if end_date > Date.today
-       raise ":end_date must be less than or equal to today(#{today})"
+       raise Error.new(":end_date must be less than or equal to today(#{today})")
    end
  end
end

def build_queries(query, start_date, end_date)
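
Every validation failure now raises an instance of the gem's own Error class rather than a bare string, which Ruby would turn into a generic RuntimeError. The practical win is selective rescue: assuming Error is a StandardError subclass under the Twitterscraper namespace, as the raises above imply, a caller can trap the gem's failures without swallowing everything else. A hedged usage sketch (the Client class and require path are assumed from the gem's README):

    require 'twitterscraper' # require path assumed from the gem layout

    client = Twitterscraper::Client.new
    begin
      client.query_tweets('') # an empty query now raises Twitterscraper::Error
    rescue Twitterscraper::Error => e
      warn "invalid options: #{e.message}"
    end
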
@@ -154,64 +158,73 @@
  else
    [query]
  end
end

- def main_loop(query, lang, limit, headers, proxies)
+ def main_loop(query, lang, limit, daily_limit, headers, proxies)
  pos = nil
+   daily_tweets = []
  while true
    new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
    unless new_tweets.empty?
+       daily_tweets.concat(new_tweets)
+       daily_tweets.uniq! { |t| t.tweet_id }
+
      @mutex.synchronize {
        @all_tweets.concat(new_tweets)
        @all_tweets.uniq! { |t| t.tweet_id }
      }
    end
-     logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+     logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
    break unless new_pos
+     break if daily_limit && daily_tweets.size >= daily_limit
    break if @all_tweets.size >= limit
    pos = new_pos
  end
-   if @all_tweets.size >= limit
-     logger.info("Limit reached #{@all_tweets.size}")
+   if !@stop_requested && @all_tweets.size >= limit
+     logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
    @stop_requested = true
  end
end
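
main_loop now tracks two caps: the new `daily_limit` bounds what this one day-query collects (daily_tweets is local to the thread, so it needs no mutex), while `limit` still bounds the shared total in @all_tweets. A toy model of the loop's break order, with fabricated pages:

    pages = [%w[t1 t2], %w[t3 t4], %w[t5 t6]] # fabricated pages of tweet ids
    daily_tweets, all_tweets = [], []
    limit, daily_limit = 5, 3

    pages.each do |page|
      daily_tweets.concat(page).uniq!
      all_tweets.concat(page).uniq!
      break if daily_limit && daily_tweets.size >= daily_limit # this day is done
      break if all_tweets.size >= limit                        # the whole run is done
    end

    p daily_tweets # => ["t1", "t2", "t3", "t4"]

Note that both caps are checked only after a whole page has been appended, so the final count can overshoot either limit by up to one page.
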
def stop_requested?
  @stop_requested
end

- def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+ def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
  start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
  end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
  queries = build_queries(query, start_date, end_date)
  threads = queries.size if threads > queries.size
-   proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+   proxies = proxy ? Proxy::Pool.new : []
  validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
-   logger.info("The number of threads #{threads}")
+   logger.debug "Fetch #{proxies.size} proxies" if proxy
+   logger.info "The number of threads #{threads}"
  headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-   logger.info("Headers #{headers}")
+   logger.info "Headers #{headers}"
  @all_tweets = []
  @mutex = Mutex.new
  @stop_requested = false
  if threads > 1
+     Thread.abort_on_exception = true
+     logger.debug "Set 'Thread.abort_on_exception' to true"
+
    Parallel.each(queries, in_threads: threads) do |query|
-       main_loop(query, lang, limit, headers, proxies)
+       main_loop(query, lang, limit, daily_limit, headers, proxies)
      raise Parallel::Break if stop_requested?
    end
  else
    queries.each do |query|
-       main_loop(query, lang, limit, headers, proxies)
+       main_loop(query, lang, limit, daily_limit, headers, proxies)
      break if stop_requested?
    end
  end
  @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
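
Pulling the 0.12.0 changes together from the caller's side: query_tweets gains the `daily_limit:` keyword, `proxy: true` builds a Proxy::Pool, and multi-threaded runs now set Thread.abort_on_exception, so an unhandled exception in any worker brings the whole process down instead of dying silently. A hedged end-to-end sketch (Client and the require path are assumed from the gem's README; the option names come from the signature above):

    require 'twitterscraper' # require path assumed

    client = Twitterscraper::Client.new
    tweets = client.query_tweets(
      'ruby',
      start_date:  '2020-07-01',
      end_date:    '2020-07-10',
      lang:        'en',
      limit:       100,  # global cap across all day-queries
      daily_limit: 20,   # new in 0.12.0: per-day cap inside main_loop
      threads:     2,
      proxy:       true  # rotate through Proxy::Pool instead of direct requests
    )
    puts "collected #{tweets.size} tweets"
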