lib/twitterscraper/query.rb in twitterscraper-ruby-0.3.0 vs lib/twitterscraper/query.rb in twitterscraper-ruby-0.4.0
- old (twitterscraper-ruby-0.3.0)
+ new (twitterscraper-ruby-0.4.0)
@@ -23,11 +23,11 @@
INIT_URL_USER = 'https://twitter.com/{u}'
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/{u}/timeline/tweets?' +
'include_available_features=1&include_entities=1&' +
'max_position={pos}&reset_error_state=false'
- def get_query_url(query, lang, pos, from_user = false)
+ def build_query_url(query, lang, pos, from_user = false)
# if from_user
# if !pos
# INIT_URL_USER.format(u = query)
# else
# RELOAD_URL_USER.format(u = query, pos = pos)
@@ -38,56 +38,49 @@
else
INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
end
end
- def query_single_page(query, lang, pos, retries = 30, from_user = false, timeout = 3, headers: [], proxies: [])
+ def get_single_page(url, headers, proxies, timeout = 10, retries = 30)
+ Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+ rescue => e
+ logger.debug "query_single_page: #{e.inspect}"
+ if (retries -= 1) > 0
+ logger.info("Retrying... (Attempts left: #{retries - 1})")
+ retry
+ else
+ raise
+ end
+ end
+
+ def parse_single_page(text, html = true)
+ if html
+ json_resp = nil
+ items_html = text
+ else
+ json_resp = JSON.parse(text)
+ items_html = json_resp['items_html'] || ''
+ logger.debug json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
+ end
+
+ [items_html, json_resp]
+ end
+
+ def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
logger.info("Querying #{query}")
- url = get_query_url(query, lang, pos, from_user)
+ url = build_query_url(query, lang, pos, from_user)
logger.debug("Scraping tweets from #{url}")
-    response = nil
-    begin
-      proxy = proxies.sample
-      logger.info("Using proxy #{proxy}")
-      response = Twitterscraper::Http.get(url, headers, proxy, timeout)
-    rescue => e
-      logger.debug "query_single_page: #{e.inspect}"
-      if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
-        retry
-      else
-        raise
-      end
-    end
-
-    html = ''
-    json_resp = nil
-
-    if pos
-      begin
-        json_resp = JSON.parse(response)
-        html = json_resp['items_html'] || ''
-      rescue => e
-        logger.warn("Failed to parse JSON #{e.inspect} while requesting #{url}")
-      end
-    else
-      html = response || ''
-    end
-
+    response = get_single_page(url, headers, proxies)
+    html, json_resp = parse_single_page(response, pos.nil?)
tweets = Tweet.from_html(html)
if tweets.empty?
- if json_resp && json_resp['has_more_items']
- pos = json_resp['min_position']
- else
- pos = nil
- end
- return [], pos
+ return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
end
if json_resp
[tweets, json_resp['min_position']]
elsif from_user
@@ -101,20 +94,19 @@
start_date = start_date ? Date.parse(start_date) : Date.parse('2006-3-21')
end_date = end_date ? Date.parse(end_date) : Date.today
if start_date == end_date
raise 'Please specify different values for :start_date and :end_date.'
elsif start_date > end_date
- raise 'The :start_date must occur before :end_date.'
+ raise ':start_date must occur before :end_date.'
end
# TODO parallel
pos = nil
all_tweets = []
proxies = Twitterscraper::Proxy.get_proxies
- logger.info "Using #{proxies.size} proxies"
headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
logger.info("Headers #{headers}")
start_date.upto(end_date) do |date|
@@ -122,24 +114,25 @@
queries = query + " since:#{date} until:#{date + 1}"
while true
new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
- logger.info("Got #{new_tweets.size} tweets")
- logger.debug("new_pos=#{new_pos}")
-
unless new_tweets.empty?
all_tweets.concat(new_tweets)
all_tweets.uniq! { |t| t.tweet_id }
end
+ logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
break unless new_pos
break if all_tweets.size >= limit
pos = new_pos
end
- break if all_tweets.size >= limit
+ if all_tweets.size >= limit
+ logger.info("Reached limit #{all_tweets.size}")
+ break
+ end
end
all_tweets
end
end