lib/twitterscraper/query.rb in twitterscraper-ruby-0.11.0 vs lib/twitterscraper/query.rb in twitterscraper-ruby-0.12.0

- removed (0.11.0)
+ added (0.12.0)

@@ -42,18 +42,22 @@
       end
     end
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
     def parse_single_page(text, html = true)
       return [nil, nil] if text.nil? || text == ''
@@ -69,31 +73,31 @@
       [items_html, json_resp]
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info("Querying #{query}")
+      logger.info "Querying #{query}"
 
       query = ERB::Util.url_encode(query)
       url = build_query_url(query, lang, pos, from_user)
       http_request = lambda do
-        logger.debug("Scraping tweets from #{url}")
+        logger.debug "Scraping tweets from #{url}"
        get_single_page(url, headers, proxies)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug('Fetching tweets from cache')
+          logger.debug 'Fetching tweets from cache'
         else
           response = http_request.call
-          client.write(url, response)
+          client.write(url, response) unless stop_requested?
         end
       else
         response = http_request.call
       end
-      return [], nil if response.nil?
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
       tweets = Tweet.from_html(html)
@@ -112,35 +116,35 @@
     OLDEST_DATE = Date.parse('2006-03-21')
 
     def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
 
     def build_queries(query, start_date, end_date)
@@ -154,64 +158,73 @@
       else
         [query]
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.info("Limit reached #{@all_tweets.size}")
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
 
     def stop_requested?
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
       threads = queries.size if threads > queries.size
-      proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+      proxies = proxy ? Proxy::Pool.new : []
 
       validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-      logger.info("The number of threads #{threads}")
+      logger.debug "Fetch #{proxies.size} proxies" if proxy
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info("Headers #{headers}")
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
          break if stop_requested?
        end
      end
 
      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
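
Usage note: the caller-visible change above is the new daily_limit: keyword on query_tweets. Below is a minimal sketch of how a caller might use it. The Twitterscraper::Client entry point and the require path are assumptions here, since this diff only covers the Query module internals; only the keyword arguments visible in the query_tweets signature are confirmed by the change.

    require 'twitterscraper-ruby' # require path assumed; not shown in this diff

    # Hypothetical caller of the Query API patched above.
    client = Twitterscraper::Client.new

    tweets = client.query_tweets(
      'ruby',
      start_date:  '2020-07-01', # parsed via Date.parse when given as a String
      end_date:    '2020-07-10',
      lang:        'en',
      limit:       200, # global cap, enforced across threads via @all_tweets
      daily_limit: 50,  # new in 0.12.0: per-day cap checked inside main_loop
      threads:     2,
      proxy:       true # builds a Proxy::Pool; get_single_page samples it per request
    )

Because main_loop breaks as soon as daily_tweets.size >= daily_limit, a single day's query stops early without setting @stop_requested, while the overall limit still shuts down every thread once the shared @all_tweets total is reached.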