lib/gscraper/search/query.rb in gscraper-0.1.7 vs lib/gscraper/search/query.rb in gscraper-0.2.0
- old
+ new
@@ -1,33 +1,42 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
require 'gscraper/search/result'
require 'gscraper/search/page'
require 'gscraper/sponsored_ad'
require 'gscraper/sponsored_links'
require 'gscraper/extensions/uri'
+require 'gscraper/has_pages'
require 'gscraper/licenses'
-require 'gscraper/web_agent'
+require 'gscraper/gscraper'
require 'hpricot'
module GScraper
module Search
class Query
- include WebAgent
-
- # Search host
- SEARCH_HOST = 'www.google.com'
-
- # Search URL
- SEARCH_URL = "http://#{SEARCH_HOST}/search"
-
- # Default results per-page
- RESULTS_PER_PAGE = 10
-
- # Results per-page
- attr_accessor :results_per_page
-
# Search query
attr_accessor :query
# Search 'link' modifier
attr_accessor :link
@@ -69,71 +78,18 @@
attr_accessor :with_words
# Search for results with-out the words
attr_accessor :without_words
- # Search for results written in the language
- attr_accessor :language
-
- # Search for results from the region
- attr_accessor :region
-
- # Search for results in the format
- attr_accessor :in_format
-
- # Search for results not in the format
- attr_accessor :not_in_format
-
- # Search for results within the past day
- attr_accessor :within_past_day
-
- # Search for results within the past week
- attr_accessor :within_past_week
-
- # Search for results within the past months
- attr_accessor :within_past_months
-
- # Search for results within the past year
- attr_accessor :within_past_year
-
# Search for results containing numbers between the range
attr_accessor :numeric_range
- # Search for results where the query ocurrs within the area
- attr_accessor :occurrs_within
-
- # Search for results inside the domain
- attr_accessor :inside_domain
-
- # Search for results outside the domain
- attr_accessor :outside_domain
-
- # Search for results which have the rights
- attr_accessor :rights
-
- # Filter the search results
- attr_accessor :filtered
-
- # Search for results similar to the page
- attr_accessor :similar_to
-
- # Search for results linking to the page
- attr_accessor :links_to
-
#
# Creates a new Query object from the given search options. If a
- # block is given, it will be passed the newly created query object.
+ # block is given, it will be passed the newly created Query object.
#
- # Query.new(:query => 'ruby', :with_words => 'sow rspec')
- #
- # Query.new(:exact_phrase => 'fluent interfaces') do |q|
- # q.within_past_week = true
- # end
- #
def initialize(options={},&block)
- @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
-
@query = options[:query]
@link = options[:link]
@related = options[:related]
@info = options[:info]
@@ -149,435 +105,68 @@
@exact_phrase = options[:exact_phrase]
@with_words = options[:with_words]
@without_words = options[:without_words]
- @language = options[:language]
- @region = options[:region]
- @in_format = options[:in_format]
- @not_in_format = options[:not_in_format]
-
- if options[:within_past_day]
- @within_past_day = options[:within_past_day]
- @within_past_week = false
- @within_past_months = false
- @within_past_year = false
- elsif options[:within_past_week]
- @within_past_day = false
- @within_past_week = options[:within_past_week]
- @within_past_months = false
- @within_past_year = false
- elsif options[:within_past_months]
- @within_past_day = false
- @within_past_week = false
- @within_past_months = options[:within_past_months]
- @within_past_year = false
- elsif options[:within_past_year]
- @within_past_day = false
- @within_past_week = false
- @within_past_months = false
- @within_past_year = options[:within_past_year]
- else
- @within_past_day = false
- @within_past_week = false
- @within_past_months = false
- @within_past_year = false
- end
-
@numeric_range = options[:numeric_range]
- @occurrs_within = options[:occurrs_within]
- @inside_domain = options[:inside_domain]
- @outside_domain = options[:outside_domain]
- @rights = options[:rights]
- @filtered = options[:filtered]
- @similar_to = options[:similar_to]
- @links_to = options[:links_to]
-
block.call(self) if block
end
#
- # Creates a new Query object from the specified URL. If a block is
- # given, it will be passed the newly created Query object.
+ # Returns the query expression.
#
- # Query.from_url('http://www.google.com/search?q=ruby+zen)
- #
- # Query.from_url('http://www.google.com/search?q=ruby') do |q|
- # q.within_last_month = true
- # q.occurrs_within = :title
- # end
- #
- def self.from_url(url,options={},&block)
- url = URI.parse(url)
+ def expression
+ expr = []
- options[:results_per_page] = url.query_params['num']
-
- options[:query] = url.query_params['as_q']
- options[:exact_phrase] = url.query_params['as_epq']
- options[:with_words] = url.query_params['as_oq']
- options[:without_words] = url.query_params['as_eq']
-
- options[:language] = url.query_params['lr']
- options[:region] = url.query_params['cr']
-
- case url.query_params['as_ft']
- when 'i'
- options[:in_format] = url.query_params['as_filetype']
- when 'e'
- options[:not_in_format] = url.query_params['as_filetype']
- end
-
- case url.query_params['as_qdr']
- when 'd'
- options[:within_past_day] = true
- when 'w'
- options[:within_past_week] = true
- when 'm'
- options[:within_past_months] = 1
- when 'm2'
- options[:within_past_months] = 2
- when 'm3'
- options[:within_past_months] = 3
- when 'm6'
- options[:within_past_months] = 6
- when 'y'
- options[:within_past_year] = true
- end
-
- if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
- options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
- end
-
- case url.query_params['as_occt']
- when 'title'
- options[:occurrs_within] = :title
- when 'body'
- options[:occurrs_within] = :body
- when 'url'
- options[:occurrs_within] = :url
- when 'links'
- options[:occurrs_within] = :links
- end
-
- case url.query_params['as_dt']
- when 'i'
- options[:inside_domain] = url.query_params['as_sitesearch']
- when 'e'
- options[:outside_domain] = url.query_params['as_sitesearch']
- end
-
- case url.query_params['as_rights']
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
- options[:rights] = Licenses::CC_BY_NC_ND
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
- options[:rights] = Licenses::CC_BY_SA
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
- options[:rights] = Licenses::CC_BY_NC
- when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
- options[:rights] = Licenses::CC_BY
- end
-
- if url.query_params[:safe]=='active'
- options[:filtered] = true
- end
-
- if url.query_params['as_rq']
- options[:similar_to] = url.query_params['as_rq']
- elsif url.query_params['as_lq']
- options[:links_to] = url.query_params['as_lq']
- end
-
- return self.new(options,&block)
- end
-
- #
- # Returns the URL that represents the query.
- #
- def search_url
- url = URI(SEARCH_URL)
- query_expr = []
-
- set_param = lambda { |param,value|
- url.query_params[param.to_s] = value if value
- }
-
append_modifier = lambda { |name|
modifier = instance_variable_get("@#{name}")
- query_expr << "#{name}:#{modifier}" if modifier
+ expr << "#{name}:#{modifier}" if modifier
}
- join_ops = lambda { |name|
+ append_options = lambda { |name|
ops = instance_variable_get("@#{name}")
if ops.kind_of?(Array)
- query_expr << "#{name}:#{ops.join(' ')}"
+ expr << "#{name}:#{ops.join(' ')}"
elsif ops
- query_expr << "#{name}:#{ops}"
+ expr << "#{name}:#{ops}"
end
}
- set_param.call('num',@results_per_page)
+ expr << @query if @query
- query_expr << @query if @query
-
append_modifier.call(:link)
append_modifier.call(:related)
append_modifier.call(:info)
append_modifier.call(:site)
append_modifier.call(:filetype)
- join_ops.call(:allintitle)
+ append_options.call(:allintitle)
append_modifier.call(:intitle)
- join_ops.call(:allinurl)
+ append_options.call(:allinurl)
append_modifier.call(:inurl)
- join_ops.call(:allintext)
+ append_options.call(:allintext)
append_modifier.call(:intext)
- unless query_expr.empty?
- url.query_params['as_q'] = query_expr.join(' ')
+ if @exact_phrase
+ expr << "\"#{@exact_phrase}\""
end
- set_param.call('as_epq',@exact_phrase)
- set_param.call('as_oq',@with_words)
- set_param.call('as_eq',@without_words)
-
- set_param.call('lr',@language)
- set_param.call('cr',@region)
-
- if @in_format
- url.query_params['as_ft'] = 'i'
- url.query_params['as_filtetype'] = @in_format
- elsif @not_in_format
- url.query_params['as_ft'] = 'e'
- url.query_params['as_filtetype'] = @not_in_format
+ if @with_words.kind_of?(Array)
+ expr << @with_words.join(' OR ')
end
-
- if @within_past_day
- url.query_params['as_qdr'] = 'd'
- elsif @within_past_week
- url.query_params['as_qdr'] = 'w'
- elsif @within_past_months
- case @within_past_months
- when 1
- url.query_params['as_qdr'] = 'm'
- when 2
- url.query_params['as_qdr'] = 'm2'
- when 3
- url.query_params['as_qdr'] = 'm3'
- when 6
- url.query_params['as_qdr'] = 'm6'
- end
- elsif @within_past_year
- url.query_params['as_qdr'] = 'y'
+
+ if @without_words.kind_of?(Array)
+ expr << @without_words.map { |word| "-#{word}" }.join(' ')
end
- if @numeric_range
- url.query_params['as_nlo'] = @numeric_range.begin
- url.query_params['as_nhi'] = @numeric_range.end
+ if @numeric_range.kind_of?(Range)
+ expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
end
- case @occurrs_within
- when :title, 'title'
- url.query_params['as_occt'] = 'title'
- when :body, 'body'
- url.query_params['as_occt'] = 'body'
- when :url, 'url'
- url.query_params['as_occt'] = 'url'
- when :links, 'links'
- url.query_params['as_occt'] = 'links'
- end
-
- if @inside_domain
- url.query_params['as_dt'] = 'i'
- url.query_params['as_sitesearch'] = @inside_domain
- elsif @outside_domain
- url.query_params['as_dt'] = 'e'
- url.query_params['as_sitesearch'] = @outside_domain
- end
-
- case @rights
- when Licenses::CC_BY_NC_ND
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
- when Licenses::CC_BY_SA
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
- when Licenses::CC_BY_ND
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
- when Licenses::CC_BY
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
- end
-
- url.query_params['safe'] = true if @filtered
-
- if @similar_to
- url.query_params['as_rq'] = @similar_to
- elsif @links_to
- url.query_params['as_lq'] = @links_to
- end
-
- return url
- end
-
- #
- # Returns the URL that represents the query at the specific
- # _page_index_.
- #
- def page_url(page_index)
- url = search_url
-
- url.query_params['start'] = page_result_offset(page_index)
- url.query_params['sa'] = 'N'
-
- return url
- end
-
- #
- # Returns a Page object containing Result objects at the specified
- # _page_index_. If a _block_ is given, it will be passed the newly
- # created Page.
- #
- def page(page_index,&block)
- doc = get_page(page_url(page_index))
-
- new_page = Page.new
- results = doc.search('//div.g')[0...@results_per_page.to_i]
-
- results.each_with_index do |result,index|
- rank = page_result_offset(page_index) + (index + 1)
- link = result.at('//a.l')
- title = link.inner_text
- url = link.get_attribute('href')
- summary_text = ''
- cached_url = nil
- similar_url = nil
-
- if (content = (result.at('//td.j//font|//td.j/div.sml')))
- content.children.each do |elem|
- break if (!(elem.text?) && elem.name=='br')
-
- summary_text << elem.inner_text
- end
-
- if (cached_link = result.at('nobr/a:first'))
- cached_url = cached_link.get_attribute('href')
- end
-
- if (similar_link = result.at('nobr/a:last'))
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
- end
- end
-
- new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
- end
-
- block.call(new_page) if block
- return new_page
- end
-
- #
- # Returns the Results on the first page. If a _block_ is given it
- # will be passed the newly created Page.
- #
- def first_page(&block)
- page(1,&block)
- end
-
- #
- # Returns the Result at the specified _index_.
- #
- def result_at(index)
- page(result_page_index(index))[page_result_index(index)]
- end
-
- #
- # Returns the first Result on the first_page.
- #
- def top_result
- result_at(1)
- end
-
- #
- # Iterates over the results at the specified _page_index_, passing
- # each to the given _block_.
- #
- # query.each_on_page(2) do |result|
- # puts result.title
- # end
- #
- def each_on_page(page_index,&block)
- page(page_index).each(&block)
- end
-
- #
- # Iterates over the results on the first page, passing each to the
- # given _block_.
- #
- # query.each_on_first_page do |result|
- # puts result.url
- # end
- #
- def each_on_first_page(&block)
- each_on_page(1,&block)
- end
-
- #
- # Returns a SponsoredLinks object containing SponsoredAd objects of
- # the query. If a _block_ is given, it will be passed the newly
- # created Page.
- #
- def sponsored_links(&block)
- doc = get_page(search_url)
- new_links = SponsoredLinks.new
-
- # top and side ads
- doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
- title = link.inner_text
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
-
- new_links << SponsoredAd.new(title,url)
- end
-
- block.call(new_links) if block
- return new_links
- end
-
- #
- # Returns the first sponsored link on the first page of results.
- #
- def top_sponsored_link
- top_sponsored_links.first
- end
-
- #
- # Iterates over the sponsored links on the first page of
- # results passing each to the specified _block_.
- #
- def each_sponsored_link(&block)
- sponsored_links.each(&block)
- end
-
- protected
-
- #
- # Returns the rank offset for the specified _page_index_.
- #
- def page_result_offset(page_index)
- (page_index.to_i - 1) * @results_per_page.to_i
- end
-
- #
- # Returns the in-Page index of the _result_index_.
- #
- def page_result_index(result_index)
- (result_index.to_i - 1) % @results_per_page.to_i
- end
-
- #
- # Returns the page index for the specified _result_index_
- #
- def result_page_index(result_index)
- ((result_index.to_i - 1) / @results_per_page.to_i) + 1
+ return expr.join(' ')
end
end
end
end