require 'gscraper/search/result' require 'gscraper/search/page' require 'gscraper/extensions/uri' require 'gscraper/licenses' require 'gscraper/gscraper' require 'hpricot' module GScraper module Search class Query SEARCH_URL = 'http://www.google.com/search' RESULTS_PER_PAGE = 10 # Results per-page attr_accessor :results_per_page # Search query attr_accessor :query # Search for results containing the exact phrase attr_accessor :exact_phrase # Search for results with the words attr_accessor :with_words # Search for results with-out the words attr_accessor :without_words # Search for results written in the language attr_accessor :language # Search for results from the region attr_accessor :region # Search for results in the format attr_accessor :in_format # Search for results not in the format attr_accessor :not_in_format # Search for results within the past day attr_accessor :within_past_day # Search for results within the past week attr_accessor :within_past_week # Search for results within the past months attr_accessor :within_past_months # Search for results within the past year attr_accessor :within_past_year # Search for results containing numbers between the range attr_accessor :numeric_range # Search for results where the query ocurrs within the area attr_accessor :occurrs_within # Search for results inside the domain attr_accessor :inside_domain # Search for results outside the domain attr_accessor :outside_domain # Search for results which have the rights attr_accessor :rights # Filter the search results attr_accessor :filtered # Search for results similar to the page attr_accessor :similar_to # Search for results linking to the page attr_accessor :links_to # # Creates a new Query object from the given search options. If a # block is given, it will be passed the newly created query object. # # Query.new(:query => 'ruby', :with_words => 'sow rspec') # # Query.new(:exact_phrase => 'fluent interfaces') do |q| # q.within_past_week = true # end # def initialize(opts={},&block) super() @results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE @query = opts[:query] @exact_phrase = opts[:exact_phrase] @with_words = opts[:with_words] @without_words = opts[:without_words] @language = opts[:language] @region = opts[:region] @in_format = opts[:in_format] @not_in_format = opts[:not_in_format] if opts[:within_past_day] @within_past_day = opts[:within_past_day] elsif opts[:within_past_week] @within_past_week = opts[:within_past_week] elsif opts[:within_past_months] @within_past_months = opts[:within_past_months] elsif opts[:within_past_year] @within_past_year = opts[:within_past_year] end @numeric_range = opts[:numeric_range] @occurrs_within = opts[:occurrs_within] @inside_domain = opts[:inside_domain] @outside_domain = opts[:outside_domain] @rights = opts[:rights] @filtered = opts[:filtered] @similar_to = opts[:similar_to] @links_to = opts[:links_to] block.call(self) if block end # # Creates a new Query object from the specified URL. If a block is # given, it will be passed the newly created Query object. # # Query.from_url('http://www.google.com/search?q=ruby+zen) # # Query.from_url('http://www.google.com/search?q=ruby') do |q| # q.within_last_month = true # q.occurrs_within = :title # end # def self.from_url(url,&block) url = URI.parse(url) opts = {} opts[:results_per_page] = url.query_params['num'] opts[:query] = url.query_params['as_q'] opts[:exact_phrase] = url.query_params['as_epq'] opts[:with_words] = url.query_params['as_oq'] opts[:without_words] = url.query_params['as_eq'] opts[:language] = url.query_params['lr'] opts[:region] = url.query_params['cr'] case url.query_params['as_ft'] when 'i' opts[:in_format] = url.query_params['as_filetype'] when 'e' opts[:not_in_format] = url.query_params['as_filetype'] end case url.query_params['as_qdr'] when 'd' opts[:within_past_day] = true when 'w' opts[:within_past_week] = true when 'm' opts[:within_past_months] = 1 when 'm2' opts[:within_past_months] = 2 when 'm3' opts[:within_past_months] = 3 when 'm6' opts[:within_past_months] = 6 when 'y' opts[:within_past_year] = true end if (url.query_params['as_nlo'] || url.query_params['as_nhi']) opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i) end case url.query_params['as_occt'] when 'title' opts[:occurrs_within] = :title when 'body' opts[:occurrs_within] = :body when 'url' opts[:occurrs_within] = :url when 'links' opts[:occurrs_within] = :links end case url.query_params['as_dt'] when 'i' opts[:inside_domain] = url.query_params['as_sitesearch'] when 'e' opts[:outside_domain] = url.query_params['as_sitesearch'] end case url.query_params['as_rights'] when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' opts[:rights] = Licenses::CC_BY_NC_ND when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' opts[:rights] = Licenses::CC_BY_SA when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' opts[:rights] = Licenses::CC_BY_NC when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' opts[:rights] = Licenses::CC_BY end if url.query_params[:safe]=='active' opts[:filtered] = true end if url.query_params['as_rq'] opts[:similar_to] = url.query_params['as_rq'] elsif url.query_params['as_lq'] opts[:links_to] = url.query_params['as_lq'] end return self.new(opts,&block) end # # Returns the URL that represents the query. # def search_url url = URI.parse(SEARCH_URL) if @results_per_page url.query_params['num'] = @results_per_page end url.query_params['as_q'] = @query if @query url.query_params['as_epq'] = @exact_phrase if @exact_phrase url.query_params['as_oq'] = @with_words if @with_words url.query_params['as_eq'] = @without_words if @without_words url.query_params['lr'] = @language if @language url.query_params['cr'] = @region if @region if @in_format url.query_params['as_ft'] = 'i' url.query_params['as_filtetype'] = @in_format elsif @not_in_format url.query_params['as_ft'] = 'e' url.query_params['as_filtetype'] = @not_in_format end if @within_past_day url.query_params['as_qdr'] = 'd' elsif @within_past_week url.query_params['as_qdr'] = 'w' elsif @within_past_months case @within_past_months when 1 url.query_params['as_qdr'] = 'm' when 2 url.query_params['as_qdr'] = 'm2' when 3 url.query_params['as_qdr'] = 'm3' when 6 url.query_params['as_qdr'] = 'm6' end elsif @within_past_year url.query_params['as_qdr'] = 'y' end if @numeric_range url.query_params['as_nlo'] = @numeric_range.begin url.query_params['as_nhi'] = @numeric_range.end end case @occurrs_within when :title, 'title' url.query_params['as_occt'] = 'title' when :body, 'body' url.query_params['as_occt'] = 'body' when :url, 'url' url.query_params['as_occt'] = 'url' when :links, 'links' url.query_params['as_occt'] = 'links' end if @inside_domain url.query_params['as_dt'] = 'i' url.query_params['as_sitesearch'] = @inside_domain elsif @outside_domain url.query_params['as_dt'] = 'e' url.query_params['as_sitesearch'] = @outside_domain end case @rights when Licenses::CC_BY_NC_ND url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' when Licenses::CC_BY_SA url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' when Licenses::CC_BY_ND url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' when Licenses::CC_BY url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' end url.query_params['safe'] = true if @filtered if @similar_to url.query_params['as_rq'] = @similar_to elsif @links_to url.query_params['as_lq'] = @links_to end return url end # # Returns the URL that represents the query at the specific # _page_index_. # def page_url(page_index) url = search_url url.query_params['start'] = page_index_offset(page_index) url.query_params['sa'] = 'N' return url end # # Returns a Page object containing Result objects at the specified # _page_index_. If _opts_ are given, they will be used in accessing # the SEARCH_URL. # def page(page_index,opts={}) doc = Hpricot(GScraper.open(page_url(page_index),opts)) new_page = Page.new doc.search('//div.g').each_with_index do |result,index| rank = page_index_offset(page_index) + (index + 1) title = result.search('//h2.r').first.inner_text url = result.search('//h2.r/a').first.get_attribute('href') # TODO: exclude URL and Links from summary text summary = result.search('//td.j').first.inner_text # TODO: scrape Cached and Similar links new_page << Result.new(rank,title,url,summary) end return new_page end # # Returns the results on the first page. If _opts_ are given, they # will be used in accessing the SEARCH_URL. # def first_page(opts={}) page(1,opts) end # # Iterates over the results at the specified _page_index_, passing # each to the given _block_. If _opts_ are given they will be used # in accessing the SEARCH_URL. # # query.each_on_page(2) do |result| # puts result.title # end # def each_on_page(page_index,opts={},&block) page(page_index,opts).each(&block) end # # Iterates over the results on the first page, passing # each to the given _block_. If _opts_ are given, they will be used # in accessing the SEARCH_URL. # # query.each_on_first_page do |result| # puts result.url # end # def each_on_first_page(opts={},&block) each_on_page(1,opts,&block) end protected # # Returns the rank offset for the specified _page_index_. # def page_index_offset(page_index) (page_index.to_i - 1) * @result_per_page.to_i end end end end