lib/gsearch-parser.rb in gsearch-parser-0.2.4 vs lib/gsearch-parser.rb in gsearch-parser-0.3.0
- old
+ new
@@ -1,40 +1,63 @@
-require "gsearch-parser/version"
require 'open-uri'
require 'nokogiri'
+#
+# Module method definitions
+#
module GSearchParser
+ # Entry point for a web search: builds a GoogleWebSearch for query and returns it (the assignment below is the method's last expression, so its value is the return value)
def GSearchParser.webSearch(query)
- GoogleWebSearch.new(query)
+ webSearch = GoogleWebSearch.new(query)
end
end
-###################################################
-# #
-# GoogleWebSearch Class #
-# #
-###################################################
+#
+# Google Web Search class
+#
class GoogleWebSearch
- attr_accessor :results
-
+ attr_accessor :results, :currentPage
+ @index
+
# Class initializer: starts with an empty results list and loads the first page of results for query (query is interpolated into the search URL as-is, with no escaping)
def initialize(query)
- # Initialize array
+ # Initialize state: the accumulated results, and the position of the navigation link that nextResults will follow next
@results = Array.new
+ @index = 0
- # TODO: Format query
+ # Fetch the first results page, store it as the current page, and parse it into @results
+ updateResults("http://google.com/search?sourceid=chrome&q=#{query}")
+ end
- # Fetch page
- searchPage = Nokogiri::HTML(open("http://google.com/search?sourceid=chrome&q=#{query}",
- 'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.152 Safari/535.19'))
+ # Fetch url, remember the parsed page in @currentPage, and parse it; returns the value of parseCurrentPage (the results found on that page)
+ def updateResults(url)
+ # Fetch: download the page at url and parse its HTML
+ searchPage = fetchPage(url)
+ # Store: keep the page so nextResults can read its navigation links later
+ @currentPage = searchPage
+
+ # Parse: append this page's results to @results
+ parseCurrentPage
+ end
+
+ # Fetch the page at url, sending a Chrome User-Agent header, and return it parsed by Nokogiri::HTML (open handles URLs here through the open-uri require at the top of the file)
+ def fetchPage(url)
+ Nokogiri::HTML(open(url, 'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.152 Safari/535.19'))
+ end
+
+ # Parse @currentPage: append each extracted Result to @results and return only the results found on this page
+ def parseCurrentPage
+ # Results collected from this page only (merged into @results once the loop finishes)
+ currentResults = Array.new
+
# Iterate over each Google result list element (li.g)
- searchPage.css('li.g').each do |result|
+ @currentPage.css('li.g').each do |result|
# Extract the title: inner HTML of the first matching link (no nil check here, unlike the content below)
- title = result.css('h3 > a').first.inner_html
+ title = result.css('h3.r a').first.inner_html
# Extract the content. There is the possibility for
# the content to be nil, so check for this
content = result.css('span.st').first.nil? ? '' : result.css('span.st').first.inner_html
@@ -45,33 +68,45 @@
unless uri.index('www.youtube.com').nil?
next
end
# Create a new Result object and append to the array
- @results << Result.new(title, content, uri)
+ currentResults << Result.new(title, content, uri)
end
+ @results += currentResults
+ return currentResults
end
+ # Fetch the next results page and append its results to @results; returns the results parsed from that page (the value of updateResults)
+ def nextResults
+ # Read the href of the navigation link at position @index on the current page (there is no check that a link exists at that position)
+ nextPageUrl = @currentPage.css("table#nav tr td a")[@index]['href']
+
+ # Advance the index so the following call selects the next link position. NOTE(review): this assumes the navigation links keep the same order on every page; confirm against live markup
+ @index += 1
+
+ # The href is appended to the www.google.com host; fetch, store and parse that page
+ updateResults("http://www.google.com" + nextPageUrl)
+ end
+
# Iterate over every Result accumulated so far, passing each one to the given block (delegates to @results.each)
def each(&blk)
@results.each(&blk)
end
- ###################################################
- # #
- # Result Class #
- # #
- ###################################################
- class Result
- attr_accessor :title, :content, :uri
+end # GoogleWebSearch
- # Class initializer
- def initialize(title, content, uri)
- @title = title
- @content = content
- @uri = uri
- end
+#
+# Result class: one search hit, holding the title, content and uri that GoogleWebSearch extracted for it
+#
+class Result
+ attr_accessor :title, :content, :uri
- end # Result
+ # Class initializer: stores the three given values unchanged
+ def initialize(title, content, uri)
+ @title = title
+ @content = content
+ @uri = uri
+ end
-end # GoogleSearch
+end # Result