#--
# Copyright: Copyright (c) 2010-2011 RightScale, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# 'Software'), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++
require 'fileutils'
require 'tmpdir'
require File.expand_path(File.join(File.dirname(__FILE__), 'logger'))
module RightScraper
# Library main entry point. Instantiate this class and call the scrape
# method to download or update a remote repository to the local disk and
# run a scraper on the resulting files.
class Scraper
  # (Array):: Resources accumulated across all scrape calls on this instance
  attr_reader :resources

  # Initialize scrape destination directory
  #
  # === Options
  # :kind:: Type of scraper that will traverse directory for resources, one of :cookbook or :workflow
  # :basedir:: Local directory where files are retrieved and scraped, use temporary directory if nil
  # :max_bytes:: Maximum number of bytes to read from remote repo, unlimited if nil
  # :max_seconds:: Maximum number of seconds to spend reading from remote repo, unlimited if nil
  def initialize(options={})
    # Remember whether we own the scrape directory so scrape can delete it
    # after a successful run.
    @temporary = !options.has_key?(:basedir)
    # Work on a copy so the defaults applied below do not mutate the
    # caller's hash (the original wrote :basedir back into the argument).
    options = options.dup
    options[:basedir] ||= Dir.mktmpdir
    @logger = ScraperLogger.new
    @options = options.merge({:logger => @logger})
    @resources = []
  end

  # Scrape given repository, depositing files into the scrape
  # directory. Update content of unique directory incrementally
  # when possible with further calls.
  #
  # === Parameters
  # repo(Hash|RightScraper::Repositories::Base):: Repository to be scraped
  #   Note: repo can either be a Hash or a RightScraper::Repositories::Base instance.
  #   See the RightScraper::Repositories::Base class for valid Hash keys.
  # incremental(Boolean):: NOTE(review): not referenced in this method's
  #   visible body; presumably retrievers decide incrementality on their
  #   own — confirm before removing. Kept for interface compatibility.
  #
  # === Block
  # If a block is given, it will be called back with progress information
  # the block should take four arguments:
  # - first argument is one of :begin, :commit, :abort which signifies what
  #   the scraper is trying to do and where it is when it does it
  # - second argument is a symbol describing the operation being performed
  #   in an easy-to-match way
  # - third argument is optional further explanation
  # - fourth argument is the exception pending (only relevant for :abort)
  #
  # === Return
  # true:: If scrape was successful
  # false:: If scrape failed, call errors for information on failure
  #
  # === Raise
  # 'Invalid repository type':: If repository type is not known
  def scrape(repo, incremental=true, &callback)
    # Snapshot the error count so success is "this call added no errors",
    # even when previous calls on this instance already failed.
    errorlen = errors.size
    repo = RightScraper::Repositories::Base.from_hash(repo) if repo.is_a?(Hash)
    @logger.callback = callback
    begin
      # 1. Retrieve the files
      retriever = nil
      @logger.operation(:retrieving, "from #{repo}") do
        retriever = repo.retriever(@options)
        retriever.retrieve if retriever.available?
      end

      # 2. Now scrape if there is a scraper in the options
      @logger.operation(:scraping, retriever.repo_dir) do
        if @options[:kind]
          options = @options.merge({:ignorable_paths => retriever.ignorable_paths,
                                    :repo_dir        => retriever.repo_dir,
                                    :repository      => retriever.repository})
          scraper = RightScraper::Scrapers::Base.scraper(options)
          @resources += scraper.scrape
        end
      end

      # 3. Cleanup if temporary. Deliberately inside the begin block: on
      # failure the directory is left behind so the partial download can
      # be inspected.
      FileUtils.remove_entry_secure(@options[:basedir]) if @temporary
    rescue
      # logger handles communication with the end user and appending
      # to our error list, we just need to keep going.
    end
    @logger.callback = nil
    errors.size == errorlen
  end

  # Path to directory where given repo should be or was downloaded
  #
  # === Parameters
  # repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
  #
  # === Return
  # String:: Path to local directory that corresponds to given repository
  def repo_dir(repo)
    RightScraper::Retrievers::Base.repo_dir(@options[:basedir], repo)
  end

  # (Array):: Error messages accumulated by the logger, empty if no failure
  def errors
    @logger.errors
  end

  # Was scraping successful?
  # Call errors to get error messages if false
  #
  # === Return
  # Boolean:: true if scrape finished with no error, false otherwise.
  def succeeded?
    errors.empty?
  end
  alias_method :successful?, :succeeded?
end
end