#--
# Copyright: Copyright (c) 2010-2011 RightScale, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# 'Software'), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++

# Standard library pieces used below (Dir.mktmpdir and FileUtils).
require 'fileutils'
require 'tmpdir'

require File.expand_path(File.join(File.dirname(__FILE__), 'logger'))

module RightScraper

  # Library main entry point. Instantiate this class and call the scrape
  # method to download or update a remote repository to the local disk and
  # run a scraper on the resulting files.
  class Scraper

    # (Array):: Scraped resources
    attr_reader :resources

    # Initialize scrape destination directory
    #
    # The given options hash is not modified; an internal copy is kept.
    #
    # === Options
    # :kind:: Type of scraper that will traverse directory for resources, one of :cookbook or :workflow
    # :basedir:: Local directory where files are retrieved and scraped, use temporary directory if nil
    # :max_bytes:: Maximum number of bytes to read from remote repo, unlimited if nil
    # :max_seconds:: Maximum number of seconds to spend reading from remote repo, unlimited if nil
    def initialize(options={})
      basedir = options[:basedir]
      # A missing *or* nil :basedir both mean "use a temporary directory",
      # and a temporary directory must be flagged for removal after scraping.
      @temporary = !basedir
      @logger = ScraperLogger.new
      # Build our own hash rather than writing :basedir into the caller's.
      @options = options.merge({:basedir => basedir || Dir.mktmpdir,
                                :logger  => @logger})
      @resources = []
    end

    # Scrape given repository, depositing files into the scrape
    # directory. Update content of unique directory incrementally
    # when possible with further calls.
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Repository to be scraped
    #                                               Note: repo can either be a Hash or a RightScraper::Repositories::Base instance.
    #                                               See the RightScraper::Repositories::Base class for valid Hash keys.
    # incremental(Boolean):: Currently not used by this method
    #
    # === Block
    # If a block is given, it will be called back with progress information
    # the block should take four arguments:
    # - first argument is one of :begin, :commit, :abort which signifies what
    #   the scraper is trying to do and where it is when it does it
    # - second argument is a symbol describing the operation being performed
    #   in an easy-to-match way
    # - third argument is optional further explanation
    # - fourth argument is the exception pending (only relevant for :abort)
    #
    # === Return
    # true:: If scrape was successful
    # false:: If scrape failed, call errors for information on failure
    #
    # === Raise
    # 'Invalid repository type':: If repository type is not known
    def scrape(repo, incremental=true, &callback)
      errorlen = errors.size
      repo = RightScraper::Repositories::Base.from_hash(repo) if repo.is_a?(Hash)
      @logger.callback = callback
      begin
        # 1. Retrieve the files
        retriever = nil
        @logger.operation(:retrieving, "from #{repo}") do
          retriever = repo.retriever(@options)
          retriever.retrieve if retriever.available?
        end

        # 2. Now scrape if there is a scraper in the options
        @logger.operation(:scraping, retriever.repo_dir) do
          if @options[:kind]
            options = @options.merge({:ignorable_paths => retriever.ignorable_paths,
                                      :repo_dir        => retriever.repo_dir,
                                      :repository      => retriever.repository})
            scraper = RightScraper::Scrapers::Base.scraper(options)
            @resources += scraper.scrape
          end
        end
      rescue StandardError
        # logger handles communication with the end user and appending
        # to our error list, we just need to keep going.
      ensure
        # Always detach the caller's callback, even when an exception that
        # is not rescued above propagates out of this method.
        @logger.callback = nil
        # 3. Cleanup if temporary. Done here so that a failed retrieval or
        # scrape does not leave the temporary directory behind.
        remove_temporary_basedir
      end
      # Success means no new error was recorded during this call.
      errors.size == errorlen
    end

    # Path to directory where given repo should be or was downloaded
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
    #
    # === Return
    # String:: Path to local directory that corresponds to given repository
    def repo_dir(repo)
      RightScraper::Retrievers::Base.repo_dir(@options[:basedir], repo)
    end

    # (Array):: Error messages in case of failure
    def errors
      @logger.errors
    end

    # Was scraping successful?
    # Call errors to get error messages if false
    #
    # === Return
    # Boolean:: true if scrape finished with no error, false otherwise.
    def succeeded?
      errors.empty?
    end
    alias_method :successful?, :succeeded?

    private

    # Remove the base directory when it is a temporary one created by this
    # instance. Best effort: a failure to remove it is deliberately ignored
    # and does not affect the outcome of the scrape.
    #
    # === Return
    # Unspecified; callers should not rely on the return value.
    def remove_temporary_basedir
      FileUtils.remove_entry_secure(@options[:basedir]) if @temporary
    rescue StandardError
      # Ignored on purpose, see method comment.
    end

  end
end