#--
# Copyright: Copyright (c) 2010-2016 RightScale, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# 'Software'), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++

# ancestor
require 'right_scraper'
require 'right_support'
require 'fileutils'
require 'tmpdir' # provides Dir.mktmpdir used for the default base directory

module RightScraper

  # Library main entry point. Instantiate this class and call the scrape
  # method to download or update a remote repository to the local disk and
  # run a scraper on the resulting files.
  #
  # Note that this class was known as Scraper in v1-3 but the name was confusing
  # due to the Scrapers module performing only a subset of the main Scraper
  # class functionality.
  class Main

    # (Array):: Scraped resources
    attr_reader :resources

    # Initialize scrape destination directory
    #
    # === Options
    # :kind:: Type of scraper that will traverse directory for resources, one of :cookbook or :workflow
    # :basedir:: Local directory where files are retrieved and scraped, use temporary directory if nil
    # :max_bytes:: Maximum number of bytes to read from remote repo, unlimited if nil
    # :max_seconds:: Maximum number of seconds to spend reading from remote repo, unlimited if nil
    # :logger:: Logger that collects errors and warnings, a RightScraper::Loggers::Default is created if nil
    # :s3_key:: Passed through unchanged to retrievers and scrapers via the options hash
    # :s3_secret:: Passed through unchanged to retrievers and scrapers via the options hash
    # :s3_bucket:: Passed through unchanged to retrievers and scrapers via the options hash
    # :scanners:: List of scanner classes, or class names that are resolved with Object.const_get
    # :builders:: List of builder classes, or class names that are resolved with Object.const_get
    def initialize(options={})
      options = ::RightSupport::Data::Mash.new(
        :kind        => nil,
        :basedir     => nil,
        :max_bytes   => nil,
        :max_seconds => nil,
        :logger      => nil,
        :s3_key      => nil,
        :s3_secret   => nil,
        :s3_bucket   => nil,
        :scanners    => nil,
        :builders    => nil
      ).merge(options)
      @old_logger_callback = nil

      # the defaults merged above always define the :basedir key, so the
      # directory is temporary exactly when no usable value was supplied. a
      # key-presence check here would never be true and the temporary
      # directory would never be removed by cleanup.
      @temporary = !options[:basedir]
      options[:basedir] ||= Dir.mktmpdir
      options[:logger] ||= ::RightScraper::Loggers::Default.new
      @logger = options[:logger]
      @resources = []
      options[:errors] = @logger.errors
      options[:warnings] = @logger.warnings

      # load classes from scanners and builders options, if necessary.
      [:scanners, :builders].each do |k|
        (options[k] || []).map! do |clazz|
          clazz.kind_of?(::Class) ? clazz : ::Object.const_get(clazz)
        end
      end
      @options = options
    end

    # Scrapes and scans a given repository.
    #
    # @deprecated the newer methodology will perform these operations in stages
    # controlled externally instead of calling this all-in-one method.
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Repository to be scraped
    #                                               Note: repo can either be a Hash or a RightScraper::Repositories::Base instance.
    #                                               See the RightScraper::Repositories::Base class for valid Hash keys.
    # incremental(Boolean):: Accepted for interface compatibility; not used by this method
    #
    # === Block
    # If a block is given, it will be called back with progress information
    # the block should take four arguments:
    # - first argument is one of <tt>:begin</tt>, <tt>:commit</tt>,
    #   <tt>:abort</tt> which signifies what
    #   the scraper is trying to do and where it is when it does it
    # - second argument is a symbol describing the operation being performed
    #   in an easy-to-match way
    # - third argument is optional further explanation
    # - fourth argument is the exception pending (only relevant for <tt>:abort</tt>)
    #
    # === Return
    # true:: If scrape was successful
    # false:: If scrape failed, call errors for information on failure
    #
    # === Raise
    # 'Invalid repository type':: If repository type is not known
    def scrape(repo, incremental=true, &callback)
      # remember the current logger callback so that cleanup can restore it.
      @old_logger_callback = @logger.callback
      @logger.callback = callback
      errorlen = errors.size
      begin
        if retrieved = retrieve(repo, &callback)
          scan(retrieved, &callback)
        end
      rescue Exception
        # legacy logger handles communication with the end user and appending
        # to our error list; we just need to keep going. the new methodology
        # has no such guaranteed communication so the caller will decide how to
        # handle errors, etc.
        #
        # NOTE(review): rescuing Exception also swallows interrupts and exit
        # requests; kept as-is because narrowing it could let failures escape
        # this legacy all-in-one path -- confirm before changing.
      ensure
        cleanup
      end

      # success means that no new errors were recorded during this call.
      errors.size == errorlen
    end

    # Retrieves the given repository. See #scrape for details.
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Repository to be retrieved
    #
    # === Return
    # Hash:: Paths and repository needed by #scan (:ignorable_paths, :repo_dir, :freed_dir, :repository)
    # nil:: If any error was recorded while retrieving
    def retrieve(repo)
      errorlen = errors.size
      unless repo.kind_of?(::RightScraper::Repositories::Base)
        repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
      end
      retriever = nil

      # 1. Retrieve the files
      @logger.operation(:retrieving, "from #{repo}") do
        # note that the retriever type may be unavailable but allow the
        # retrieve method to raise any such error.
        retriever = repo.retriever(@options)
        retriever.retrieve
      end

      # any error recorded during retrieval means there is nothing to scan.
      return nil unless errors.size == errorlen

      # create the freed directory with world-writable permission for
      # subsequent scan output for less-privileged child processes.
      freed_base_path = freed_dir(repo)
      ::FileUtils.rm_rf(freed_base_path) if ::File.exist?(freed_base_path)
      ::FileUtils.mkdir_p(freed_base_path)
      ::File.chmod(0777, freed_base_path)

      # the following hash is needed for running any subsequent scanners.
      {
        ignorable_paths: retriever.ignorable_paths,
        repo_dir:        retriever.repo_dir,
        freed_dir:       freed_base_path,
        repository:      retriever.repository
      }
    end

    # Scans a local directory. See #scrape for details.
    #
    # === Parameters
    # retrieved(Hash):: Result of #retrieve; merged over the options given at
    #                   initialization before being handed to the scraper
    #
    # === Return
    # true:: If no new errors were recorded while scanning
    # false:: Otherwise, call errors for information on failure
    def scan(retrieved)
      errorlen = errors.size
      options = ::RightSupport::Data::Mash.new(@options).merge(retrieved)

      # accept a plain hash description of the repository as well as an instance.
      repo = options[:repository]
      unless repo.kind_of?(::RightScraper::Repositories::Base)
        repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
        options[:repository] = repo
      end
      @logger.operation(:scraping, options[:repo_dir]) do
        scraper = ::RightScraper::Scrapers::Base.scraper(options)
        @resources += scraper.scrape
      end
      errors.size == errorlen
    end

    # base directory for any file operations.
    def base_dir
      @options[:basedir]
    end

    # cleans up temporary files, etc.
    #
    # restores the logger callback saved by #scrape and, when the base
    # directory was created by this instance, removes it.
    def cleanup
      @logger.callback = @old_logger_callback
      @old_logger_callback = nil
      if @temporary
        begin
          ::FileUtils.remove_entry_secure(base_dir)
        rescue ::StandardError
          # removal is best-effort; a failure to delete the temporary
          # directory must not mask the outcome of the scrape.
          nil
        end
      end
    end

    # Path to directory where given repo should be or was downloaded
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
    #
    # === Return
    # String:: Path to local directory that corresponds to given repository
    def repo_dir(repo)
      RightScraper::Retrievers::Base.repo_dir(base_dir, repo)
    end

    # Path to directory where scanned artifacts can be copied out of containment
    # due to lack of permissions to write to other directories. the freed files
    # can then be reused by subsequent scanners, etc.
    #
    # === Parameters
    # repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
    #
    # === Return
    # String:: Path to the 'freed' directory that is a sibling of the repo directory
    def freed_dir(repo)
      ::File.expand_path('../freed', repo_dir(repo))
    end

    # (Array):: Error messages in case of failure
    def errors
      @logger.errors
    end

    # (Array):: Warnings or empty
    def warnings
      @logger.warnings
    end

    # Was scraping successful?
    # Call errors to get error messages if false
    #
    # === Return
    # Boolean:: true if scrape finished with no error, false otherwise.
    def succeeded?
      errors.empty?
    end
    alias_method :successful?, :succeeded?

  end
end