lib/right_scraper/main.rb in right_scraper-5.0.1 vs lib/right_scraper/main.rb in right_scraper-5.1.1
- old
+ new
@@ -1,7 +1,7 @@
#--
-# Copyright: Copyright (c) 2010-2013 RightScale, Inc.
+# Copyright: Copyright (c) 2010-2016 RightScale, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# 'Software'), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
@@ -22,10 +22,11 @@
#++
# ancestor
require 'right_scraper'
+require 'right_support'
require 'fileutils'
module RightScraper
# Library main entry point. Instantiate this class and call the scrape
@@ -46,37 +47,48 @@
# <tt>:kind</tt>:: Type of scraper that will traverse directory for resources, one of :cookbook or :workflow
# <tt>:basedir</tt>:: Local directory where files are retrieved and scraped, use temporary directory if nil
# <tt>:max_bytes</tt>:: Maximum number of bytes to read from remote repo, unlimited if nil
# <tt>:max_seconds</tt>:: Maximum number of seconds to spend reading from remote repo, unlimited if nil
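+ #
+ # === Example
+ # A minimal sketch; option values are illustrative and the class name is
+ # assumed from this file's entry-point comment:
+ #   scraper = ::RightScraper::Main.new(
+ #     :kind        => :cookbook,
+ #     :basedir     => '/tmp/scraper',
+ #     :max_bytes   => 64 * 1024 * 1024,
+ #     :max_seconds => 300)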
def initialize(options={})
- options = {
+ options = ::RightSupport::Data::Mash.new(
:kind => nil,
:basedir => nil,
:max_bytes => nil,
:max_seconds => nil,
- :callback => nil,
:logger => nil,
:s3_key => nil,
:s3_secret => nil,
:s3_bucket => nil,
- :errors => nil,
- :warnings => nil,
:scanners => nil,
:builders => nil,
- }.merge(options)
+ ).merge(options)
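+ # note: a Mash supports indifferent key access, so callers may supply
+ # either string or symbol keys (e.g. 'kind' or :kind) in options.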
+ @old_logger_callback = nil
@temporary = !options.has_key?(:basedir)
options[:basedir] ||= Dir.mktmpdir
options[:logger] ||= ::RightScraper::Loggers::Default.new
@logger = options[:logger]
@resources = []
+ options[:errors] = @logger.errors
+ options[:warnings] = @logger.warnings
+
+ # resolve any :scanners and :builders options given as class names, if necessary.
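+ # e.g. options[:scanners] = ['RightScraper::Scanners::CookbookMetadata']
+ # (an illustrative class name) resolves to the class constant itself.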
+ [:scanners, :builders].each do |k|
+ list = options[k] || []
+ list.each_with_index do |clazz, index|
+ unless clazz.kind_of?(::Class)
+ list[index] = ::Object.const_get(clazz)
+ end
+ end
+ end
@options = options
end
- # Scrape given repository, depositing files into the scrape
- # directory. Update content of unique directory incrementally
- # when possible with further calls.
+ # Scrapes and scans a given repository.
#
+ # @deprecated the newer methodology performs these operations in stages
+ # (#retrieve, #scan and #cleanup) controlled externally instead of by this
+ # all-in-one method.
+ #
# === Parameters
# repo(Hash|RightScraper::Repositories::Base):: Repository to be scraped
# Note: repo can either be a Hash or a RightScraper::Repositories::Base instance.
# See the RightScraper::Repositories::Base class for valid Hash keys.
#
@@ -96,84 +108,121 @@
# false:: If scrape failed, call errors for information on failure
#
# === Raise
# 'Invalid repository type':: If repository type is not known
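+ #
+ # === Example
+ # A sketch of the all-in-one call; the repo hash keys shown are
+ # illustrative and depend on the repository type:
+ #   scraper.scrape(:repo_type => :git, :url => 'git://example.com/repo.git')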
def scrape(repo, incremental=true, &callback)
- errorlen = errors.size
- repo = RightScraper::Repositories::Base.from_hash(repo) if repo.is_a?(Hash)
+ @old_logger_callback = @logger.callback
@logger.callback = callback
+ errorlen = errors.size
begin
- # 1. Retrieve the files
- retriever = nil
- repo_dir_changed = false
- @logger.operation(:retrieving, "from #{repo}") do
- # note that the retriever type may be unavailable but allow the
- # retrieve method to raise any such error.
- retriever = repo.retriever(@options)
- repo_dir_changed = retriever.retrieve
+ if retrieved = retrieve(repo, &callback)
+ scan(retrieved, &callback)
end
+ rescue Exception
+ # the legacy logger handles communication with the end user and appends
+ # to our error list; we just need to keep going. the new staged methodology
+ # makes no such guarantee, so the caller decides how to handle errors, etc.
+ ensure
+ cleanup
+ end
+ errors.size == errorlen
+ end
- # TEAL FIX: Note that retrieve will now return true iff there has been
- # a change to the last scraped repository directory for efficiency
- # reasons and only for retriever types that support this behavior.
- #
- # Even if the retrieval is skipped due to already having the data on
- # disk we still need to scrape its resources only because of the case
- # of the metadata scraper daemon, which updates multiple repositories
- # of similar criteria.
- #
- # The issue is that a new repo can appear later with the same criteria
- # as an already-scraped repo and will need its own copy of the
- # scraped resources. The easiest (but not most efficient) way to
- # deliver these is to rescrape the already-seen resources. This
- # becomes more expensive as we rely on generating "metadata.json" from
- # "metadata.rb" for cookbooks but is likely not expensive enough to
- # need to improve this logic.
+ # Retrieves the given repository. See #scrape for details.
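+ #
+ # === Example
+ # A sketch of the staged flow (mirrors what #scrape does internally):
+ #   begin
+ #     if retrieved = scraper.retrieve(repo)
+ #       scraper.scan(retrieved)
+ #     end
+ #   ensure
+ #     scraper.cleanup
+ #   end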
+ def retrieve(repo)
+ errorlen = errors.size
+ unless repo.kind_of?(::RightScraper::Repositories::Base)
+ repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
+ end
+ retriever = nil
+ # 1. Retrieve the files
+ @logger.operation(:retrieving, "from #{repo}") do
+ # note that the retriever type may be unavailable but allow the
+ # retrieve method to raise any such error.
+ retriever = repo.retriever(@options)
+ retriever.retrieve
+ end
- # 2. Now scrape if there is a scraper in the options
- @logger.operation(:scraping, retriever.repo_dir) do
- if @options[:kind]
- options = @options.merge({:ignorable_paths => retriever.ignorable_paths,
- :repo_dir => retriever.repo_dir,
- :repository => retriever.repository})
- scraper = RightScraper::Scrapers::Base.scraper(options)
- @resources += scraper.scrape
- end
- end
- rescue Exception
- # logger handles communication with the end user and appending
- # to our error list, we just need to keep going.
- ensure
- # ensure basedir is always removed if temporary (even with errors).
- ::FileUtils.remove_entry_secure(@options[:basedir]) rescue nil if @temporary
+ if errors.size == errorlen
+ # create the freed directory with world-writable permissions so that
+ # less-privileged child processes can write subsequent scan output there.
+ freed_base_path = freed_dir(repo)
+ ::FileUtils.rm_rf(freed_base_path) if ::File.exist?(freed_base_path)
+ ::FileUtils.mkdir_p(freed_base_path)
+ ::File.chmod(0777, freed_base_path)
+
+ # the following hash is needed for running any subsequent scanners.
+ {
+ ignorable_paths: retriever.ignorable_paths,
+ repo_dir: retriever.repo_dir,
+ freed_dir: freed_base_path,
+ repository: retriever.repository
+ }
+ else
+ nil
end
- @logger.callback = nil
+ end
+
+ # Scans a local directory. See #scrape for details.
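+ #
+ # === Example
+ # Takes the hash returned by #retrieve (sketch):
+ #   scraper.scan(retrieved) if retrieved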
+ def scan(retrieved)
+ errorlen = errors.size
+ options = ::RightSupport::Data::Mash.new(@options).merge(retrieved)
+ repo = options[:repository]
+ unless repo.kind_of?(::RightScraper::Repositories::Base)
+ repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
+ options[:repository] = repo
+ end
+ @logger.operation(:scraping, options[:repo_dir]) do
+ scraper = ::RightScraper::Scrapers::Base.scraper(options)
+ @resources += scraper.scrape
+ end
errors.size == errorlen
end
+ # base directory for any file operations.
+ def base_dir
+ @options[:basedir]
+ end
+
+ # cleans up temporary files, etc.
+ def cleanup
+ @logger.callback = @old_logger_callback
+ @old_logger_callback = nil
+ ::FileUtils.remove_entry_secure(base_dir) rescue nil if @temporary
+ end
+
# Path to directory where given repo should be or was downloaded
#
# === Parameters
# repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
#
# === Return
# String:: Path to local directory that corresponds to given repository
def repo_dir(repo)
- RightScraper::Retrievers::Base.repo_dir(@options[:basedir], repo)
+ RightScraper::Retrievers::Base.repo_dir(base_dir, repo)
end
+ # Path to directory where scanned artifacts can be copied out of containment
+ # due to lack of permissions to write to other directories. The freed files
+ # can then be reused by subsequent scanners, etc.
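+ # e.g. a repo_dir of '/tmp/base/repos/abc' (illustrative) yields a
+ # freed_dir of '/tmp/base/repos/freed', a sibling of the repo directory.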
+ def freed_dir(repo)
+ ::File.expand_path('../freed', repo_dir(repo))
+ end
+
# (Array):: Error messages in case of failure
def errors
@logger.errors
end
# (Array):: Warnings or empty
def warnings
@logger.warnings
end
- # Was scraping successful?
+ # Was scraping successful?
# Call errors to get error messages if false
#
# === Return
# Boolean:: true if scrape finished with no error, false otherwise.
def succeeded?