lib/right_scraper/main.rb in right_scraper-5.0.1 vs lib/right_scraper/main.rb in right_scraper-5.1.1

- old
+ new

@@ -1,7 +1,7 @@
 #--
-# Copyright: Copyright (c) 2010-2013 RightScale, Inc.
+# Copyright: Copyright (c) 2010-2016 RightScale, Inc.
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
 # 'Software'), to deal in the Software without restriction, including
 # without limitation the rights to use, copy, modify, merge, publish,
@@ -22,10 +22,11 @@
 #++

 # ancestor
 require 'right_scraper'
+require 'right_support'

 require 'fileutils'

 module RightScraper

   # Library main entry point. Instantiate this class and call the scrape
@@ -46,37 +47,48 @@
     # <tt>:kind</tt>:: Type of scraper that will traverse directory for resources, one of :cookbook or :workflow
     # <tt>:basedir</tt>:: Local directory where files are retrieved and scraped, use temporary directory if nil
     # <tt>:max_bytes</tt>:: Maximum number of bytes to read from remote repo, unlimited if nil
     # <tt>:max_seconds</tt>:: Maximum number of seconds to spend reading from remote repo, unlimited if nil
     def initialize(options={})
-      options = {
+      options = ::RightSupport::Data::Mash.new(
         :kind => nil,
         :basedir => nil,
         :max_bytes => nil,
         :max_seconds => nil,
-        :callback => nil,
         :logger => nil,
         :s3_key => nil,
         :s3_secret => nil,
         :s3_bucket => nil,
-        :errors => nil,
-        :warnings => nil,
         :scanners => nil,
         :builders => nil,
-      }.merge(options)
+      ).merge(options)
+      @old_logger_callback = nil
       @temporary = !options.has_key?(:basedir)
       options[:basedir] ||= Dir.mktmpdir
       options[:logger] ||= ::RightScraper::Loggers::Default.new
       @logger = options[:logger]
       @resources = []
+      options[:errors] = @logger.errors
+      options[:warnings] = @logger.warnings
+
+      # load classes from scanners and builders options, if necessary.
+      [:scanners, :builders].each do |k|
+        list = options[k] || []
+        list.each_with_index do |clazz, index|
+          unless clazz.kind_of?(::Class)
+            list[index] = ::Object.const_get(clazz)
+          end
+        end
+      end
       @options = options
     end

-    # Scrape given repository, depositing files into the scrape
-    # directory. Update content of unique directory incrementally
-    # when possible with further calls.
+    # Scrapes and scans a given repository.
     #
+    # @deprecated the newer methodology will perform these operations in stages
+    # controlled externally instead of calling this all-in-one method.
+    #
     # === Parameters
     # repo(Hash|RightScraper::Repositories::Base):: Repository to be scraped
     # Note: repo can either be a Hash or a RightScraper::Repositories::Base instance.
     # See the RightScraper::Repositories::Base class for valid Hash keys.
     #
@@ -96,84 +108,121 @@
     # false:: If scrape failed, call errors for information on failure
     #
     # === Raise
     # 'Invalid repository type':: If repository type is not known
     def scrape(repo, incremental=true, &callback)
-      errorlen = errors.size
-      repo = RightScraper::Repositories::Base.from_hash(repo) if repo.is_a?(Hash)
+      @old_logger_callback = @logger.callback
       @logger.callback = callback
+      errorlen = errors.size
       begin
-        # 1. Retrieve the files
-        retriever = nil
-        repo_dir_changed = false
-        @logger.operation(:retrieving, "from #{repo}") do
-          # note that the retriever type may be unavailable but allow the
-          # retrieve method to raise any such error.
-          retriever = repo.retriever(@options)
-          repo_dir_changed = retriever.retrieve
+        if retrieved = retrieve(repo, &callback)
+          scan(retrieved, &callback)
         end
+      rescue Exception
+        # legacy logger handles communication with the end user and appending
+        # to our error list; we just need to keep going. the new methodology
+        # has no such guaranteed communication so the caller will decide how to
+        # handle errors, etc.
+      ensure
+        cleanup
+      end
+      errors.size == errorlen
+    end
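The reworked constructor above has two practical effects. Option defaults now live in a RightSupport::Data::Mash, so callers can use string and symbol keys interchangeably, and the :scanners and :builders lists may now contain class names that get resolved to classes via ::Object.const_get. A minimal sketch of both behaviors, assuming right_support is available; the scanner class name is hypothetical:

    require 'right_support'

    # string and symbol keys are interchangeable in a Mash
    options = ::RightSupport::Data::Mash.new(:kind => nil).merge('kind' => :cookbook)
    options[:kind]  # => :cookbook

    # names are resolved to classes in place, mirroring initialize;
    # assumes the named class has already been required
    list = ['RightScraper::Scanners::CookbookMetadata']
    list.each_with_index do |clazz, index|
      list[index] = ::Object.const_get(clazz) unless clazz.kind_of?(::Class)
    end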
-        # TEAL FIX: Note that retrieve will now return true iff there has been
-        # a change to the last scraped repository directory for efficiency
-        # reasons and only for retreiver types that support this behavior.
-        #
-        # Even if the retrieval is skipped due to already having the data on
-        # disk we still need to scrape its resources only because of the case
-        # of the metadata scraper daemon, which updates multiple repositories
-        # of similar criteria.
-        #
-        # The issue is that a new repo can appear later with the same criteria
-        # as an already-scraped repo and will need it's own copy of the
-        # scraped resources. The easiest (but not most efficient) way to
-        # deliver these is to rescrape the already-seen resources. This
-        # becomes more expensive as we rely on generating "metadata.json" from
-        # "metadata.rb" for cookbooks but is likely not expensive enough to
-        # need to improve this logic.
+    # Retrieves the given repository. See #scrape for details.
+    def retrieve(repo)
+      errorlen = errors.size
+      unless repo.kind_of?(::RightScraper::Repositories::Base)
+        repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
+      end
+      retriever = nil
+      # 1. Retrieve the files
+      @logger.operation(:retrieving, "from #{repo}") do
+        # note that the retriever type may be unavailable but allow the
+        # retrieve method to raise any such error.
+        retriever = repo.retriever(@options)
+        retriever.retrieve
+      end

-        # 2. Now scrape if there is a scraper in the options
-        @logger.operation(:scraping, retriever.repo_dir) do
-          if @options[:kind]
-            options = @options.merge({:ignorable_paths => retriever.ignorable_paths,
-                                      :repo_dir => retriever.repo_dir,
-                                      :repository => retriever.repository})
-            scraper = RightScraper::Scrapers::Base.scraper(options)
-            @resources += scraper.scrape
-          end
-        end
-      rescue Exception
-        # logger handles communication with the end user and appending
-        # to our error list, we just need to keep going.
-      ensure
-        # ensure basedir is always removed if temporary (even with errors).
-        ::FileUtils.remove_entry_secure(@options[:basedir]) rescue nil if @temporary
+      if errors.size == errorlen
+        # create the freed directory with world-writable permission for
+        # subsequent scan output for less-privileged child processes.
+        freed_base_path = freed_dir(repo)
+        ::FileUtils.rm_rf(freed_base_path) if ::File.exist?(freed_base_path)
+        ::FileUtils.mkdir_p(freed_base_path)
+        ::File.chmod(0777, freed_base_path)
+
+        # the following hash is needed for running any subsequent scanners.
+        {
+          ignorable_paths: retriever.ignorable_paths,
+          repo_dir: retriever.repo_dir,
+          freed_dir: freed_base_path,
+          repository: retriever.repository
+        }
+      else
+        nil
       end
-      @logger.callback = nil
+    end
+
+    # Scans a local directory. See #scrape for details.
+    def scan(retrieved)
+      errorlen = errors.size
+      old_callback = @logger.callback
+      options = ::RightSupport::Data::Mash.new(@options).merge(retrieved)
+      repo = options[:repository]
+      unless repo.kind_of?(::RightScraper::Repositories::Base)
+        repo = RightScraper::Repositories::Base.from_hash(::RightSupport::Data::Mash.new(repo))
+        options[:repository] = repo
+      end
+      @logger.operation(:scraping, options[:repo_dir]) do
+        scraper = ::RightScraper::Scrapers::Base.scraper(options)
+        @resources += scraper.scrape
+      end
       errors.size == errorlen
     end
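Taken together, the new retrieve and scan methods split the deprecated all-in-one scrape into stages the caller sequences explicitly: retrieve checks out the repository and returns a hash of paths (:repo_dir, :freed_dir, :ignorable_paths, :repository) on success or nil on failure, scan feeds that hash to the scrapers, and cleanup restores the logger callback and removes a temporary basedir. A hedged usage sketch; the repository hash keys follow RightScraper::Repositories::Base conventions, and the URL and basedir values are illustrative:

    require 'right_scraper'

    main = ::RightScraper::Main.new(:kind => :cookbook, :basedir => '/tmp/scraper')
    begin
      # retrieve returns nil (with errors recorded) on failure
      if retrieved = main.retrieve(:repo_type => :git,
                                   :url       => 'git://example.com/cookbooks.git')
        main.scan(retrieved)  # runs the scrapers against the checkout
      end
    ensure
      main.cleanup  # restores the logger callback; removes basedir only if temporary
    end
    puts main.errors.inspect unless main.succeeded?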
+    # base directory for any file operations.
+    def base_dir
+      @options[:basedir]
+    end
+
+    # cleans up temporary files, etc.
+    def cleanup
+      @logger.callback = @old_logger_callback
+      @old_logger_callback = nil
+      ::FileUtils.remove_entry_secure(base_dir) rescue nil if @temporary
+    end
+
     # Path to directory where given repo should be or was downloaded
     #
     # === Parameters
     # repo(Hash|RightScraper::Repositories::Base):: Remote repository corresponding to local directory
     #
     # === Return
     # String:: Path to local directory that corresponds to given repository
     def repo_dir(repo)
-      RightScraper::Retrievers::Base.repo_dir(@options[:basedir], repo)
+      RightScraper::Retrievers::Base.repo_dir(base_dir, repo)
     end

+    # Path to directory where scanned artifacts can be copied out of containment
+    # due to lack of permissions to write to other directories. the freed files
+    # can then be reused by subsequent scanners, etc.
+    def freed_dir(repo)
+      ::File.expand_path('../freed', repo_dir(repo))
+    end
+
     # (Array):: Error messages in case of failure
     def errors
       @logger.errors
     end

     # (Array):: Warnings or empty
     def warnings
       @logger.warnings
     end

-    # Was scraping successful? 
+    # Was scraping successful?
     # Call errors to get error messages if false
     #
     # === Return
     # Boolean:: true if scrape finished with no error, false otherwise.
     def succeeded?
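The new freed_dir helper places scanned artifacts outside the repository checkout: it resolves to a freed directory that is a sibling of the per-repo directory computed by RightScraper::Retrievers::Base.repo_dir, and retrieve pre-creates it world-writable (0777) so that less-privileged child processes can write scan output there. A minimal sketch of the path arithmetic, using a hypothetical basedir and checkout name:

    require 'fileutils'

    repo_dir  = '/tmp/scraper/9f8e7d6c'                   # hypothetical checkout path
    freed_dir = ::File.expand_path('../freed', repo_dir)  # => "/tmp/scraper/freed"

    ::FileUtils.mkdir_p(freed_dir)
    ::File.chmod(0777, freed_dir)  # world-writable for less-privileged children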