lib/imw/tools/summarizer.rb in imw-0.2.4 vs lib/imw/tools/summarizer.rb in imw-0.2.5
- old
+ new
@@ -1,169 +1,67 @@
+require 'imw/tools/extension_analyzer'
+
module IMW
module Tools
# A class for producing summary data about a collection of
# resources.
#
- # This summary data includes the directory tree, file sizes, file
- # formats, record counts, &c.
+ # The Summarizer needs to recursively IMW.open all files and
+ # directories it is given, so it will be very cumbersome if given
+ # many files. A few large files will not cause a problem.
class Summarizer
- # The inputs to this Summarizer.
+ # The inputs given to this Summarizer.
attr_reader :inputs
+ # The resources of this Summarizer, calculated recursively from
+ # its +inputs+.
+ attr_reader :resources
+
+ include IMW::Tools::ExtensionAnalyzer
+
# Initialize a new Summarizer with the given +inputs+.
#
# @param [Array<String, IMW::Resource>] inputs
# @return [IMW::Tools::Summarizer]
def initialize *inputs
self.inputs = inputs.flatten
end
- # Set new inputs for this summarizer.
- #
- # Clears any cached summary statistics
- #
- # @param [Array<String, IMW::Resource>] new_inputs
- def inputs= new_inputs
- @inputs = new_inputs.map do |input|
- i = IMW.open(input)
- raise PathError.new("Invalid input, #{i.path}") if i.is_local? && !i.exist? # don't check for remote files
- i.is_directory? ? i.resources : i
- end.compact.flatten
- clear_cached_statistics!
- end
-
- # Reset all the cached statistics of this summarizer to +nil+.
- def clear_cached_statistics!
- [:num_files,
- :num_direcories,
- :total_size,
- :extension_counts,
- :most_common_extension_by_count,
- :normalized_extension_counts,
- :extension_sizes,
- :most_common_extension_by_size,
- :normalized_extension_sizes].each do |instance_variable|
- self.instance_variable_set("@#{instance_variable}", nil)
- end
- end
-
- # Return the number of files.
- #
- # @return [Integer]
- def num_files
- @num_files ||= inputs.size
- end
-
- # Return the number of directories.
- #
- # @return [Integer]
- def num_directories
- @num_directories ||= inputs.collect { |input| input.is_directory? }
- end
-
# Return the total size.
#
# @return [Integer]
def total_size
- @total_size ||= inputs.map(&:size).inject(0) { |e, sum| sum += e }
+ @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
end
- # Return the file counts of each extension.
+ # Return a summary of the +inputs+ to this Summarizer.
#
- # @return [Hash]
- def extension_counts
- @extension_counts ||= returning({}) do |counts|
- inputs.each do |input|
- next if input.is_directory?
- counts[input.extension] = 0 unless counts.has_key?(input.extension)
- counts[input.extension] += 1
- end
- end
- end
-
- # Return the most common extension by count of files.
- def most_common_extension_by_count
- return @most_common_extension_by_count if @most_common_extension_by_count
- current_count, current_extension = 0, nil
- extension_counts.each_pair do |extension, count|
- current_extension = extension if count > current_count
- end
- if current_extension.strip.blank? then current_extension = 'flat' end
- @most_common_extension_by_count = current_extension
- end
-
- # Return the file counts of each extension, normalized by the
- # total number of files.
+ # Delegates to the +summary+ method of each constituent
+ # IMW::Resource in +inputs+.
#
- # @return [Hash]
- def normalized_extension_counts
- @normalized_extension_counts ||= returning({}) do |weighted|
- extension_counts.each_pair do |extension, count|
- weighted[extension] = count.to_f / num_files.to_f
- end
- end
+ # @return [Array<Hash>]
+ def summary
+ @summary ||= inputs.map(&:summary)
end
- # Return the amount of data corresponding to each extension.
+ protected
+ # Set new inputs for this summarizer.
#
- # @return [Hash]
- def extension_sizes
- @extension_sizes ||= returning({}) do |sizes|
- inputs.each do |input|
- next if input.is_directory?
- sizes[input.extension] = 0 unless sizes.has_key?(input.extension)
- sizes[input.extension] += input.size
- end
- end
- end
-
- # Return the most common extension by amount of data.
+ # Summarizer statistics are cached as instance variables, so be
+ # careful about changing inputs and then using stale statistics.
#
- # @return [String]
- def most_common_extension_by_size
- return @most_common_extension_by_size if @most_common_extension_by_size
- current_size, current_extension = 0, nil
- extension_sizes.each_pair do |extension, size|
- current_extension = extension if size > current_size
+ # @param [Array<String, IMW::Resource>] new_inputs
+ def inputs= new_inputs
+ @inputs = new_inputs.map do |path_or_resource|
+ input = IMW.open(path_or_resource)
+ input.should_exist!("Cannot summarize.")
end
- if current_extension.strip.blank? then current_extension = 'flat' end
- @most_common_extension_by_size = current_extension
+ @resources = inputs.map do |input|
+ input.is_directory? ? input.all_resources : input
+ end.compact.flatten
end
- # Return the fractional share of each extension by file size.
- #
- # @return [Hash]
- def normalized_extension_sizes
- @normalized_extension_sizes ||= returning({}) do |weighted|
- extension_sizes.each_pair do |extension, size|
- weighted[extension] = size.to_f / total_size.to_f
- end
- end
- end
-
- # Return a guess as to the most common extension format for this
- # Summarizer's inputs.
- #
- # @return [String]
- def most_common_extension
- return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
- count_fraction = normalized_extension_counts[most_common_extension_by_count]
- size_fraction = normalized_extension_sizes[most_common_extension_by_size]
- return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # choose the winner based on differential
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
- most_common_extension_by_size # default to size
- end
-
- # Returns a guess as to the most common data format for this
- # Summarizer's inputs.
- #
- # @return [String]
- def most_common_data_format
- extension = most_common_extension
- ['tar', 'tar.bz2', 'tar.gz', 'tgz', 'tbz2', 'zip', 'rar'].include?(extension) ? 'archive' : extension
- end
-
end
end
end