lib/docparser/parser.rb in docparser-0.0.1 vs lib/docparser/parser.rb in docparser-0.1.0

- old
+ new

@@ -1,105 +1,136 @@ -$:.unshift __dir__ +$LOAD_PATH.unshift __dir__ require 'rubygems' require 'bundler/setup' require 'version' require 'output' require 'document' require 'nokogiri' require 'open-uri' require 'parallel' require 'set' +require 'log4r' +require 'log4r/formatter/patternformatter' require 'output/screen_output.rb' require 'output/csv_output.rb' require 'output/html_output.rb' require 'output/xlsx_output.rb' require 'output/yaml_output.rb' require 'output/json_output.rb' require 'output/multi_output.rb' -# {include:file:README.md} +require 'output/nil_output.rb' + +Log4r.define_levels(*Log4r::Log4rConfig::LogLevels) +logger = Log4r::Logger.new('docparser') +output = Log4r::StdoutOutputter.new('docparser') +output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m') +logger.outputters = output +logger.level = Log4r::INFO +logger = nil +output = nil + +# The DocParser namespace +# See README.md for information on using DocParser module DocParser # The main parser class. This is the class you'll use to create your parser # The real work happens in the Document class # @see Document class Parser # @!visibility private - attr_reader :outputs + attr_reader :outputs, :files, :num_processes, :encoding # Creates a new parser instance # @param files [Array] An array containing URLs or paths to files # @param quiet [Boolean] Be quiet # @param encoding [String] The encoding to use for opening the files # @param parallel [Boolean] Use parallel processing # @param output [Output, Array] The output(s), defaults to a Screenoutput # @param range [Range] Range of files to process (nil means process all) # @param num_processes [Fixnum] Number of parallel processes def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true, - output: ScreenOutput.new, range: nil, + output: nil, range: nil, num_processes: Parallel.processor_count + 1) - @quiet = quiet - @parallel = parallel - @num_processes = num_processes + @num_processes = parallel ? num_processes : 1 + @files = range ? files[range] : files @encoding = encoding - if output.is_a? Output - @outputs = [] - @outputs << output - elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output } - @outputs = output - else - raise ArgumentError, 'No outputs specified' + + Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO + + unless output.nil? + if output.is_a? Output + @outputs = [] + @outputs << output + elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output } + @outputs = output + else + raise ArgumentError, 'Invalid outputs specified' + end + + @resultsets = Array.new(@outputs.length) { Set.new } end - @files = if range - files[range] - else - files - end - log 'DocParser loaded..' - log "#{@files.length} files loaded (encoding: #{@encoding})" + + @logger = Log4r::Logger.new('docparser::parser') + @logger.info "DocParser v#{VERSION}" + @logger.info "#{@files.length} files loaded (encoding: #{@encoding})" end # # Parses the `files` # def parse!(&block) - log "Parsing #{@files.length} files." + @logger.info "Parsing #{@files.length} files." start_time = Time.now - resultsets = Array.new(@outputs.length) { Set.new } - if @parallel && @num_processes > 1 - log "Starting #{@num_processes} processes" - Parallel.map(@files, in_processes: @num_processes) do |file| - Document.new(file, encoding: @encoding, parser: self).parse!(&block) - end.each do |result| - result.each_with_index { |set, index| resultsets[index].merge(set) } - end - log 'Parallel processing finished, writing results..' + if @num_processes > 1 + parallel_process(&block) else - @files.each do |file| - doc = Document.new(file, encoding: @encoding, parser: self) - doc.parse!(&block).each_with_index do |set, index| - resultsets[index].merge(set) - end + serial_process(&block) + end + + @logger.info 'Processing finished' + + write_to_outputs if @outputs + + @logger.info sprintf('Done processing in %.2fs.', Time.now - start_time) + end + + private + + def parallel_process(&block) + @logger.info "Starting #{@num_processes} processes" + Parallel.map(@files, in_processes: @num_processes) do |file| + # :nocov: # + parse_doc(file, &block) + # :nocov: # + end.each do |result| + result.each_with_index do |set, index| + @resultsets[index].merge(set) + end if @outputs + end + end + + def serial_process(&block) + @files.each do |file| + parse_doc(file, &block).each_with_index do |set, index| + @resultsets[index].merge(set) if @outputs end end + end - log "\nSummary\n=======" + def parse_doc(file, &block) + doc = Document.new(filename: file, encoding: @encoding, parser: self) + doc.parse!(&block) + end + def write_to_outputs + @logger.info 'Writing data..' @outputs.each_with_index do |output, index| - resultsets[index].each do |row| + @resultsets[index].each do |row| output.add_row row end - resultsets[index] = nil + @resultsets[index] = nil output.close - log output.summary end - - log '' - log 'Done processing in %.2fs.' % (Time.now - start_time) end - private - - def log(str) - puts str unless @quiet - end end end \ No newline at end of file