lib/docparser/document.rb in docparser-0.2.3 vs lib/docparser/document.rb in docparser-0.3.0

- old
+ new

@@ -1,5 +1,7 @@ +# frozen_string_literal: true + require 'nokogiri' module DocParser # The Document class loads and parses the files. # @see Parser # @see Output @@ -17,46 +19,52 @@ attr_reader :results # @return [String] the source of the document attr_reader :html - def initialize(filename: nil, encoding: 'utf-8', parser: nil) - @logger = Log4r::Logger.new('docparser::document') - @logger.debug { "Parsing #{filename}" } + def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil) + @logger = logger || Logger.new(STDERR) + @logger.level = Logger::INFO + @logger.debug("Parsing #{filename}") @encoding = encoding @parser = parser @filename = filename @results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] } read_file end # Adds a row to an output def add_row(*row, output: 0) output = @parser.outputs.index(output) if output.is_a? Output - @logger.debug { "#{filename}: Adding row #{row.flatten}" } + @logger.debug("#{filename}: Adding row #{row.flatten}") results[output] << row.flatten end # Extracts the document title # @return [String] the title of the document def title @title ||= xpath_content('//head/title') end - # Executes a xpath query - def xpath(query) - res = @doc.search(query) + # Executes a xpath/css query + def elements(query) + @doc.search(query) + end + + def each_element(query) + res = elements(query) + if block_given? res.each { |el| yield el } else res end end # Executes a xpath query and returns the content # @return [String] the content of the HTML node - def xpath_content(query) + def element_content(query) first = @doc.search(query).first if first.nil? nil else first.content @@ -89,9 +97,11 @@ @logger.warn "#{filename} is empty" if @html.empty? @doc = Nokogiri(@html) end end - alias_method :css, :xpath - alias_method :css_content, :xpath_content + alias css each_element + alias xpath each_element + alias css_content element_content + alias xpath_content element_content end end