lib/docparser/document.rb in docparser-0.2.3 vs lib/docparser/document.rb in docparser-0.3.0
- old
+ new
@@ -1,5 +1,7 @@
+# frozen_string_literal: true
+
require 'nokogiri'
module DocParser
# The Document class loads and parses the files.
# @see Parser
# @see Output
@@ -17,46 +19,52 @@
attr_reader :results
# @return [String] the source of the document
attr_reader :html
# Builds a Document for one file: sets up the logger, stores the encoding,
# parser and filename, prepares one empty result list per parser output
# (none when the parser reports no outputs), and finally calls read_file.
# NOTE(review): `parser` defaults to nil, yet `@parser.outputs` is called
# unconditionally below, so a parser appears to be required in practice.
# NOTE(review): in the new (+) version the level is forced to INFO on
# whichever logger is in use, including one passed in by the caller, and the
# debug call that follows is below that level - confirm this is intended.
- def initialize(filename: nil, encoding: 'utf-8', parser: nil)
- @logger = Log4r::Logger.new('docparser::document')
- @logger.debug { "Parsing #{filename}" }
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
+ @logger = logger || Logger.new(STDERR)
+ @logger.level = Logger::INFO
+ @logger.debug("Parsing #{filename}")
@encoding = encoding
@parser = parser
@filename = filename
# One separate empty array per output, so rows for different outputs never mix
@results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
read_file
end
# Adds a row to an output.
# The row values are flattened and appended to the result list selected by
# `output`, which is either a position in `results` (default 0) or an Output
# instance that is looked up in the parser's outputs to find its position.
def add_row(*row, output: 0)
# Translate an Output object into its index among the parser's outputs
output = @parser.outputs.index(output) if output.is_a? Output
- @logger.debug { "#{filename}: Adding row #{row.flatten}" }
+ @logger.debug("#{filename}: Adding row #{row.flatten}")
results[output] << row.flatten
end
# Extracts the document title.
# The value comes from the '//head/title' query and is memoized in @title;
# because `||=` is used, a nil result is looked up again on the next call.
# @return [String, nil] the title of the document, or nil when the query
#   matches nothing
def title
@title ||= xpath_content('//head/title')
end
# The new (+) version splits the old `xpath` method in two: `elements` only
# runs the search on the parsed document, while `each_element` runs the same
# search and, when a block is given, yields every match to it; without a
# block it returns the search result unchanged.
- # Executes a xpath query
- def xpath(query)
- res = @doc.search(query)
+ # Executes a xpath/css query
+ def elements(query)
+ @doc.search(query)
+ end
+
+ def each_element(query)
+ res = elements(query)
+
if block_given?
res.each { |el| yield el }
else
res
end
end
# Executes an XPath/CSS query and returns the content of the first match
# @return [String] the content of the HTML node
- def xpath_content(query)
+ def element_content(query)
first = @doc.search(query).first
if first.nil?
nil
else
first.content
@@ -89,9 +97,11 @@
@logger.warn "#{filename} is empty" if @html.empty?
@doc = Nokogiri(@html)
end
end
# Backwards-compatible method names: the new (+) version keeps css/xpath and
# css_content/xpath_content available as aliases of each_element and
# element_content respectively.
- alias_method :css, :xpath
- alias_method :css_content, :xpath_content
+ alias css each_element
+ alias xpath each_element
+ alias css_content element_content
+ alias xpath_content element_content
end
end