lib/docparser/document.rb in docparser-0.0.1 vs lib/docparser/document.rb in docparser-0.1.0
- old
+ new
@@ -3,30 +3,33 @@
# The Document class loads and parses the files.
# @see Parser
# @see Output
class Document
attr_reader :filename, :doc, :encoding, :results
# Opens +filename+, parses its contents and prepares one result bucket per
# parser output.
# @param filename presumably a file path; it is handed to open unchanged
# @param encoding [String] encoding of the file; anything other than 'utf-8'
#   is transcoded to UTF-8 while reading
# @param parser the owning parser; only its #outputs is consulted here
# NOTE(review): the new signature turns filename from a positional into a
# keyword argument, so positional callers of the old version must be updated.
# NOTE(review): filename now defaults to nil but is still passed straight to
# open below - confirm callers always supply it.
- def initialize(filename, encoding: 'utf-8', parser: nil)
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil)
# Build the mode string for open.
if encoding == 'utf-8'
encodingstring = 'r:utf-8'
else
# external:internal encoding pair - read as `encoding`, expose as UTF-8
encodingstring = "r:#{encoding}:utf-8"
end
-
+ @logger = Log4r::Logger.new('docparser::document')
+ @logger.debug { "Parsing #{filename}" }
open(filename, encodingstring) do |f|
# Old version: the file handle is given to Nokogiri::HTML directly.
# New version: the raw text is kept in @html first, so an empty file can be
# warned about before the text is parsed.
- @doc = Nokogiri::HTML(f)
+ @html = f.read
+ @logger.warn "#{filename} is empty" if @html.empty?
+ @doc = Nokogiri(@html)
end
-
@encoding = encoding
@parser = parser
@filename = filename
# One empty array per output. The new version falls back to zero arrays when
# @parser.outputs is nil.
# NOTE(review): parser itself defaults to nil and is not guarded here, so
# @parser.outputs would raise NoMethodError - confirm a parser is always given.
- @results = Array.new(@parser.outputs.length) { [] }
+ @results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
end
# Adds a row to an output
# @param row the values of the row; nested arrays are flattened before storing
# @param output either an index into results (default 0) or an Output
#   instance, which is translated to its position in @parser.outputs
def add_row(*row, output: 0)
# Translate an Output object into its index.
# NOTE(review): presumably outputs is an Array, in which case index returns
# nil for an unknown Output and results[nil] below would raise - verify.
output = @parser.outputs.index(output) if output.is_a? Output
+ @logger.debug { "#{filename}: Adding row #{row.flatten.to_s}" }
results[output] << row.flatten
end
# Extracts the document title
# @return [String] the title of the document
@@ -34,17 +37,21 @@
@title ||= xpath_content('//head/title')
end
# @return [String] the source of the document
def html
# Old version: lazily memoises @doc.inner_html, i.e. the source as
# re-serialised from the parsed document.
# New version: returns @html, the raw text that initialize read from the file.
- @html ||= @doc.inner_html #TODO: ??
+ @html
end
# Executes an XPath query
# @param query the query passed to @doc.search
# @yield [el] each element of the search result, when a block is given
# @return with a block, the value of res.each; without a block the old
#   version returns nil, while the new version returns the search result
def xpath(query)
res = @doc.search(query)
- res.each { |el| yield el } if block_given?
+ if block_given?
+ res.each { |el| yield el }
+ else
+ res
+ end
end
# Executes an XPath query and returns the content
# @return [String] the content of the HTML node
def xpath_content(query)
@@ -56,11 +63,11 @@
end
end
# Matches the HTML source using a regular expression
# @param regexp the pattern handed to html.match (presumably a Regexp)
# @return the result of html.match
def regexp(regexp)
# The old version swallowed every StandardError and returned nil instead; the
# new version lets errors raised by html.match propagate to the caller.
- html.match(regexp) rescue nil
+ html.match(regexp)
end
# Parses the document
# @return [Array] containing the parse results
def parse!(&block)
@@ -68,12 +75,12 @@
results
end
# @!visibility private
# Short textual representation of the document.
# @return [String] shows the file name; the new version also shows the encoding
def inspect
- "<Document file:'#{@filename}'>"
+ "<Document file:'#{@filename}', encoding:'#{@encoding}'>"
end
# Expose xpath and xpath_content under the names css and css_content as well.
# The new version replaces the alias keyword with alias_method calls.
- alias :css :xpath
- alias :css_content :xpath_content
+ alias_method :css, :xpath
+ alias_method :css_content, :xpath_content
end
end
\ No newline at end of file