Sha256: 643cfdebc50b598c339c1272004d6c4b8bd1f9aab845920da07ff94f7d172be7
Contents?: true
Size: 1.92 KB
Versions: 2
Compression:
Stored size: 1.92 KB
Contents
class Importer

  # Data reader that pulls tabular data out of HTML documents.
  #
  # The document is parsed with Nokogiri; each <table> (or each node matched
  # by a caller-supplied CSS selector) is flattened into an array of rows,
  # where every row is an array of stripped cell text.
  class HtmlReader < DataReader

    # Registers this reader under the :html format and declares that it can
    # read from both file paths and streams.
    #
    # importer - the owning importer, passed through to DataReader.
    def initialize(importer)
      super(importer, :html)
      supports_file!
      supports_stream!
      @tables = nil
    end

    # Parses the HTML source and stores the resulting document in @html.
    #
    # mode   - :stream (source is handed to Nokogiri as-is) or :file
    #          (source is a path that is opened and parsed).
    # source - the stream or file path to parse.
    #
    # Returns true when a document was parsed, false otherwise. Every failure
    # is reported through add_error rather than raised.
    def init_source(mode, source)
      if mode == :stream
        @html = Nokogiri::HTML(source)
      elsif mode == :file
        # Block form guarantees the file handle is closed after parsing.
        @html = File.open(source) { |f| Nokogiri::HTML(f) }
      else
        add_error("Unsupported HTML mode: #{mode}")
        return false
      end

      if @html
        true
      else
        add_error("Failed parsing of HTML")
        false
      end

    rescue StandardError => e
      # StandardError (not Exception) so that interrupts, exit requests and
      # out-of-memory conditions still propagate to the caller.
      add_error("Error reading HTML source #{source}: #{e}")
      false
    end

    # Walks the tables selected by +scopes+ and hands each one's rows to the
    # block until the block returns a truthy value.
    #
    # scopes - array of CSS selectors identifying candidate tables; nil or
    #          empty means "every <table> in the document".
    # block  - called once per matched node with an array of rows. Each row
    #          is an array of stripped cell strings; a cell with colspan=N
    #          is followed by N-1 nil placeholders so columns stay aligned.
    #
    # Errors raised while scanning are reported through add_error.
    def load_raw(scopes, &block)
      # Default to searching all tables in the document
      if scopes.nil? || scopes.empty?
        scopes = ['table']
      end

      # catch/throw lets us leave all of the nested loops at once as soon as
      # the block accepts a table.
      catch(:found) do
        # Run each scope, which should be a valid css selector
        scopes.each do |scope|
          @html.css(scope).each do |table_node|
            rows = []
            table_node.css('tr').each do |row_node|
              row = []
              row_node.children.each do |cell_node|
                # Skip text nodes and any non-cell elements inside the row.
                if ['th', 'td'].include?(cell_node.name)
                  row << cell_node.text.strip
                  # Pad spanned columns with nil; a missing colspan yields
                  # nil.to_i == 0, and a negative count makes #times a no-op.
                  span_count = cell_node.attr('colspan')
                  (span_count.to_i - 1).times do
                    row << nil
                  end
                end
              end
              rows << row
            end
            found = block.call(rows)
            throw(:found, true) if found
          end
        end
      end

    rescue StandardError => e
      # Defensive catch-all for unexpected scanning failures (e.g. an invalid
      # selector). Narrowed to StandardError so signals and exit requests are
      # never converted into import errors.
      add_error("Error loading tables #{scopes.list_join(', ')}: #{e}")
    end

  end

end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
iron-import-0.8.1 | lib/iron/import/html_reader.rb |
iron-import-0.8.0 | lib/iron/import/html_reader.rb |