Sha256: 643cfdebc50b598c339c1272004d6c4b8bd1f9aab845920da07ff94f7d172be7

Contents?: true

Size: 1.92 KB

Versions: 2

Compression:

Stored size: 1.92 KB

Contents

class Importer

  # Data reader that extracts raw rows from the tables of an HTML document.
  # The document is parsed with Nokogiri from either a stream or a file path,
  # then each table matched by a CSS selector ("scope") is converted to an
  # array of rows (each row an array of stripped cell strings) and offered to
  # the caller's block until one is accepted.
  class HtmlReader < DataReader

    # Element names treated as table cells when walking a row's children.
    CELL_TAGS = %w[th td].freeze

    # Registers this reader under the :html format and declares support for
    # both file and stream sources (via the helpers inherited from DataReader).
    def initialize(importer)
      super(importer, :html)
      supports_file!
      supports_stream!
      @tables = nil
    end

    # Parses the given source into @html.
    #
    # mode   - :stream (source is handed straight to Nokogiri::HTML) or
    #          :file (source is a path that is opened and parsed).
    # source - the stream/string or file path to read.
    #
    # Returns true when a document was parsed; otherwise records an error via
    # add_error and returns false.
    def init_source(mode, source)
      case mode
      when :stream
        @html = Nokogiri::HTML(source)
      when :file
        @html = File.open(source) { |f| Nokogiri::HTML(f) }
      else
        add_error("Unsupported HTML mode: #{mode}")
        return false
      end

      return true if @html

      add_error("Failed parsing of HTML")
      false
    rescue StandardError => e
      # StandardError rather than Exception, so interrupts and exit requests
      # are not swallowed and misreported as parse failures.
      add_error("Error reading HTML source #{source}: #{e}")
      false
    end

    # Walks the tables selected by each scope and yields every table's rows
    # to the block, stopping at the first table for which the block returns a
    # truthy value.
    #
    # scopes - array of CSS selectors; nil or empty means every 'table'.
    # block  - called with an array of rows per table; a truthy return value
    #          accepts that table and ends the search.
    #
    # Returns true when the block accepted a table, false when none was
    # accepted or an error occurred (the error is recorded via add_error).
    def load_raw(scopes, &block)
      # Default to searching all tables in the document
      scopes = ['table'] if scopes.nil? || scopes.empty?

      # catch/throw lets us leave both nested loops as soon as a table is accepted
      catch(:found) do
        scopes.each do |scope|
          @html.css(scope).each do |table_node|
            rows = table_node.css('tr').map { |row_node| extract_row(row_node) }
            throw(:found, true) if block.call(rows)
          end
        end
        # Nothing accepted: report that explicitly instead of leaking the
        # (truthy) scopes array that `each` would otherwise return.
        false
      end
    rescue StandardError => e
      add_error("Error loading tables #{scopes.list_join(', ')}: #{e}")
      false
    end

    private

    # Converts one <tr> node into an array of cell values, considering only
    # direct th/td children of the row.
    def extract_row(row_node)
      row_node.children.each_with_object([]) do |cell_node, row|
        next unless CELL_TAGS.include?(cell_node.name)

        row << cell_node.text.strip
        # A cell with colspan=N fills N slots: its text plus N-1 nil
        # placeholders. A missing colspan converts to 0, so nothing is added.
        span_count = cell_node.attr('colspan').to_i
        (span_count - 1).times { row << nil }
      end
    end

  end

end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
iron-import-0.8.1 lib/iron/import/html_reader.rb
iron-import-0.8.0 lib/iron/import/html_reader.rb