Sha256: ce7b72958c2c63fcc67fba7d6891d075123e5f9c98ef0254ed7e290c82db9f8d

Contents?: true

Size: 1.19 KB

Versions: 2

Compression:

Stored size: 1.19 KB

Contents

require 'nokogiri'
require 'cgi'
class RemoteTable
  class Format
    # Table format that reads rows out of an HTML document.
    #
    # Rows and cells are located with the XPath expressions found in
    # +t.properties.row_xpath+ and +t.properties.column_xpath+. The +t+
    # accessor, +utf8+ and +remove_useless_characters!+ are not defined in
    # this file (presumably they come from Format / Textual).
    class HTML < Format
      include Textual

      # Parses the local file and yields one Hash per table row.
      #
      # Headers are taken from +t.properties.headers+ when that is an Array;
      # otherwise the first row matched by +row_xpath+ is consumed as the
      # header row and is not yielded. Rows whose values are all blank are
      # skipped unless +t.properties.keep_blank_rows+ is truthy.
      #
      # The local file is deleted once iteration ends, even when an exception
      # is raised part-way through.
      #
      # The +blk+ parameter is kept for interface compatibility; rows are
      # handed to the caller's block via +yield+.
      #
      # Yields a Hash mapping header => cell text, where runs of whitespace
      # in the cell text are collapsed to a single space and the ends are
      # stripped.
      def each(&blk)
        remove_useless_characters!
        configured_headers = t.properties.headers
        html_headers = configured_headers.is_a?(::Array) ? configured_headers : nil
        document = ::Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8')
        document.xpath(t.properties.row_xpath).each do |row|
          values = row.xpath(t.properties.column_xpath).map do |cell|
            cell.content.gsub(/\s+/, ' ').strip
          end
          # No explicit headers were configured: the first row supplies them.
          if html_headers.nil?
            html_headers = values
            next
          end
          hash = zip(html_headers, values)
          yield hash if t.properties.keep_blank_rows || hash.values.any? { |v| v.present? }
        end
      ensure
        t.local_file.delete
      end

      private

      # Pairs each key with the value at the same position.
      #
      # Follows Array#zip semantics: surplus values are dropped and missing
      # values become nil. When a key repeats, the last value wins.
      #
      # Returns a new Hash.
      def zip(keys, values)
        ::Hash[keys.zip(values)]
      end

      # Reads the local file, passes it through +utf8+, HTML-unescapes the
      # whole document and removes every literal "&shy;" entity (typically
      # left behind by MS Office exports).
      #
      # should we be doing this in ruby?
      #
      # NOTE(review): numeric references such as "&#173;" are presumably
      # decoded by CGI.unescapeHTML into a literal soft-hyphen character,
      # which the substitution below does not remove - confirm whether that
      # matters for the documents being parsed.
      #
      # Returns the cleaned-up HTML as a String.
      def unescaped_html_without_soft_hyphens
        raw = utf8(::IO.read(t.local_file.path))
        ::CGI.unescapeHTML(raw).gsub('&shy;', '')
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
remote_table-1.1.10 lib/remote_table/format/html.rb
remote_table-1.1.9 lib/remote_table/format/html.rb