Sha256: c08b80ad4e552cb379f501778e102a6ecf78bad57f9dc6a9c5fa89e5b799c665

Contents?: true

Size: 1.23 KB

Versions: 7

Compression:

Stored size: 1.23 KB

Contents

require 'nokogiri'
require 'cgi'
class RemoteTable
  class Format
    # Table format that pulls rows out of an HTML document. Rows are selected
    # with +t.properties.row_xpath+ and the cells of each row with
    # +t.properties.column_xpath+.
    class HTML < Format
      include Textual

      # Yields one Hash per data row, mapping header => cell text.
      #
      # Headers come from +t.properties.headers+ when that is an Array;
      # otherwise the first matched row is consumed as the header row and is
      # not yielded. A row is yielded when +t.properties.keep_blank_rows+ is
      # truthy or when at least one of its values is +present?+.
      #
      # NOTE(review): +backup_file!+, +convert_file_to_utf8!+,
      # +remove_useless_characters!+ and +restore_file!+ are not defined in
      # this class -- presumably they come from Textual/Format; confirm there.
      # +restore_file!+ runs from the +ensure+ clause, so it is invoked whether
      # or not iteration raises.
      def each(&blk)
        backup_file!
        convert_file_to_utf8!
        remove_useless_characters!

        # Left as nil unless explicit headers were supplied as an Array.
        headers = t.properties.headers if t.properties.headers.is_a?(::Array)

        document = ::Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8')
        document.xpath(t.properties.row_xpath).each do |row|
          # Collapse every run of whitespace inside a cell to a single space.
          cells = row.xpath(t.properties.column_xpath).map do |cell|
            cell.content.gsub(/\s+/, ' ').strip
          end

          if headers.nil?
            # No explicit headers: the first row names the columns.
            headers = cells
          else
            record = zip(headers, cells)
            if t.properties.keep_blank_rows || record.any? { |_header, value| value.present? }
              yield record
            end
          end
        end
      ensure
        restore_file!
      end

      private

      # Pairs each key with the value at the same position and returns the
      # result as a Hash. Values beyond the last key are dropped; keys without
      # a matching value map to nil.
      # Idea from http://snippets.dzone.com/posts/show/406
      def zip(keys, values)
        ::Hash[keys.zip(values)]
      end

      # Reads the local file, runs it through CGI.unescapeHTML and deletes any
      # remaining literal "&shy;" entities (soft hyphens, typical of MS Office
      # output).
      # Open question from the original author: should this be done in Ruby?
      def unescaped_html_without_soft_hyphens
        raw_html = ::IO.read(t.local_file.path)
        ::CGI.unescapeHTML(raw_html).gsub(/&shy;/, '')
      end
    end
  end
end

Version data entries

7 entries across 7 versions & 1 rubygem

Version Path
remote_table-1.1.2 lib/remote_table/format/html.rb
remote_table-1.1.1 lib/remote_table/format/html.rb
remote_table-1.1.0 lib/remote_table/format/html.rb
remote_table-1.0.3 lib/remote_table/format/html.rb
remote_table-1.0.2 lib/remote_table/format/html.rb
remote_table-1.0.1 lib/remote_table/format/html.rb
remote_table-1.0.0 lib/remote_table/format/html.rb