lib/dullard/reader.rb in dullard-0.1.0 vs lib/dullard/reader.rb in dullard-0.2.0

- old
+ new

@@ -1,21 +1,26 @@ require 'zip/filesystem' require 'nokogiri' -module Dullard; end +module Dullard + class Error < StandardError; end + OOXMLEpoch = DateTime.new(1899,12,30) + SharedStringPath = 'xl/sharedStrings.xml' + StylesPath = 'xl/styles.xml' +end class Dullard::Workbook # Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb) # Some additional formats added by Paul Hendryx (phendryx@gmail.com) that are common in LibreOffice. FORMATS = { 'general' => :float, '0' => :float, '0.00' => :float, '#,##0' => :float, '#,##0.00' => :float, - '0%' => :percentage, - '0.00%' => :percentage, + '0%' => :float, + '0.00%' => :float, '0.00E+00' => :float, '# ?/?' => :float, #??? TODO: '# ??/??' => :float, #??? TODO: 'mm-dd-yy' => :date, 'd-mmm-yy' => :date, @@ -76,28 +81,46 @@ 49 => '@', } def initialize(file, user_defined_formats = {}) @file = file - @zipfs = Zip::File.open(@file) + begin + @zipfs = Zip::File.open(@file) + rescue Zip::Error => e + raise Dullard::Error, e.message + end @user_defined_formats = user_defined_formats read_styles end def sheets - workbook = Nokogiri::XML::Document.parse(@zipfs.file.open("xl/workbook.xml")) - @sheets = workbook.css("sheet").each_with_index.map {|n,i| Dullard::Sheet.new(self, n.attr("name"), n.attr("sheetId"), i+1) } + begin + workbook = Nokogiri::XML::Document.parse(@zipfs.file.open('xl/workbook.xml')) + rescue Zip::Error + raise Dullard::Error, 'Invalid file, could not open xl/workbook.xml' + end + @sheets = workbook.css('sheet').each_with_index.map do |n, i| + Dullard::Sheet.new(self, n.attr('name'), n.attr('sheetId'), i+1) + end end def string_table - @string_tabe ||= read_string_table + @string_table ||= read_string_table end def read_string_table - @string_table = [] + return [] unless @zipfs.file.exist? Dullard::SharedStringPath + + begin + shared_string = @zipfs.file.open(Dullard::SharedStringPath) + rescue Zip::Error + raise Dullard::Error, 'Invalid file, could not open shared string file.' + end + entry = '' - Nokogiri::XML::Reader(@zipfs.file.open("xl/sharedStrings.xml")).each do |node| + @string_table = [] + Nokogiri::XML::Reader(shared_string).each do |node| if node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT entry = '' elsif node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT @string_table << entry elsif node.value? @@ -106,24 +129,33 @@ end @string_table end def read_styles - doc = Nokogiri::XML(@zipfs.file.open("xl/styles.xml")) - @num_formats = {} @cell_xfs = [] + return unless @zipfs.file.exist? Dullard::StylesPath + + begin + doc = Nokogiri::XML(@zipfs.file.open(Dullard::StylesPath)) + rescue Zip::Error + raise Dullard::Error, 'Invalid file, could not open styles' + end doc.css('/styleSheet/numFmts/numFmt').each do |numFmt| - numFmtId = numFmt.attributes['numFmtId'].value.to_i - formatCode = numFmt.attributes['formatCode'].value - @num_formats[numFmtId] = formatCode + if numFmt.attributes['numFmtId'] && numFmt.attributes['formatCode'] + numFmtId = numFmt.attributes['numFmtId'].value.to_i + formatCode = numFmt.attributes['formatCode'].value + @num_formats[numFmtId] = formatCode + end end doc.css('/styleSheet/cellXfs/xf').each do |xf| - numFmtId = xf.attributes['numFmtId'].value.to_i - @cell_xfs << numFmtId + if xf.attributes['numFmtId'] + numFmtId = xf.attributes['numFmtId'].value.to_i + @cell_xfs << numFmtId + end end end # Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb) @@ -166,79 +198,94 @@ def initialize(workbook, name, id, index) @workbook = workbook @name = name @id = id @index = index - @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path) + begin + @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path) + rescue Zip::Error => e + raise Dullard::Error, "Couldn't open sheet #{index}: #{e.message}" + end end def string_lookup(i) - @workbook.string_table[i] + @workbook.string_table[i] || (raise Dullard::Error, 'File invalid, invalid string table.') end def rows Enumerator.new(row_count) do |y| next unless @file @file.rewind shared = false row = nil + cell_map = nil # Map of column letter to cell value for a row column = nil cell_type = nil Nokogiri::XML::Reader(@file).each do |node| case node.node_type when Nokogiri::XML::Reader::TYPE_ELEMENT case node.name when "row" - row = [] - column = 0 + cell_map = {} next - when "c" - if node.attributes['t'] != 's' && node.attributes['t'] != 'b' + when 'c' + node_type = node.attributes['t'] + cell_index = node.attributes['r'] + if !cell_index + raise Dullard::Error, 'Invalid spreadsheet XML.' + end + + if node_type != 's' && node_type != 'b' cell_format_index = node.attributes['s'].to_i cell_type = @workbook.format2type(@workbook.attribute2format(cell_format_index)) end - rcolumn = node.attributes["r"] - if rcolumn - rcolumn.delete!("0-9") - while column < self.class.column_names.size and rcolumn != self.class.column_names[column] - row << nil - column += 1 - end - end - shared = (node.attribute("t") == "s") - column += 1 + column = cell_index.delete('0-9') + shared = (node_type == 's') next end when Nokogiri::XML::Reader::TYPE_END_ELEMENT - if node.name == "row" - y << row - next + if node.name == 'row' + y << process_row(cell_map) end + next end - value = node.value - if value + if node.value + value = (shared ? string_lookup(value.to_i) : value) case cell_type when :datetime when :time when :date - value = (DateTime.new(1899,12,30) + value.to_f) - when :percentage # ? TODO + value = (Dullard::OOXMLEpoch + node.value.to_f) when :float - value = value.to_f + value = node.value.to_f else # leave as string end cell_type = nil - - row << (shared ? string_lookup(value.to_i) : value) + cell_map[column] = value end end end end + def process_row(cell_map) + max = cell_map.keys.map {|c| self.class.column_name_to_index c }.max + row = [] + self.class.column_names[0..max].each do |col| + if self.class.column_name_to_index(col) > max + break + else + row << cell_map[col] + end + end + row + end + + + # Returns A to ZZZ. def self.column_names if @column_names @column_names else @@ -250,22 +297,32 @@ z = y.map(&proc).flatten @column_names = x + y + z end end + def self.column_name_to_index(name) + if not @column_names_to_indices + @column_names_to_indices = {} + self.column_names.each_with_index do |name, i| + @column_names_to_indices[name] = i + end + end + @column_names_to_indices[name] + end + def row_count if defined? @row_count @row_count elsif @file @file.rewind Nokogiri::XML::Reader(@file).each do |node| if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT case node.name - when "dimension" + when 'dimension' if ref = node.attributes["ref"] break @row_count = ref.scan(/\d+$/).first.to_i end - when "sheetData" + when 'sheetData' break @row_count = nil end end end end