lib/roo/excelx.rb in roo-1.13.2 vs lib/roo/excelx.rb in roo-2.0.0beta1

- old
+ new

@@ -1,10 +1,20 @@ require 'date' require 'nokogiri' -require 'spreadsheet' +require 'roo/link' +require 'roo/utils' +require 'zip/filesystem' class Roo::Excelx < Roo::Base + autoload :Workbook, 'roo/excelx/workbook' + autoload :SharedStrings, 'roo/excelx/shared_strings' + autoload :Styles, 'roo/excelx/styles' + + autoload :Relationships, 'roo/excelx/relationships' + autoload :Comments, 'roo/excelx/comments' + autoload :SheetDoc, 'roo/excelx/sheet_doc' + module Format EXCEPTIONAL_FORMATS = { 'h:mm am/pm' => :date, 'h:mm:ss am/pm' => :date, } @@ -44,11 +54,11 @@ format = format.to_s.downcase if type = EXCEPTIONAL_FORMATS[format] type elsif format.include?('#') :float - elsif format.include?('d') || format.include?('y') + elsif !format.match(/d+(?![\]])/).nil? || format.include?('y') if format.include?('h') || format.include?('s') :datetime else :date end @@ -62,144 +72,318 @@ end module_function :to_type end - # initialization and opening of a spreadsheet file - # values for packed: :zip - def initialize(filename, options = {}, deprecated_file_warning = :error) - if Hash === options - packed = options[:packed] - file_warning = options[:file_warning] || :error - else - warn 'Supplying `packed` or `file_warning` as separate arguments to `Roo::Excelx.new` is deprecated. Use an options hash instead.' - packed = options - file_warning = deprecated_file_warning + class Cell + attr_reader :type, :formula, :value, :excelx_type, :excelx_value, :style, :hyperlink, :coordinate + attr_writer :value + + def initialize(value, type, formula, excelx_type, excelx_value, style, hyperlink, base_date, coordinate) + @type = type + @formula = formula + @base_date = base_date if [:date, :datetime].include?(@type) + @excelx_type = excelx_type + @excelx_value = excelx_value + @style = style + @value = type_cast_value(value) + @value = Roo::Link.new(hyperlink, @value.to_s) if hyperlink + @coordinate = coordinate end - file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed) - make_tmpdir do |tmpdir| - filename = download_uri(filename, tmpdir) if uri?(filename) - filename = unzip(filename, tmpdir) if packed == :zip - @filename = filename - unless File.file?(@filename) - raise IOError, "file #{@filename} does not exist" + def type + if @formula + :formula + elsif @value.is_a?(Roo::Link) + :link + else + @type end - @comments_files = Array.new - @rels_files = Array.new - extract_content(tmpdir, @filename) - @workbook_doc = load_xml(File.join(tmpdir, "roo_workbook.xml")) - @shared_table = [] - if File.exist?(File.join(tmpdir, 'roo_sharedStrings.xml')) - @sharedstring_doc = load_xml(File.join(tmpdir, 'roo_sharedStrings.xml')) - read_shared_strings(@sharedstring_doc) + end + + class Coordinate + attr_accessor :row, :column + + def initialize(row, column) + @row, @column = row, column end - @styles_table = [] - @style_definitions = Array.new # TODO: ??? { |h,k| h[k] = {} } - if File.exist?(File.join(tmpdir, 'roo_styles.xml')) - @styles_doc = load_xml(File.join(tmpdir, 'roo_styles.xml')) - read_styles(@styles_doc) + end + + private + + def type_cast_value(value) + case @type + when :float, :percentage + value.to_f + when :date + yyyy,mm,dd = (@base_date+value.to_i).strftime("%Y-%m-%d").split('-') + Date.new(yyyy.to_i,mm.to_i,dd.to_i) + when :datetime + create_datetime_from((@base_date+value.to_f.round(6)).strftime("%Y-%m-%d %H:%M:%S.%N")) + when :time + value.to_f*(24*60*60) + when :string + value + else + value end - @sheet_doc = load_xmls(@sheet_files) - @comments_doc = load_xmls(@comments_files) - @rels_doc = load_xmls(@rels_files) end - super(filename, options) - @formula = Hash.new - @excelx_type = Hash.new - @excelx_value = Hash.new - @s_attribute = Hash.new # TODO: ggf. wieder entfernen nur lokal benoetigt - @comment = Hash.new - @comments_read = Hash.new - @hyperlink = Hash.new - @hyperlinks_read = Hash.new + + def create_datetime_from(datetime_string) + date_part,time_part = round_time_from(datetime_string).split(' ') + yyyy,mm,dd = date_part.split('-') + hh,mi,ss = time_part.split(':') + DateTime.civil(yyyy.to_i,mm.to_i,dd.to_i,hh.to_i,mi.to_i,ss.to_i) + end + + def round_time_from(datetime_string) + date_part,time_part = datetime_string.split(' ') + yyyy,mm,dd = date_part.split('-') + hh,mi,ss = time_part.split(':') + Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0).strftime("%Y-%m-%d %H:%M:%S") + end end - def method_missing(m,*args) - # is method name a label name - read_labels - if @label.has_key?(m.to_s) - sheet ||= @default_sheet - read_cells(sheet) - row,col = label(m.to_s) - cell(row,col) + class Sheet + def initialize(name, rels_path, sheet_path, comments_path, styles, shared_strings, workbook, options = {}) + @name = name + @rels = Relationships.new(rels_path) + @comments = Comments.new(comments_path) + @styles = styles + @sheet = SheetDoc.new(sheet_path, @rels, @styles, shared_strings, workbook, options) + end + + def cells + @cells ||= @sheet.cells(@rels) + end + + def present_cells + @present_cells ||= cells.select {|key, cell| cell && cell.value } + end + + # Yield each row as array of Excelx::Cell objects + # accepts options max_rows (int) (offset by 1 for header) + # and pad_cells (boolean) + def each_row(options = {}, &block) + row_count = 0 + @sheet.each_row_streaming do |row| + break if options[:max_rows] && row_count == options[:max_rows] + 1 + block.call(cells_for_row_element(row, options)) if block_given? + row_count += 1 + end + end + + def row(row_number) + first_column.upto(last_column).map do |col| + cells[[row_number,col]] + end.map {|cell| cell && cell.value } + end + + def column(col_number) + first_row.upto(last_row).map do |row| + cells[[row,col_number]] + end.map {|cell| cell && cell.value } + end + + # returns the number of the first non-empty row + def first_row + @first_row ||= present_cells.keys.map {|row, _| row }.min + end + + def last_row + @last_row ||= present_cells.keys.map {|row, _| row }.max + end + + # returns the number of the first non-empty column + def first_column + @first_column ||= present_cells.keys.map {|_, col| col }.min + end + + # returns the number of the last non-empty column + def last_column + @last_column ||= present_cells.keys.map {|_, col| col }.max + end + + def excelx_format(key) + cell = cells[key] + @styles.style_format(cell.style).to_s if cell + end + + def hyperlinks + @hyperlinks ||= @sheet.hyperlinks(@rels) + end + + def comments + @comments.comments + end + + def dimensions + @sheet.dimensions + end + + private + + # Take an xml row and return an array of Excelx::Cell objects + # optionally pad array to header width(assumed 1st row). + # takes option pad_cells (boolean) defaults false + def cells_for_row_element(row_element, options = {}) + return [] unless row_element + cell_col = 0 + cells = [] + @sheet.each_cell(row_element) do |cell| + cells.concat(pad_cells(cell, cell_col)) if options[:pad_cells] + cells << cell + cell_col = cell.coordinate.column + end + cells + end + + def pad_cells(cell, last_column) + pad = [] + (cell.coordinate.column - 1 - last_column).times { pad << nil } + pad + end + end + + ExceedsMaxError = Class.new(StandardError) + + # initialization and opening of a spreadsheet file + # values for packed: :zip + # optional cell_max (int) parameter for early aborting attempts to parse + # enormous documents. + def initialize(filename, options = {}) + packed = options[:packed] + file_warning = options.fetch(:file_warning, :error) + cell_max = options.delete(:cell_max) + sheet_options = {} + sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false) + + file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed) + + @tmpdir = make_tmpdir(filename.split('/').last, options[:tmpdir_root]) + @filename = local_filename(filename, @tmpdir, packed) + @comments_files = [] + @rels_files = [] + process_zipfile(@tmpdir, @filename) + + @sheet_names = workbook.sheets.map do |sheet| + unless options[:only_visible_sheets] && sheet['state'] == 'hidden' + sheet['name'] + end + end.compact + @sheets = [] + @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n| + @sheets[n] = Sheet.new(sheet_name, @rels_files[n], @sheet_files[n], @comments_files[n], styles, shared_strings, workbook, sheet_options) + [sheet_name, @sheets[n]] + end] + + if cell_max + cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions) + raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max + end + + super + end + + def method_missing(method,*args) + if label = workbook.defined_names[method.to_s] + safe_send(sheet_for(label.sheet).cells[label.key], :value) else # call super for methods like #a1 super end end + def sheets + @sheet_names + end + + def sheet_for(sheet) + sheet ||= default_sheet + validate_sheet!(sheet) + @sheets_by_name[sheet] + end + # Returns the content of a spreadsheet-cell. # (1,1) is the upper left corner. # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the # cell at the first line and first row. def cell(row, col, sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - if celltype(row,col,sheet) == :date - yyyy,mm,dd = @cell[sheet][[row,col]].split('-') - return Date.new(yyyy.to_i,mm.to_i,dd.to_i) - elsif celltype(row,col,sheet) == :datetime - date_part,time_part = @cell[sheet][[row,col]].split(' ') - yyyy,mm,dd = date_part.split('-') - hh,mi,ss = time_part.split(':') - return DateTime.civil(yyyy.to_i,mm.to_i,dd.to_i,hh.to_i,mi.to_i,ss.to_i) + key = normalize(row,col) + safe_send(sheet_for(sheet).cells[key], :value) + end + + def row(rownumber,sheet=nil) + sheet_for(sheet).row(rownumber) + end + + # returns all values in this column as an array + # column numbers are 1,2,3,... like in the spreadsheet + def column(column_number,sheet=nil) + if column_number.is_a?(::String) + column_number = ::Roo::Utils.letter_to_number(column_number) end - @cell[sheet][[row,col]] + sheet_for(sheet).column(column_number) end + # returns the number of the first non-empty row + def first_row(sheet=nil) + sheet_for(sheet).first_row + end + + # returns the number of the last non-empty row + def last_row(sheet=nil) + sheet_for(sheet).last_row + end + + # returns the number of the first non-empty column + def first_column(sheet=nil) + sheet_for(sheet).first_column + end + + # returns the number of the last non-empty column + def last_column(sheet=nil) + sheet_for(sheet).last_column + end + + # set a cell to a certain value + # (this will not be saved back to the spreadsheet file!) + def set(row,col,value, sheet = nil) #:nodoc: + key = normalize(row,col) + cell_type = cell_type_by_value(value) + sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Cell::Coordinate.new(row, col)) + end + + # Returns the formula at (row,col). # Returns nil if there is no formula. # The method #formula? checks if there is a formula. def formula(row,col,sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - @formula[sheet][[row,col]] && @formula[sheet][[row,col]] + key = normalize(row,col) + safe_send(sheet_for(sheet).cells[key], :formula) end - alias_method :formula?, :formula - # returns each formula in the selected sheet as an array of elements - # [row, col, formula] - def formulas(sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - if @formula[sheet] - @formula[sheet].each.collect do |elem| - [elem[0][0], elem[0][1], elem[1]] - end - else - [] - end + # Predicate methods really should return a boolean + # value. Hopefully no one was relying on the fact that this + # previously returned either nil/formula + def formula?(*args) + !!formula(*args) end - class Font - attr_accessor :bold, :italic, :underline - - def bold? - @bold == true + # returns each formula in the selected sheet as an array of tuples in following format + # [[row, col, formula], [row, col, formula],...] + def formulas(sheet=nil) + sheet_for(sheet).cells.select {|_, cell| cell.formula }.map do |(x, y), cell| + [x, y, cell.formula] end - - def italic? - @italic == true - end - - def underline? - @underline == true - end end # Given a cell, return the cell's style def font(row, col, sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - s_attribute = @s_attribute[sheet][[row,col]] - s_attribute ||= 0 - s_attribute = s_attribute.to_i - @style_definitions[s_attribute] + key = normalize(row,col) + definition_index = safe_send(sheet_for(sheet).cells[key], :style) + styles.definitions[definition_index] if definition_index end # returns the type of a cell: # * :float # * :string, @@ -207,468 +391,252 @@ # * :percentage # * :formula # * :time # * :datetime def celltype(row,col,sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - if @formula[sheet][[row,col]] - return :formula - else - @cell_type[sheet][[row,col]] - end + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :type) end # returns the internal type of an excel cell # * :numeric_or_formula # * :string # Note: this is only available within the Excelx class def excelx_type(row,col,sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - return @excelx_type[sheet][[row,col]] + key = normalize(row,col) + safe_send(sheet_for(sheet).cells[key], :excelx_type) end # returns the internal value of an excelx cell # Note: this is only available within the Excelx class def excelx_value(row,col,sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - return @excelx_value[sheet][[row,col]] + key = normalize(row,col) + safe_send(sheet_for(sheet).cells[key], :excelx_value) end # returns the internal format of an excel cell def excelx_format(row,col,sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - row,col = normalize(row,col) - s = @s_attribute[sheet][[row,col]] - attribute2format(s).to_s + key = normalize(row,col) + sheet_for(sheet).excelx_format(key) end - # returns an array of sheet names in the spreadsheet - def sheets - @workbook_doc.xpath("//xmlns:sheet").map do |sheet| - sheet['name'] - end + def empty?(row,col,sheet=nil) + sheet = sheet_for(sheet) + key = normalize(row,col) + cell = sheet.cells[key] + !cell || !cell.value || (cell.type == :string && cell.value.empty?) \ + || (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column) end # shows the internal representation of all cells # for debugging purposes def to_s(sheet=nil) - sheet ||= @default_sheet - read_cells(sheet) - @cell[sheet].inspect + sheet_for(sheet).cells.inspect end # returns the row,col values of the labelled cell # (nil,nil) if label is not defined - def label(labelname) - read_labels - if @label.empty? || !@label.has_key?(labelname) - return nil,nil,nil + def label(name) + labels = workbook.defined_names + if labels.empty? || !labels.key?(name) + [nil,nil,nil] else - return @label[labelname][1].to_i, - Roo::Base.letter_to_number(@label[labelname][2]), - @label[labelname][0] + [labels[name].row, + labels[name].col, + labels[name].sheet] end end # Returns an array which all labels. Each element is an array with # [labelname, [row,col,sheetname]] def labels - # sheet ||= @default_sheet - # read_cells(sheet) - read_labels - @label.map do |label| - [ label[0], # name - [ label[1][1].to_i, # row - Roo::Base.letter_to_number(label[1][2]), # column - label[1][0], # sheet + @labels ||= workbook.defined_names.map do |name, label| + [ name, + [ label.row, + label.col, + label.sheet, ] ] end end def hyperlink?(row,col,sheet=nil) - hyperlink(row, col, sheet) != nil + !!hyperlink(row, col, sheet) end # returns the hyperlink at (row/col) # nil if there is no hyperlink def hyperlink(row,col,sheet=nil) - sheet ||= @default_sheet - read_hyperlinks(sheet) unless @hyperlinks_read[sheet] - row,col = normalize(row,col) - return nil unless @hyperlink[sheet] - @hyperlink[sheet][[row,col]] + key = normalize(row,col) + sheet_for(sheet).hyperlinks[key] end # returns the comment at (row/col) # nil if there is no comment def comment(row,col,sheet=nil) - sheet ||= @default_sheet - #read_cells(sheet) - read_comments(sheet) unless @comments_read[sheet] - row,col = normalize(row,col) - return nil unless @comment[sheet] - @comment[sheet][[row,col]] + key = normalize(row,col) + sheet_for(sheet).comments[key] end # true, if there is a comment def comment?(row,col,sheet=nil) - sheet ||= @default_sheet - # read_cells(sheet) - read_comments(sheet) unless @comments_read[sheet] - row,col = normalize(row,col) - comment(row,col) != nil + !!comment(row,col,sheet) end - # returns each comment in the selected sheet as an array of elements - # [row, col, comment] def comments(sheet=nil) - sheet ||= @default_sheet - read_comments(sheet) unless @comments_read[sheet] - if @comment[sheet] - @comment[sheet].each.collect do |elem| - [elem[0][0],elem[0][1],elem[1]] - end - else - [] + sheet_for(sheet).comments.map do |(x, y), comment| + [x, y, comment] end end + # Yield an array of Excelx::Cell + # Takes options for sheet, pad_cells, and max_rows + def each_row_streaming(options={}) + sheet_for(options.delete(:sheet)).each_row(options) { |row| yield row } + end + private - def load_xmls(paths) - paths.compact.map do |item| - load_xml(item) + def clean_sheet(sheet) + @sheets_by_name[sheet].cells.each_pair do |coord, value| + next unless value.value.is_a?(::String) + + @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value) end + + @cleaned[sheet] = true end - # helper function to set the internal representation of cells - def set_cell_values(sheet,x,y,i,v,value_type,formula, - excelx_type=nil, - excelx_value=nil, - s_attribute=nil) - key = [y,x+i] - @cell_type[sheet] ||= {} - @cell_type[sheet][key] = value_type - @formula[sheet] ||= {} - @formula[sheet][key] = formula if formula - @cell[sheet] ||= {} - @cell[sheet][key] = - case @cell_type[sheet][key] - when :float - v.to_f - when :string - v - when :date - (base_date+v.to_i).strftime("%Y-%m-%d") - when :datetime - (base_date+v.to_f).strftime("%Y-%m-%d %H:%M:%S") - when :percentage - v.to_f - when :time - v.to_f*(24*60*60) - else - v - end + # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx + # documents require a workbook.xml file, so a if the file is missing + # it is not a valid xlsx file. In these cases, an ArgumentError is + # raised. + # + # wb - a Zip::Entry for the workbook.xml file. + # path - A String for Zip::Entry's destination path. + # + # Examples + # + # extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml') + # # => ["rId1", "rId2", "rId3"] + # + # Returns an Array of Strings. + def extract_worksheet_ids(entries, path) + wb = entries.find { |e| e.name[/workbook.xml$/] } + fail ArgumentError 'missing required workbook file' if wb.nil? - @cell[sheet][key] = Spreadsheet::Link.new(@hyperlink[sheet][key], @cell[sheet][key].to_s) if hyperlink?(y,x+i) - @excelx_type[sheet] ||= {} - @excelx_type[sheet][key] = excelx_type - @excelx_value[sheet] ||= {} - @excelx_value[sheet][key] = excelx_value - @s_attribute[sheet] ||= {} - @s_attribute[sheet][key] = s_attribute + wb.extract(path) + workbook_doc = Roo::Utils.load_xml(path).remove_namespaces! + workbook_doc.xpath('//sheet').map{ |s| s.attributes['id'].value } end - # read all cells in the selected sheet - def read_cells(sheet=nil) - sheet ||= @default_sheet - validate_sheet!(sheet) - return if @cells_read[sheet] + # Internal + # + # wb_rels - A Zip::Entry for the workbook.xml.rels file. + # path - A String for the Zip::Entry's destination path. + # + # Examples + # + # extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels') + # # => { + # "rId1"=>"worksheets/sheet1.xml", + # "rId2"=>"worksheets/sheet2.xml", + # "rId3"=>"worksheets/sheet3.xml" + # } + # + # Returns a Hash. + def extract_worksheet_rels(entries, path) + wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] } + fail ArgumentError 'missing required workbook file' if wb_rels.nil? - @sheet_doc[sheets.index(sheet)].xpath("/xmlns:worksheet/xmlns:sheetData/xmlns:row/xmlns:c").each do |c| - s_attribute = c['s'].to_i # should be here - # c: <c r="A5" s="2"> - # <v>22606</v> - # </c>, format: , tmp_type: float - value_type = - case c['t'] - when 's' - :shared - when 'b' - :boolean - # 2011-02-25 BEGIN - when 'str' - :string - # 2011-02-25 END - # 2011-09-15 BEGIN - when 'inlineStr' - :inlinestr - # 2011-09-15 END - else - format = attribute2format(s_attribute) - Format.to_type(format) - end - formula = nil - c.children.each do |cell| - case cell.name - when 'is' - cell.children.each do |is| - if is.name == 't' - inlinestr_content = is.content - value_type = :string - v = inlinestr_content - excelx_type = :string - y, x = Roo::Base.split_coordinate(c['r']) - excelx_value = inlinestr_content #cell.content - set_cell_values(sheet,x,y,0,v,value_type,formula,excelx_type,excelx_value,s_attribute) - end - end - when 'f' - formula = cell.content - when 'v' - if [:time, :datetime].include?(value_type) && cell.content.to_f >= 1.0 - value_type = - if (cell.content.to_f - cell.content.to_f.floor).abs > 0.000001 - :datetime - else - :date - end - end - excelx_type = [:numeric_or_formula,format.to_s] - excelx_value = cell.content - v = - case value_type - when :shared - value_type = :string - excelx_type = :string - @shared_table[cell.content.to_i] - when :boolean - (cell.content.to_i == 1 ? 'TRUE' : 'FALSE') - when :date - cell.content - when :time - cell.content - when :datetime - cell.content - when :formula - cell.content.to_f #TODO: !!!! - when :string - excelx_type = :string - cell.content - else - value_type = :float - cell.content - end - y, x = Roo::Base.split_coordinate(c['r']) - set_cell_values(sheet,x,y,0,v,value_type,formula,excelx_type,excelx_value,s_attribute) - end - end + wb_rels.extract(path) + rels_doc = Roo::Utils.load_xml(path).remove_namespaces! + worksheet_type ='http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet' + + relationships = rels_doc.xpath('//Relationship').select do |relationship| + relationship.attributes['Type'].value == worksheet_type end - @cells_read[sheet] = true - # begin comments -=begin -Datei xl/comments1.xml - <?xml version="1.0" encoding="UTF-8" standalone="yes" ?> - <comments xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"> - <authors> - <author /> - </authors> - <commentList> - <comment ref="B4" authorId="0"> - <text> - <r> - <rPr> - <sz val="10" /> - <rFont val="Arial" /> - <family val="2" /> - </rPr> - <t>Kommentar fuer B4</t> - </r> - </text> - </comment> - <comment ref="B5" authorId="0"> - <text> - <r> - <rPr> - <sz val="10" /> - <rFont val="Arial" /> - <family val="2" /> - </rPr> - <t>Kommentar fuer B5</t> - </r> - </text> - </comment> - </commentList> - </comments> -=end -=begin - if @comments_doc[self.sheets.index(sheet)] - read_comments(sheet) - end -=end - #end comments - end - # Reads all comments from a sheet - def read_comments(sheet=nil) - sheet ||= @default_sheet - validate_sheet!(sheet) - n = self.sheets.index(sheet) - return unless @comments_doc[n] #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - @comments_doc[n].xpath("//xmlns:comments/xmlns:commentList/xmlns:comment").each do |comment| - ref = comment.attributes['ref'].to_s - row,col = Roo::Base.split_coordinate(ref) - comment.xpath('./xmlns:text/xmlns:r/xmlns:t').each do |text| - @comment[sheet] ||= {} - @comment[sheet][[row,col]] = text.text - end + relationships.inject({}) do |hash, relationship| + attributes = relationship.attributes + id = attributes['Id']; + hash[id.value] = attributes['Target'].value + hash end - @comments_read[sheet] = true end - # Reads all hyperlinks from a sheet - def read_hyperlinks(sheet=nil) - sheet ||= @default_sheet - validate_sheet!(sheet) - n = self.sheets.index(sheet) - if rels_doc = @rels_doc[n] - rels = Hash[rels_doc.xpath("/xmlns:Relationships/xmlns:Relationship").map do |r| - [r.attribute('Id').text, r] - end] - @sheet_doc[n].xpath("/xmlns:worksheet/xmlns:hyperlinks/xmlns:hyperlink").each do |h| - if rel_element = rels[h.attribute('id').text] - row,col = Roo::Base.split_coordinate(h.attributes['ref'].to_s) - @hyperlink[sheet] ||= {} - @hyperlink[sheet][[row,col]] = rel_element.attribute('Target').text - end - end + def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) + sheet_ids.each_with_index do |id, i| + name = sheets[id] + entry = entries.find { |entry| entry.name =~ /#{name}$/ } + path = "#{tmpdir}/roo_sheet#{i + 1}" + @sheet_files << path + entry.extract(path) end - @hyperlinks_read[sheet] = true end - def read_labels - @label ||= Hash[@workbook_doc.xpath("//xmlns:definedName").map do |defined_name| - # "Sheet1!$C$5" - sheet, coordinates = defined_name.text.split('!$', 2) - col,row = coordinates.split('$') - [defined_name['name'], [sheet,row,col]] - end] - end - # Extracts all needed files from the zip file - def process_zipfile(tmpdir, zipfilename, zip, path='') + def process_zipfile(tmpdir, zipfilename) @sheet_files = [] - Roo::ZipFile.open(zipfilename) {|zf| - zf.entries.each {|entry| - entry_name = entry.to_s.downcase + entries = Zip::File.open(zipfilename).to_a.sort_by(&:name) - path = - if entry_name.end_with?('workbook.xml') - "#{tmpdir}/roo_workbook.xml" - elsif entry_name.end_with?('sharedstrings.xml') - "#{tmpdir}/roo_sharedStrings.xml" - elsif entry_name.end_with?('styles.xml') - "#{tmpdir}/roo_styles.xml" - elsif entry_name =~ /sheet([0-9]+).xml$/ - nr = $1 - @sheet_files[nr.to_i-1] = "#{tmpdir}/roo_sheet#{nr}" - elsif entry_name =~ /comments([0-9]+).xml$/ - nr = $1 - @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}" - elsif entry_name =~ /sheet([0-9]+).xml.rels$/ - nr = $1 - @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}" - end - if path - extract_file(zip, entry, path) - end - } - } - end + # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames + # are not in order. With Numbers 3.1, the first sheet is always + # sheet.xml, not sheet1.xml. With Google, the order of the worksheets is + # independent of a worksheet's filename (i.e. sheet6.xml can be the + # first worksheet). + # + # workbook.xml lists the correct order of worksheets and + # workbook.xml.rels lists the filenames for those worksheets. + # + # workbook.xml: + # <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/> + # <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/> + # workbook.xml.rel: + # <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + # <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + sheet_ids = extract_worksheet_ids(entries, "#{tmpdir}/roo_workbook.xml") + sheets = extract_worksheet_rels(entries, "#{tmpdir}/roo_workbook.xml.rels") + extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) - def extract_file(source_zip, entry, destination_path) - open(destination_path,'wb') {|f| - f << source_zip.read(entry) - } - end - - # extract files from the zip file - def extract_content(tmpdir, zipfilename) - Roo::ZipFile.open(@filename) do |zip| - process_zipfile(tmpdir, zipfilename,zip) - end - end - - # read the shared strings xml document - def read_shared_strings(doc) - doc.xpath("/xmlns:sst/xmlns:si").each do |si| - shared_table_entry = '' - si.children.each do |elem| - if elem.name == 'r' and elem.children - elem.children.each do |r_elem| - if r_elem.name == 't' - shared_table_entry << r_elem.content - end - end - end - if elem.name == 't' - shared_table_entry = elem.content - end + entries.each do |entry| + path = + case entry.name.downcase + when /sharedstrings.xml$/ + "#{tmpdir}/roo_sharedStrings.xml" + when /styles.xml$/ + "#{tmpdir}/roo_styles.xml" + when /comments([0-9]+).xml$/ + # FIXME: Most of the time, The order of the comment files are the same + # the sheet order, i.e. sheet1.xml's comments are in comments1.xml. + # In some situations, this isn't true. The true location of a + # sheet's comment file is in the sheet1.xml.rels file. SEE + # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1". + nr = Regexp.last_match[1].to_i + @comments_files[nr - 1] = "#{tmpdir}/roo_comments#{nr}" + when /sheet([0-9]+).xml.rels$/ + # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but + # it also stores the location for sharedStrings, comments, + # drawings, etc. + nr = Regexp.last_match[1].to_i + @rels_files[nr - 1] = "#{tmpdir}/roo_rels#{nr}" end - @shared_table << shared_table_entry - end - end - # read the styles elements of an excelx document - def read_styles(doc) - @cellXfs = [] - - @numFmts = Hash[doc.xpath("//xmlns:numFmt").map do |numFmt| - [numFmt['numFmtId'], numFmt['formatCode']] - end] - fonts = doc.xpath("//xmlns:fonts/xmlns:font").map do |font_el| - Font.new.tap do |font| - font.bold = !font_el.xpath('./xmlns:b').empty? - font.italic = !font_el.xpath('./xmlns:i').empty? - font.underline = !font_el.xpath('./xmlns:u').empty? - end + entry.extract(path) if path end - - doc.xpath("//xmlns:cellXfs").each do |xfs| - xfs.children.each do |xf| - @cellXfs << xf['numFmtId'] - @style_definitions << fonts[xf['fontId'].to_i] - end - end end - # convert internal excelx attribute to a format - def attribute2format(s) - id = @cellXfs[s.to_i] - @numFmts[id] || Format::STANDARD_FORMATS[id.to_i] + def styles + @styles ||= Styles.new(File.join(@tmpdir, 'roo_styles.xml')) end - def base_date - @base_date ||= read_base_date + def shared_strings + @shared_strings ||= SharedStrings.new(File.join(@tmpdir, 'roo_sharedStrings.xml')) end - # Default to 1900 (minus one day due to excel quirk) but use 1904 if - # it's set in the Workbook's workbookPr - # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx - def read_base_date - base_date = Date.new(1899,12,30) - @workbook_doc.xpath("//xmlns:workbookPr").map do |workbookPr| - if workbookPr["date1904"] && workbookPr["date1904"] =~ /true|1/i - base_date = Date.new(1904,01,01) - end - end - base_date + def workbook + @workbook ||= Workbook.new(File.join(@tmpdir, "roo_workbook.xml")) end -end # class + def safe_send(object, method, *args) + object.send(method, *args) if object && object.respond_to?(method) + end +end