excelx.rb in roo-2.0.0beta1

- old
+ new

@@ -1,10 +1,20 @@
 require 'date'
 require 'nokogiri'
-require 'spreadsheet'
+require 'roo/link'
+require 'roo/utils'
+require 'zip/filesystem'
 
 class Roo::Excelx < Roo::Base
+  autoload :Workbook, 'roo/excelx/workbook'
+  autoload :SharedStrings, 'roo/excelx/shared_strings'
+  autoload :Styles, 'roo/excelx/styles'
+
+  autoload :Relationships, 'roo/excelx/relationships'
+  autoload :Comments, 'roo/excelx/comments'
+  autoload :SheetDoc, 'roo/excelx/sheet_doc'
+
   module Format
     EXCEPTIONAL_FORMATS = {
       'h:mm am/pm' => :date,
       'h:mm:ss am/pm' => :date,
     }
@@ -44,11 +54,11 @@
       format = format.to_s.downcase
       if type = EXCEPTIONAL_FORMATS[format]
         type
       elsif format.include?('#')
         :float
-      elsif format.include?('d') || format.include?('y')
+      elsif !format.match(/d+(?![\]])/).nil? || format.include?('y')
         if format.include?('h') || format.include?('s')
           :datetime
         else
           :date
         end
@@ -62,144 +72,318 @@
     end
 
     module_function :to_type
   end
 
-  # initialization and opening of a spreadsheet file
-  # values for packed: :zip
-  def initialize(filename, options = {}, deprecated_file_warning = :error)
-    if Hash === options
-      packed = options[:packed]
-      file_warning = options[:file_warning] || :error
-    else
-      warn 'Supplying `packed` or `file_warning` as separate arguments to `Roo::Excelx.new` is deprecated. Use an options hash instead.'
-      packed = options
-      file_warning = deprecated_file_warning
+  class Cell
+    attr_reader :type, :formula, :value, :excelx_type, :excelx_value, :style, :hyperlink, :coordinate
+    attr_writer :value
+
+    def initialize(value, type, formula, excelx_type, excelx_value, style, hyperlink, base_date, coordinate)
+      @type = type
+      @formula = formula
+      @base_date = base_date if [:date, :datetime].include?(@type)
+      @excelx_type = excelx_type
+      @excelx_value = excelx_value
+      @style = style
+      @value = type_cast_value(value)
+      @value = Roo::Link.new(hyperlink, @value.to_s) if hyperlink
+      @coordinate = coordinate
     end
 
-    file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed)
-    make_tmpdir do |tmpdir|
-      filename = download_uri(filename, tmpdir) if uri?(filename)
-      filename = unzip(filename, tmpdir) if packed == :zip
-      @filename = filename
-      unless File.file?(@filename)
-        raise IOError, "file #{@filename} does not exist"
+    def type
+      if @formula
+        :formula
+      elsif @value.is_a?(Roo::Link)
+        :link
+      else
+        @type
       end
-      @comments_files = Array.new
-      @rels_files = Array.new
-      extract_content(tmpdir, @filename)
-      @workbook_doc = load_xml(File.join(tmpdir, "roo_workbook.xml"))
-      @shared_table = []
-      if File.exist?(File.join(tmpdir, 'roo_sharedStrings.xml'))
-        @sharedstring_doc = load_xml(File.join(tmpdir, 'roo_sharedStrings.xml'))
-        read_shared_strings(@sharedstring_doc)
+    end
+
+    class Coordinate
+      attr_accessor :row, :column
+
+      def initialize(row, column)
+        @row, @column = row, column
       end
-      @styles_table = []
-      @style_definitions = Array.new # TODO: ??? { |h,k| h[k] = {} }
-      if File.exist?(File.join(tmpdir, 'roo_styles.xml'))
-        @styles_doc = load_xml(File.join(tmpdir, 'roo_styles.xml'))
-        read_styles(@styles_doc)
+    end
+
+    private
+
+    def type_cast_value(value)
+      case @type
+      when :float, :percentage
+        value.to_f
+      when :date
+        yyyy,mm,dd = (@base_date+value.to_i).strftime("%Y-%m-%d").split('-')
+        Date.new(yyyy.to_i,mm.to_i,dd.to_i)
+      when :datetime
+        create_datetime_from((@base_date+value.to_f.round(6)).strftime("%Y-%m-%d %H:%M:%S.%N"))
+      when :time
+        value.to_f*(24*60*60)
+      when :string
+        value
+      else
+        value
       end
-      @sheet_doc = load_xmls(@sheet_files)
-      @comments_doc = load_xmls(@comments_files)
-      @rels_doc = load_xmls(@rels_files)
     end
-    super(filename, options)
-    @formula = Hash.new
-    @excelx_type = Hash.new
-    @excelx_value = Hash.new
-    @s_attribute = Hash.new # TODO: ggf. wieder entfernen nur lokal benoetigt
-    @comment = Hash.new
-    @comments_read = Hash.new
-    @hyperlink = Hash.new
-    @hyperlinks_read = Hash.new
+
+    def create_datetime_from(datetime_string)
+      date_part,time_part = round_time_from(datetime_string).split(' ')
+      yyyy,mm,dd = date_part.split('-')
+      hh,mi,ss = time_part.split(':')
+      DateTime.civil(yyyy.to_i,mm.to_i,dd.to_i,hh.to_i,mi.to_i,ss.to_i)
+    end
+
+    def round_time_from(datetime_string)
+      date_part,time_part = datetime_string.split(' ')
+      yyyy,mm,dd = date_part.split('-')
+      hh,mi,ss = time_part.split(':')
+      Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0).strftime("%Y-%m-%d %H:%M:%S")
+    end
   end
 
-  def method_missing(m,*args)
-    # is method name a label name
-    read_labels
-    if @label.has_key?(m.to_s)
-      sheet ||= @default_sheet
-      read_cells(sheet)
-      row,col = label(m.to_s)
-      cell(row,col)
+  class Sheet
+    def initialize(name, rels_path, sheet_path, comments_path, styles, shared_strings, workbook, options = {})
+      @name = name
+      @rels = Relationships.new(rels_path)
+      @comments = Comments.new(comments_path)
+      @styles = styles
+      @sheet = SheetDoc.new(sheet_path, @rels, @styles, shared_strings, workbook, options)
+    end
+
+    def cells
+      @cells ||= @sheet.cells(@rels)
+    end
+
+    def present_cells
+      @present_cells ||= cells.select {|key, cell| cell && cell.value }
+    end
+
+    # Yield each row as array of Excelx::Cell objects
+    # accepts options max_rows (int) (offset by 1 for header)
+    # and pad_cells (boolean)
+    def each_row(options = {}, &block)
+      row_count = 0
+      @sheet.each_row_streaming do |row|
+        break if options[:max_rows] && row_count == options[:max_rows] + 1
+        block.call(cells_for_row_element(row, options)) if block_given?
+        row_count += 1
+      end
+    end
+
+    def row(row_number)
+      first_column.upto(last_column).map do |col|
+        cells[[row_number,col]]
+      end.map {|cell| cell && cell.value }
+    end
+
+    def column(col_number)
+      first_row.upto(last_row).map do |row|
+        cells[[row,col_number]]
+      end.map {|cell| cell && cell.value }
+    end
+
+    # returns the number of the first non-empty row
+    def first_row
+      @first_row ||= present_cells.keys.map {|row, _| row }.min
+    end
+
+    def last_row
+      @last_row ||= present_cells.keys.map {|row, _| row }.max
+    end
+
+    # returns the number of the first non-empty column
+    def first_column
+      @first_column ||= present_cells.keys.map {|_, col| col }.min
+    end
+
+    # returns the number of the last non-empty column
+    def last_column
+      @last_column ||= present_cells.keys.map {|_, col| col }.max
+    end
+
+    def excelx_format(key)
+      cell = cells[key]
+      @styles.style_format(cell.style).to_s if cell
+    end
+
+    def hyperlinks
+      @hyperlinks ||= @sheet.hyperlinks(@rels)
+    end
+
+    def comments
+      @comments.comments
+    end
+
+    def dimensions
+      @sheet.dimensions
+    end
+
+    private
+
+    # Take an xml row and return an array of Excelx::Cell objects
+    # optionally pad array to header width(assumed 1st row).
+    # takes option pad_cells (boolean) defaults false
+    def cells_for_row_element(row_element, options = {})
+      return [] unless row_element
+      cell_col = 0
+      cells = []
+      @sheet.each_cell(row_element) do |cell|
+        cells.concat(pad_cells(cell, cell_col)) if options[:pad_cells]
+        cells << cell
+        cell_col = cell.coordinate.column
+      end
+      cells
+    end
+
+    def pad_cells(cell, last_column)
+      pad = []
+      (cell.coordinate.column - 1 - last_column).times { pad << nil }
+      pad
+    end
+  end
+
+  ExceedsMaxError = Class.new(StandardError)
+
+  # initialization and opening of a spreadsheet file
+  # values for packed: :zip
+  # optional cell_max (int) parameter for early aborting attempts to parse
+  # enormous documents.
+  def initialize(filename, options = {})
+    packed = options[:packed]
+    file_warning = options.fetch(:file_warning, :error)
+    cell_max = options.delete(:cell_max)
+    sheet_options = {}
+    sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
+
+    file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed)
+
+    @tmpdir = make_tmpdir(filename.split('/').last, options[:tmpdir_root])
+    @filename = local_filename(filename, @tmpdir, packed)
+    @comments_files = []
+    @rels_files = []
+    process_zipfile(@tmpdir, @filename)
+
+    @sheet_names = workbook.sheets.map do |sheet|
+      unless options[:only_visible_sheets] && sheet['state'] == 'hidden'
+        sheet['name']
+      end
+    end.compact
+    @sheets = []
+    @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n|
+      @sheets[n] = Sheet.new(sheet_name, @rels_files[n], @sheet_files[n], @comments_files[n], styles, shared_strings, workbook, sheet_options)
+      [sheet_name, @sheets[n]]
+    end]
+
+    if cell_max
+      cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions)
+      raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max
+    end
+
+    super
+  end
+
+  def method_missing(method,*args)
+    if label = workbook.defined_names[method.to_s]
+      safe_send(sheet_for(label.sheet).cells[label.key], :value)
     else
       # call super for methods like #a1
       super
     end
   end
 
+  def sheets
+    @sheet_names
+  end
+
+  def sheet_for(sheet)
+    sheet ||= default_sheet
+    validate_sheet!(sheet)
+    @sheets_by_name[sheet]
+  end
+
   # Returns the content of a spreadsheet-cell.
   # (1,1) is the upper left corner.
   # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the
   # cell at the first line and first row.
   def cell(row, col, sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    if celltype(row,col,sheet) == :date
-      yyyy,mm,dd = @cell[sheet][[row,col]].split('-')
-      return Date.new(yyyy.to_i,mm.to_i,dd.to_i)
-    elsif celltype(row,col,sheet) == :datetime
-      date_part,time_part = @cell[sheet][[row,col]].split(' ')
-      yyyy,mm,dd = date_part.split('-')
-      hh,mi,ss = time_part.split(':')
-      return DateTime.civil(yyyy.to_i,mm.to_i,dd.to_i,hh.to_i,mi.to_i,ss.to_i)
+    key = normalize(row,col)
+    safe_send(sheet_for(sheet).cells[key], :value)
+  end
+
+  def row(rownumber,sheet=nil)
+    sheet_for(sheet).row(rownumber)
+  end
+
+  # returns all values in this column as an array
+  # column numbers are 1,2,3,... like in the spreadsheet
+  def column(column_number,sheet=nil)
+    if column_number.is_a?(::String)
+      column_number = ::Roo::Utils.letter_to_number(column_number)
     end
-    @cell[sheet][[row,col]]
+    sheet_for(sheet).column(column_number)
   end
 
+  # returns the number of the first non-empty row
+  def first_row(sheet=nil)
+    sheet_for(sheet).first_row
+  end
+
+  # returns the number of the last non-empty row
+  def last_row(sheet=nil)
+    sheet_for(sheet).last_row
+  end
+
+  # returns the number of the first non-empty column
+  def first_column(sheet=nil)
+    sheet_for(sheet).first_column
+  end
+
+  # returns the number of the last non-empty column
+  def last_column(sheet=nil)
+    sheet_for(sheet).last_column
+  end
+
+  # set a cell to a certain value
+  # (this will not be saved back to the spreadsheet file!)
+  def set(row,col,value, sheet = nil) #:nodoc:
+    key = normalize(row,col)
+    cell_type = cell_type_by_value(value)
+    sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Cell::Coordinate.new(row, col))
+  end
+
+
   # Returns the formula at (row,col).
   # Returns nil if there is no formula.
   # The method #formula? checks if there is a formula.
   def formula(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    @formula[sheet][[row,col]] && @formula[sheet][[row,col]]
+    key = normalize(row,col)
+    safe_send(sheet_for(sheet).cells[key], :formula)
   end
-  alias_method :formula?, :formula
 
-    # returns each formula in the selected sheet as an array of elements
-  # [row, col, formula]
-  def formulas(sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    if @formula[sheet]
-      @formula[sheet].each.collect do |elem|
-        [elem[0][0], elem[0][1], elem[1]]
-      end
-    else
-      []
-    end
+  # Predicate methods really should return a boolean
+  # value. Hopefully no one was relying on the fact that this
+  # previously returned either nil/formula
+  def formula?(*args)
+    !!formula(*args)
   end
 
-  class Font
-    attr_accessor :bold, :italic, :underline
-
-    def bold?
-      @bold == true
+  # returns each formula in the selected sheet as an array of tuples in following format
+  # [[row, col, formula], [row, col, formula],...]
+  def formulas(sheet=nil)
+    sheet_for(sheet).cells.select {|_, cell| cell.formula }.map do |(x, y), cell|
+      [x, y, cell.formula]
     end
-
-    def italic?
-      @italic == true
-    end
-
-    def underline?
-      @underline == true
-    end
   end
 
   # Given a cell, return the cell's style
   def font(row, col, sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    s_attribute = @s_attribute[sheet][[row,col]]
-    s_attribute ||= 0
-    s_attribute = s_attribute.to_i
-    @style_definitions[s_attribute]
+    key = normalize(row,col)
+    definition_index = safe_send(sheet_for(sheet).cells[key], :style)
+    styles.definitions[definition_index] if definition_index
   end
 
   # returns the type of a cell:
   # * :float
   # * :string,
@@ -207,468 +391,252 @@
   # * :percentage
   # * :formula
   # * :time
   # * :datetime
   def celltype(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    if @formula[sheet][[row,col]]
-      return :formula
-    else
-      @cell_type[sheet][[row,col]]
-    end
+    key = normalize(row, col)
+    safe_send(sheet_for(sheet).cells[key], :type)
   end
 
   # returns the internal type of an excel cell
   # * :numeric_or_formula
   # * :string
   # Note: this is only available within the Excelx class
   def excelx_type(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    return @excelx_type[sheet][[row,col]]
+    key = normalize(row,col)
+    safe_send(sheet_for(sheet).cells[key], :excelx_type)
   end
 
   # returns the internal value of an excelx cell
   # Note: this is only available within the Excelx class
   def excelx_value(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    return @excelx_value[sheet][[row,col]]
+    key = normalize(row,col)
+    safe_send(sheet_for(sheet).cells[key], :excelx_value)
   end
 
   # returns the internal format of an excel cell
   def excelx_format(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    row,col = normalize(row,col)
-    s = @s_attribute[sheet][[row,col]]
-    attribute2format(s).to_s
+    key = normalize(row,col)
+    sheet_for(sheet).excelx_format(key)
   end
 
-  # returns an array of sheet names in the spreadsheet
-  def sheets
-    @workbook_doc.xpath("//xmlns:sheet").map do |sheet|
-      sheet['name']
-    end
+  def empty?(row,col,sheet=nil)
+    sheet = sheet_for(sheet)
+    key = normalize(row,col)
+    cell = sheet.cells[key]
+    !cell || !cell.value || (cell.type == :string && cell.value.empty?) \
+      || (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column)
   end
 
   # shows the internal representation of all cells
   # for debugging purposes
   def to_s(sheet=nil)
-    sheet ||= @default_sheet
-    read_cells(sheet)
-    @cell[sheet].inspect
+    sheet_for(sheet).cells.inspect
   end
 
   # returns the row,col values of the labelled cell
   # (nil,nil) if label is not defined
-  def label(labelname)
-    read_labels
-    if @label.empty? || !@label.has_key?(labelname)
-      return nil,nil,nil
+  def label(name)
+    labels = workbook.defined_names
+    if labels.empty? || !labels.key?(name)
+      [nil,nil,nil]
     else
-      return @label[labelname][1].to_i,
-        Roo::Base.letter_to_number(@label[labelname][2]),
-        @label[labelname][0]
+      [labels[name].row,
+        labels[name].col,
+        labels[name].sheet]
     end
   end
 
   # Returns an array which all labels. Each element is an array with
   # [labelname, [row,col,sheetname]]
   def labels
-    # sheet ||= @default_sheet
-    # read_cells(sheet)
-    read_labels
-    @label.map do |label|
-      [ label[0], # name
-        [ label[1][1].to_i, # row
-          Roo::Base.letter_to_number(label[1][2]), # column
-          label[1][0], # sheet
+    @labels ||= workbook.defined_names.map do |name, label|
+      [ name,
+        [ label.row,
+          label.col,
+          label.sheet,
         ] ]
     end
   end
 
   def hyperlink?(row,col,sheet=nil)
-    hyperlink(row, col, sheet) != nil
+    !!hyperlink(row, col, sheet)
   end
 
   # returns the hyperlink at (row/col)
   # nil if there is no hyperlink
   def hyperlink(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    read_hyperlinks(sheet) unless @hyperlinks_read[sheet]
-    row,col = normalize(row,col)
-    return nil unless @hyperlink[sheet]
-    @hyperlink[sheet][[row,col]]
+    key = normalize(row,col)
+    sheet_for(sheet).hyperlinks[key]
   end
 
   # returns the comment at (row/col)
   # nil if there is no comment
   def comment(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    #read_cells(sheet)
-    read_comments(sheet) unless @comments_read[sheet]
-    row,col = normalize(row,col)
-    return nil unless @comment[sheet]
-    @comment[sheet][[row,col]]
+    key = normalize(row,col)
+    sheet_for(sheet).comments[key]
   end
 
   # true, if there is a comment
   def comment?(row,col,sheet=nil)
-    sheet ||= @default_sheet
-    # read_cells(sheet)
-    read_comments(sheet) unless @comments_read[sheet]
-    row,col = normalize(row,col)
-    comment(row,col) != nil
+    !!comment(row,col,sheet)
   end
 
-  # returns each comment in the selected sheet as an array of elements
-  # [row, col, comment]
   def comments(sheet=nil)
-    sheet ||= @default_sheet
-    read_comments(sheet) unless @comments_read[sheet]
-    if @comment[sheet]
-      @comment[sheet].each.collect do |elem|
-        [elem[0][0],elem[0][1],elem[1]]
-      end
-    else
-      []
+    sheet_for(sheet).comments.map do |(x, y), comment|
+      [x, y, comment]
     end
   end
 
+  # Yield an array of Excelx::Cell
+  # Takes options for sheet, pad_cells, and max_rows
+  def each_row_streaming(options={})
+    sheet_for(options.delete(:sheet)).each_row(options) { |row| yield row }
+  end
+
   private
 
-  def load_xmls(paths)
-    paths.compact.map do |item|
-      load_xml(item)
+  def clean_sheet(sheet)
+    @sheets_by_name[sheet].cells.each_pair do |coord, value|
+      next unless value.value.is_a?(::String)
+
+      @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
     end
+
+    @cleaned[sheet] = true
   end
 
-  # helper function to set the internal representation of cells
-  def set_cell_values(sheet,x,y,i,v,value_type,formula,
-      excelx_type=nil,
-      excelx_value=nil,
-      s_attribute=nil)
-    key = [y,x+i]
-    @cell_type[sheet] ||= {}
-    @cell_type[sheet][key] = value_type
-    @formula[sheet] ||= {}
-    @formula[sheet][key] = formula  if formula
-    @cell[sheet] ||= {}
-    @cell[sheet][key] =
-      case @cell_type[sheet][key]
-      when :float
-        v.to_f
-      when :string
-        v
-      when :date
-        (base_date+v.to_i).strftime("%Y-%m-%d")
-      when :datetime
-        (base_date+v.to_f).strftime("%Y-%m-%d %H:%M:%S")
-      when :percentage
-        v.to_f
-      when :time
-        v.to_f*(24*60*60)
-      else
-        v
-      end
+  # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
+  #           documents require a workbook.xml file, so a if the file is missing
+  #           it is not a valid xlsx file. In these cases, an ArgumentError is
+  #           raised.
+  #
+  # wb - a Zip::Entry for the workbook.xml file.
+  # path - A String for Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
+  #   # => ["rId1", "rId2", "rId3"]
+  #
+  # Returns an Array of Strings.
+  def extract_worksheet_ids(entries, path)
+      wb = entries.find { |e| e.name[/workbook.xml$/] }
+      fail ArgumentError 'missing required workbook file' if wb.nil?
 
-    @cell[sheet][key] = Spreadsheet::Link.new(@hyperlink[sheet][key], @cell[sheet][key].to_s) if hyperlink?(y,x+i)
-    @excelx_type[sheet] ||= {}
-    @excelx_type[sheet][key] = excelx_type
-    @excelx_value[sheet] ||= {}
-    @excelx_value[sheet][key] = excelx_value
-    @s_attribute[sheet] ||= {}
-    @s_attribute[sheet][key] = s_attribute
+      wb.extract(path)
+      workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
+      workbook_doc.xpath('//sheet').map{ |s| s.attributes['id'].value }
   end
 
-  # read all cells in the selected sheet
-  def read_cells(sheet=nil)
-    sheet ||= @default_sheet
-    validate_sheet!(sheet)
-    return if @cells_read[sheet]
+  # Internal
+  #
+  # wb_rels - A Zip::Entry for the workbook.xml.rels file.
+  # path - A String for the Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
+  #   # => {
+  #         "rId1"=>"worksheets/sheet1.xml",
+  #         "rId2"=>"worksheets/sheet2.xml",
+  #         "rId3"=>"worksheets/sheet3.xml"
+  #        }
+  #
+  # Returns a Hash.
+  def extract_worksheet_rels(entries, path)
+    wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
+    fail ArgumentError 'missing required workbook file' if wb_rels.nil?
 
-    @sheet_doc[sheets.index(sheet)].xpath("/xmlns:worksheet/xmlns:sheetData/xmlns:row/xmlns:c").each do |c|
-      s_attribute = c['s'].to_i   # should be here
-      # c: <c r="A5" s="2">
-      # <v>22606</v>
-      # </c>, format: , tmp_type: float
-      value_type =
-        case c['t']
-        when 's'
-          :shared
-        when 'b'
-          :boolean
-          # 2011-02-25 BEGIN
-        when 'str'
-          :string
-          # 2011-02-25 END
-          # 2011-09-15 BEGIN
-        when 'inlineStr'
-          :inlinestr
-          # 2011-09-15 END
-        else
-          format = attribute2format(s_attribute)
-          Format.to_type(format)
-        end
-      formula = nil
-      c.children.each do |cell|
-        case cell.name
-        when 'is'
-          cell.children.each do |is|
-            if is.name == 't'
-              inlinestr_content = is.content
-              value_type = :string
-              v = inlinestr_content
-              excelx_type = :string
-              y, x = Roo::Base.split_coordinate(c['r'])
-              excelx_value = inlinestr_content #cell.content
-              set_cell_values(sheet,x,y,0,v,value_type,formula,excelx_type,excelx_value,s_attribute)
-            end
-          end
-        when 'f'
-          formula = cell.content
-        when 'v'
-          if [:time, :datetime].include?(value_type) && cell.content.to_f >= 1.0
-            value_type =
-              if (cell.content.to_f - cell.content.to_f.floor).abs > 0.000001
-                :datetime
-              else
-                :date
-              end
-          end
-          excelx_type = [:numeric_or_formula,format.to_s]
-          excelx_value = cell.content
-          v =
-            case value_type
-            when :shared
-              value_type = :string
-              excelx_type = :string
-              @shared_table[cell.content.to_i]
-            when :boolean
-              (cell.content.to_i == 1 ? 'TRUE' : 'FALSE')
-            when :date
-              cell.content
-            when :time
-              cell.content
-            when :datetime
-              cell.content
-            when :formula
-              cell.content.to_f #TODO: !!!!
-            when :string
-              excelx_type = :string
-              cell.content
-            else
-              value_type = :float
-              cell.content
-            end
-          y, x = Roo::Base.split_coordinate(c['r'])
-          set_cell_values(sheet,x,y,0,v,value_type,formula,excelx_type,excelx_value,s_attribute)
-        end
-      end
+    wb_rels.extract(path)
+    rels_doc = Roo::Utils.load_xml(path).remove_namespaces!
+    worksheet_type ='http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet'
+
+    relationships = rels_doc.xpath('//Relationship').select do |relationship|
+      relationship.attributes['Type'].value == worksheet_type
     end
-    @cells_read[sheet] = true
-    # begin comments
-=begin
-Datei xl/comments1.xml
-  <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
-  <comments xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
-    <authors>
-      <author />
-    </authors>
-    <commentList>
-      <comment ref="B4" authorId="0">
-        <text>
-          <r>
-            <rPr>
-              <sz val="10" />
-              <rFont val="Arial" />
-              <family val="2" />
-            </rPr>
-            <t>Kommentar fuer B4</t>
-          </r>
-        </text>
-      </comment>
-      <comment ref="B5" authorId="0">
-        <text>
-          <r>
-            <rPr>
-            <sz val="10" />
-            <rFont val="Arial" />
-            <family val="2" />
-          </rPr>
-          <t>Kommentar fuer B5</t>
-        </r>
-      </text>
-    </comment>
-  </commentList>
-  </comments>
-=end
-=begin
-    if @comments_doc[self.sheets.index(sheet)]
-      read_comments(sheet)
-    end
-=end
-    #end comments
-  end
 
-  # Reads all comments from a sheet
-  def read_comments(sheet=nil)
-    sheet ||= @default_sheet
-    validate_sheet!(sheet)
-    n = self.sheets.index(sheet)
-    return unless @comments_doc[n] #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    @comments_doc[n].xpath("//xmlns:comments/xmlns:commentList/xmlns:comment").each do |comment|
-      ref = comment.attributes['ref'].to_s
-      row,col = Roo::Base.split_coordinate(ref)
-      comment.xpath('./xmlns:text/xmlns:r/xmlns:t').each do |text|
-        @comment[sheet] ||= {}
-        @comment[sheet][[row,col]] = text.text
-      end
+    relationships.inject({}) do |hash, relationship|
+      attributes = relationship.attributes
+      id = attributes['Id'];
+      hash[id.value] = attributes['Target'].value
+      hash
     end
-    @comments_read[sheet] = true
   end
 
-  # Reads all hyperlinks from a sheet
-  def read_hyperlinks(sheet=nil)
-    sheet ||= @default_sheet
-    validate_sheet!(sheet)
-    n = self.sheets.index(sheet)
-    if rels_doc = @rels_doc[n]
-      rels = Hash[rels_doc.xpath("/xmlns:Relationships/xmlns:Relationship").map do |r|
-        [r.attribute('Id').text, r]
-      end]
-      @sheet_doc[n].xpath("/xmlns:worksheet/xmlns:hyperlinks/xmlns:hyperlink").each do |h|
-        if rel_element = rels[h.attribute('id').text]
-          row,col = Roo::Base.split_coordinate(h.attributes['ref'].to_s)
-          @hyperlink[sheet] ||= {}
-          @hyperlink[sheet][[row,col]] = rel_element.attribute('Target').text
-        end
-      end
+  def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
+    sheet_ids.each_with_index do |id, i|
+      name = sheets[id]
+      entry = entries.find { |entry| entry.name =~ /#{name}$/ }
+      path = "#{tmpdir}/roo_sheet#{i + 1}"
+      @sheet_files << path
+      entry.extract(path)
     end
-    @hyperlinks_read[sheet] = true
   end
 
-  def read_labels
-    @label ||= Hash[@workbook_doc.xpath("//xmlns:definedName").map do |defined_name|
-      # "Sheet1!$C$5"
-      sheet, coordinates = defined_name.text.split('!$', 2)
-      col,row = coordinates.split('$')
-      [defined_name['name'], [sheet,row,col]]
-    end]
-  end
-
   # Extracts all needed files from the zip file
-  def process_zipfile(tmpdir, zipfilename, zip, path='')
+  def process_zipfile(tmpdir, zipfilename)
     @sheet_files = []
-    Roo::ZipFile.open(zipfilename) {|zf|
-      zf.entries.each {|entry|
-        entry_name = entry.to_s.downcase
+    entries = Zip::File.open(zipfilename).to_a.sort_by(&:name)
 
-        path =
-          if entry_name.end_with?('workbook.xml')
-            "#{tmpdir}/roo_workbook.xml"
-          elsif entry_name.end_with?('sharedstrings.xml')
-            "#{tmpdir}/roo_sharedStrings.xml"
-          elsif entry_name.end_with?('styles.xml')
-            "#{tmpdir}/roo_styles.xml"
-          elsif entry_name =~ /sheet([0-9]+).xml$/
-            nr = $1
-            @sheet_files[nr.to_i-1] = "#{tmpdir}/roo_sheet#{nr}"
-          elsif entry_name =~ /comments([0-9]+).xml$/
-            nr = $1
-            @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}"
-          elsif entry_name =~ /sheet([0-9]+).xml.rels$/
-            nr = $1
-            @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}"
-          end
-        if path
-          extract_file(zip, entry, path)
-        end
-      }
-    }
-  end
+    # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
+    #       are not in order. With Numbers 3.1, the first sheet is always
+    #       sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
+    #       independent of a worksheet's filename (i.e. sheet6.xml can be the
+    #       first worksheet).
+    #
+    #       workbook.xml lists the correct order of worksheets and
+    #       workbook.xml.rels lists the filenames for those worksheets.
+    #
+    #       workbook.xml:
+    #         <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
+    #         <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
+    #       workbook.xml.rel:
+    #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    sheet_ids = extract_worksheet_ids(entries, "#{tmpdir}/roo_workbook.xml")
+    sheets = extract_worksheet_rels(entries, "#{tmpdir}/roo_workbook.xml.rels")
+    extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
 
-  def extract_file(source_zip, entry, destination_path)
-    open(destination_path,'wb') {|f|
-      f << source_zip.read(entry)
-    }
-  end
-
-  # extract files from the zip file
-  def extract_content(tmpdir, zipfilename)
-    Roo::ZipFile.open(@filename) do |zip|
-      process_zipfile(tmpdir, zipfilename,zip)
-    end
-  end
-
-  # read the shared strings xml document
-  def read_shared_strings(doc)
-    doc.xpath("/xmlns:sst/xmlns:si").each do |si|
-      shared_table_entry = ''
-      si.children.each do |elem|
-        if elem.name == 'r' and elem.children
-          elem.children.each do |r_elem|
-            if r_elem.name == 't'
-              shared_table_entry << r_elem.content
-            end
-          end
-        end
-        if elem.name == 't'
-          shared_table_entry = elem.content
-        end
+    entries.each do |entry|
+      path =
+      case entry.name.downcase
+      when /sharedstrings.xml$/
+        "#{tmpdir}/roo_sharedStrings.xml"
+      when /styles.xml$/
+        "#{tmpdir}/roo_styles.xml"
+      when /comments([0-9]+).xml$/
+        # FIXME: Most of the time, The order of the comment files are the same
+        #       the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
+        #       In some situations, this isn't true. The true location of a
+        #       sheet's comment file is in the sheet1.xml.rels file. SEE
+        #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
+        nr = Regexp.last_match[1].to_i
+        @comments_files[nr - 1] = "#{tmpdir}/roo_comments#{nr}"
+      when /sheet([0-9]+).xml.rels$/
+        # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
+        #        it also stores the location for sharedStrings, comments,
+        #        drawings, etc.
+        nr = Regexp.last_match[1].to_i
+        @rels_files[nr - 1] = "#{tmpdir}/roo_rels#{nr}"
       end
-      @shared_table << shared_table_entry
-    end
-  end
 
-  # read the styles elements of an excelx document
-  def read_styles(doc)
-    @cellXfs = []
-
-    @numFmts = Hash[doc.xpath("//xmlns:numFmt").map do |numFmt|
-      [numFmt['numFmtId'], numFmt['formatCode']]
-    end]
-    fonts = doc.xpath("//xmlns:fonts/xmlns:font").map do |font_el|
-      Font.new.tap do |font|
-        font.bold = !font_el.xpath('./xmlns:b').empty?
-        font.italic = !font_el.xpath('./xmlns:i').empty?
-        font.underline = !font_el.xpath('./xmlns:u').empty?
-      end
+      entry.extract(path) if path
     end
-
-    doc.xpath("//xmlns:cellXfs").each do |xfs|
-      xfs.children.each do |xf|
-        @cellXfs << xf['numFmtId']
-        @style_definitions << fonts[xf['fontId'].to_i]
-      end
-    end
   end
 
-  # convert internal excelx attribute to a format
-  def attribute2format(s)
-    id = @cellXfs[s.to_i]
-    @numFmts[id] || Format::STANDARD_FORMATS[id.to_i]
+  def styles
+    @styles ||= Styles.new(File.join(@tmpdir, 'roo_styles.xml'))
   end
 
-  def base_date
-    @base_date ||= read_base_date
+  def shared_strings
+    @shared_strings ||= SharedStrings.new(File.join(@tmpdir, 'roo_sharedStrings.xml'))
   end
 
-  # Default to 1900 (minus one day due to excel quirk) but use 1904 if
-  # it's set in the Workbook's workbookPr
-  # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
-  def read_base_date
-    base_date = Date.new(1899,12,30)
-    @workbook_doc.xpath("//xmlns:workbookPr").map do |workbookPr|
-      if workbookPr["date1904"] && workbookPr["date1904"] =~ /true|1/i
-        base_date = Date.new(1904,01,01)
-      end
-    end
-    base_date
+  def workbook
+    @workbook ||= Workbook.new(File.join(@tmpdir, "roo_workbook.xml"))
   end
 
-end # class
+  def safe_send(object, method, *args)
+    object.send(method, *args) if object && object.respond_to?(method)
+  end
+end