# Copyright (C) 2019-2022 Sutou Kouhei <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

require "chupa-text/decomposers/office-open-xml"

module ChupaText
  module Decomposers
    # Decomposer for Office Open XML workbooks (xlsx, xlsm, xltx, xltm).
    #
    # It yields one TextData that carries the collected attributes and
    # then one TextData per worksheet. In a worksheet's text, cells are
    # separated by a tab and each row is terminated by a newline.
    class OfficeOpenXMLWorkbook < OfficeOpenXML
      registry.register("office-open-xml-workbook", self)

      # Sets the file extensions, MIME types and XML namespace that
      # identify a workbook.
      # NOTE(review): these instance variables are presumably consumed
      # by the OfficeOpenXML superclass; confirm there.
      def initialize(options={})
        super
        @extensions = [
          "xlsx",
          "xlsm",
          "xltx",
          "xltm",
        ]
        @mime_types = [
          "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "application/vnd.ms-excel.sheet.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
          "application/vnd.ms-excel.template.macroEnabled.12",
        ]
        @namespace_uri =
          "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
      end

      private

      # Prepares the per-document accumulators that process_entry fills
      # and finish_decompose reads.
      def start_decompose(context)
        context[:shared_strings] = []
        context[:sheet_names] = []
        context[:sheets] = []
      end

      # Routes one archive entry to the matching collector based on its
      # path in the archive. Entries with any other path are ignored.
      def process_entry(entry, context)
        case entry.zip_path
        when "xl/sharedStrings.xml"
          # extract_text is not defined in this file; presumably
          # inherited from OfficeOpenXML.
          extract_text(entry, context[:shared_strings])
        when "xl/workbook.xml"
          listener = WorkbookListener.new(context[:sheet_names])
          parse(entry.file_data, listener)
        when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
          nth_sheet = Integer(Regexp.last_match(1), 10)
          sheet = []
          listener = SheetListener.new(sheet)
          parse(entry.file_data, listener)
          # Keep the number from the file name so that sheets can be
          # ordered later regardless of the order entries arrive in.
          context[:sheets] << [nth_sheet, sheet]
        end
      end

      # Yields the metadata TextData followed by one TextData per
      # worksheet, ordered by the number in the worksheet's file name.
      def finish_decompose(context, &block)
        metadata = TextData.new("", source_data: context[:data])
        context[:attributes].each do |name, value|
          metadata[name] = value
        end
        yield(metadata)

        shared_strings = context[:shared_strings]
        sheets = context[:sheets].sort_by(&:first).map(&:last)
        sheet_names = context[:sheet_names]
        sheets.each_with_index do |sheet, i|
          sheet_text = ""
          sheet.each do |row|
            row_texts = row.map do |cell|
              # SheetListener stores shared string cells as Integer
              # indexes; every other cell is stored as its raw text.
              case cell
              when Integer
                shared_strings[cell]
              else
                cell
              end
            end
            sheet_text << row_texts.join("\t") << "\n"
          end
          text_data = TextData.new(sheet_text, source_data: context[:data])
          text_data["index"] = i
          # NOTE(review): names are matched to sheets by position
          # (workbook.xml order vs. sorted sheetN.xml order), not via
          # the workbook relationships; confirm this is acceptable.
          name = sheet_names[i]
          text_data["name"] = name if name
          yield(text_data)
        end
      end

      def log_tag
        "#{super}[workbook]"
      end

      # Collects the "name" attribute of every <sheet> element, in
      # document order, into the array given at construction.
      class WorkbookListener < SAXListener
        URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

        def initialize(sheet_names)
          @sheet_names = sheet_names
        end

        def start_element(uri, local_name, qname, attributes)
          return unless uri == URI
          case local_name
          when "sheet"
            @sheet_names << attributes["name"]
          end
        end
      end

      # Collects the cells of one worksheet into the array given at
      # construction: one inner array per <row>, one element per cell
      # that has text. Shared string cells are stored as Integer indexes
      # and all other cells as String.
      class SheetListener < SAXListener
        URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

        # https://c-rex.net/projects/samples/ooxml/e1/Part4/OOXML_P4_DOCX_ST_CellType_topic_ID0E6NEFB.html
        CELL_TYPES = {
          "b" => :boolean,
          "e" => :error,
          "inlineStr" => :inline_string,
          "n" => :number,
          "s" => :shared_string,
          "str" => :string,
        }.freeze

        def initialize(sheet)
          @sheet = sheet
          @cell_type = nil
          @cell_texts = [] # text fragments of the current cell
          @in_is = false # inline string
          @in_v = false # value
        end

        def start_element(uri, local_name, qname, attributes)
          return unless uri == URI
          case local_name
          when "row"
            @sheet << []
          when "c"
            @cell_type = parse_cell_type(attributes["t"])
            @cell_texts.clear
          when "is"
            @in_is = true
          when "v"
            @in_v = true
          end
        end

        def end_element(uri, local_name, qname)
          return unless uri == URI
          case local_name
          when "c"
            # Emit the cell only once it is complete so that text
            # delivered in several events still yields one column.
            add_column
            @cell_type = nil
          when "is"
            @in_is = false
          when "v"
            @in_v = false
          end
        end

        def characters(text)
          add_text(text)
        end

        def cdata(content)
          add_text(content)
        end

        private

        # Maps the "t" attribute of <c> to a symbol; returns nil when the
        # attribute is missing or has an unknown value.
        def parse_cell_type(type)
          CELL_TYPES[type]
        end

        # True while inside <is> (inline string) or <v> (value), the only
        # places whose text belongs to a cell.
        def have_text?
          @in_is || @in_v
        end

        # Buffers a text fragment of the current cell. Text outside
        # <is>/<v> is ignored.
        def add_text(text)
          return unless have_text?
          @cell_texts << text
        end

        # Appends the buffered text of the current cell to the current
        # row as a single column. A cell that produced no text event
        # adds nothing.
        #
        # Raises ArgumentError (from Integer) when a shared string cell
        # doesn't contain a decimal index.
        def add_column
          return if @cell_texts.empty?
          text = @cell_texts.join
          @cell_texts.clear
          # Tolerate a cell that isn't inside any <row>.
          @sheet << [] if @sheet.empty?
          case @cell_type
          when :shared_string
            @sheet.last << Integer(text, 10)
          else
            @sheet.last << text
          end
        end
      end
    end
  end
end