################################################################################ # # Copyright (C) 2008 James Healy (jimmy@deefa.com) # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#
################################################################################

require 'enumerator'

class PDF::Reader
  # Converts the raw bytes of a PDF string into UTF-8 text, based on the
  # encoding declared for the font that the string is drawn with.
  #
  # An encoding is built from either an encoding name (e.g. :WinAnsiEncoding)
  # or a hash that may carry :Encoding / :BaseEncoding and :Differences
  # entries.
  class Encoding

    UNKNOWN_CHAR = 0x25AF # ▯

    # Encoding name => mapping file (relative to the "encodings" directory)
    # for every supported encoding that is unpacked one byte at a time.
    MAPPING_FILES = {
      :MacRomanEncoding     => "mac_roman.txt",
      :MacExpertEncoding    => "mac_expert.txt",
      :PDFDocEncoding       => "pdf_doc.txt",
      :StandardEncoding     => "standard.txt",
      :SymbolEncoding       => "symbol.txt",
      :WinAnsiEncoding      => "win_ansi.txt",
      :ZapfDingbatsEncoding => "zapf_dingbats.txt"
    }.freeze

    attr_reader :differences

    # Build a new encoding.
    #
    # enc may be:
    #
    # * nil - StandardEncoding is used
    # * an encoding name - anything that responds to to_sym
    # * a Hash - the name is read from :Encoding or :BaseEncoding, and an
    #   optional :Differences array is passed to differences=
    #
    # Raises UnsupportedFeatureError if the encoding name is not recognised.
    def initialize(enc)
      if enc.kind_of?(Hash)
        self.differences = enc[:Differences] if enc[:Differences]
        enc = enc[:Encoding] || enc[:BaseEncoding]
        # accept string names here too, mirroring the non-hash path below.
        # Values that can't be converted fall through to the
        # UnsupportedFeatureError raised for unknown encodings.
        enc = enc.to_sym if enc.respond_to?(:to_sym)
      elsif !enc.nil?
        enc = enc.to_sym
      end

      case enc
      when :"Identity-H"
        # two byte character codes that can only be converted via a
        # ToUnicode map
        @unpack = "n*"
        @to_unicode_required = true
      when :UTF16Encoding
        @unpack = "n*"
      else
        # a missing encoding name falls back to StandardEncoding
        file = MAPPING_FILES[enc || :StandardEncoding]
        raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding" unless file

        load_mapping File.join(File.dirname(__FILE__), "encodings", file)
        @unpack = "C*"
      end
    end

    # set the differences table for this encoding. should be an array in the following format:
    #
    #   [25, :A, 26, :B]
    #
    # The array alternates between a decimal byte number and a glyph name to map to that byte
    #
    # To save space the following array is also valid and equivalent to the previous one
    #
    #   [25, :A, :B]
    #
    # Raises ArgumentError if diff is not an Array. Returns the resulting
    # hash of byte number => glyph name.
    def differences=(diff)
      raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)

      @differences = {}
      byte = 0
      diff.each do |val|
        if val.kind_of?(Numeric)
          # a number restarts the run at that byte
          byte = val.to_i
        else
          # a glyph name is assigned to the current byte, then the run advances
          @differences[byte] = val
          byte += 1
        end
      end
      @differences
    end

    # convert the specified string to utf8
    #
    # str       - the raw string to convert
    # tounicode - optional object that responds to decode(code) and returns
    #             a unicode codepoint, or nil/false if it has no entry for
    #             that code
    #
    # Returns a new UTF-8 string. Codes that can't be converted are replaced
    # with UNKNOWN_CHAR.
    def to_utf8(str, tounicode = nil)
      # unpack the character codes, then replace any relevant ones with a
      # glyph name from the differences table
      codes = process_differences(str.unpack(@unpack))

      # replace any remaining codes with a unicode codepoint
      codepoints = codes.collect do |num|
        if tounicode && (code = tounicode.decode(num))
          code
        elsif tounicode || (tounicode.nil? && defined?(@to_unicode_required) && @to_unicode_required)
          # either the supplied map had no entry for this code, or this
          # encoding can't be converted without a map and none was supplied
          UNKNOWN_CHAR
        elsif defined?(@mapping) && @mapping && @mapping[num]
          @mapping[num]
        else
          num
        end
      end

      # convert any glyph names to unicode codepoints
      codepoints = process_glyphnames(codepoints)

      # replace characters that didn't convert to unicode nicely with something valid
      codepoints.collect! { |c| c || UNKNOWN_CHAR }

      # pack all our Unicode codepoints into a UTF-8 string
      ret = codepoints.pack("U*")

      # set the strings encoding correctly under ruby 1.9+
      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

      ret
    end

    private

    # accepts an array of byte numbers, and replaces any that have entries in the differences table
    # with a glyph name. The array is modified in place and returned.
    def process_differences(arr)
      @differences ||= {}
      arr.collect! do |n|
        glyph = @differences[n]
        glyph.nil? ? n : glyph
      end
    end

    # accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints.
    # The array is modified in place and returned.
    def process_glyphnames(arr)
      arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n] }
    end

    # populate @mapping (byte number => unicode codepoint) from the
    # specified file. Only lines containing a "<hex byte>;<4 hex digits>"
    # pair contribute an entry, all other lines are ignored.
    def load_mapping(file)
      @mapping = {}
      # read as binary on ruby 1.9+ so no transcoding is attempted
      mode = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"
      File.open(file, mode) do |f|
        f.each do |l|
          _, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
          @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
        end
      end
    end
  end
end