# An Encoding instance represents a character encoding usable in Ruby. It is
# defined as a constant under the Encoding namespace. It has a name and,
# optionally, aliases:
#
#     Encoding::US_ASCII.name  # => "US-ASCII"
#     Encoding::US_ASCII.names # => ["US-ASCII", "ASCII", "ANSI_X3.4-1968", "646"]
#
# A Ruby method that accepts an encoding as an argument will accept:
#
# *   An Encoding object.
# *   The name of an encoding.
# *   An alias for an encoding name.
#
# These are equivalent:
#
#     'foo'.encode(Encoding::US_ASCII) # Encoding object.
#     'foo'.encode('US-ASCII')         # Encoding name.
#     'foo'.encode('ASCII')            # Encoding alias.
#
# For a full discussion of encodings and their uses, see [the Encodings
# document](rdoc-ref:encodings.rdoc).
#
# Encoding::ASCII_8BIT is a special-purpose encoding that is usually used for a
# string of bytes, not a string of characters. But as the name indicates, its
# characters in the ASCII range are considered as ASCII characters. This is
# useful when you use other ASCII-compatible encodings.
#
class Encoding
  # Typed as returning a value of the same type as its single argument.
  #
  # NOTE(review): presumably the Marshal `_load` hook -- confirm.
  #
  def self._load: [T] (T) -> T

  # Returns the locale charmap name. It returns nil if no appropriate information.
  #
  #     Debian GNU/Linux
  #       LANG=C
  #         Encoding.locale_charmap  #=> "ANSI_X3.4-1968"
  #       LANG=ja_JP.EUC-JP
  #         Encoding.locale_charmap  #=> "EUC-JP"
  #
  #     SunOS 5
  #       LANG=C
  #         Encoding.locale_charmap  #=> "646"
  #       LANG=ja
  #         Encoding.locale_charmap  #=> "eucJP"
  #
  # The result is highly platform dependent. So
  # Encoding.find(Encoding.locale_charmap) may cause an error. If you need some
  # encoding object even for unknown locale, Encoding.find("locale") can be used.
  #
  def self.locale_charmap: () -> String

  # Returns the hash of available encoding alias and original encoding name.
  #
  #     Encoding.aliases
  #     #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
  #           "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
  #
  def self.aliases: () -> Hash[String, String]

  # Checks the compatibility of two objects.
  #
  # If the objects are both strings they are compatible when they are
  # concatenatable. The encoding of the concatenated string will be returned if
  # they are compatible, nil if they are not.
  #
  #     Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
  #     #=> #<Encoding:ISO-8859-1>
  #
  #     Encoding.compatible?(
  #       "\xa1".force_encoding("iso-8859-1"),
  #       "\xa1\xa1".force_encoding("euc-jp"))
  #     #=> nil
  #
  # If the objects are non-strings their encodings are compatible when they have
  # an encoding and:
  # *   Either encoding is US-ASCII compatible
  # *   One of the encodings is a 7-bit encoding
  #
  def self.compatible?: (untyped obj1, untyped obj2) -> Encoding?

  # Returns default external encoding.
  #
  # The default external encoding is used by default for strings created from the
  # following locations:
  #
  # *   CSV
  # *   File data read from disk
  # *   SDBM
  # *   StringIO
  # *   Zlib::GzipReader
  # *   Zlib::GzipWriter
  # *   String#inspect
  # *   Regexp#inspect
  #
  # While strings created from these locations will have this encoding, the
  # encoding may not be valid. Be sure to check String#valid_encoding?.
  #
  # File data written to disk will be transcoded to the default external encoding
  # when written, if default_internal is not nil.
  #
  # The default external encoding is initialized by the -E option. If -E isn't
  # set, it is initialized to UTF-8 on Windows and the locale on other operating
  # systems.
  #
  def self.default_external: () -> Encoding

  # Sets default external encoding. You should not set Encoding::default_external
  # in ruby code as strings created before changing the value may have a different
  # encoding from strings created after the value was changed. Instead, you should
  # use `ruby -E` to invoke ruby with the correct default_external.
  #
  # See Encoding::default_external for information on how the default external
  # encoding is used.
  #
  def self.default_external=: (Encoding enc) -> Encoding
                            | [T < _ToStr] (T enc) -> T

  # Returns default internal encoding. Strings will be transcoded to the default
  # internal encoding in the following places if the default internal encoding is
  # not nil:
  #
  # *   CSV
  # *   Etc.sysconfdir and Etc.systmpdir
  # *   File data read from disk
  # *   File names from Dir
  # *   Integer#chr
  # *   String#inspect and Regexp#inspect
  # *   Strings returned from Readline
  # *   Strings returned from SDBM
  # *   Time#zone
  # *   Values from ENV
  # *   Values in ARGV including $PROGRAM_NAME
  #
  # Additionally String#encode and String#encode! use the default internal
  # encoding if no encoding is given.
  #
  # The script encoding (__ENCODING__), not default_internal, is used as the
  # encoding of created strings.
  #
  # Encoding::default_internal is initialized with -E option or nil otherwise.
  #
  def self.default_internal: () -> Encoding?

  # Sets default internal encoding or removes default internal encoding when
  # passed nil. You should not set Encoding::default_internal in ruby code as
  # strings created before changing the value may have a different encoding from
  # strings created after the change. Instead you should use `ruby -E` to invoke
  # ruby with the correct default_internal.
  #
  # See Encoding::default_internal for information on how the default internal
  # encoding is used.
  #
  def self.default_internal=: (Encoding enc) -> Encoding
                            | [T < _ToStr] (T enc) -> T
                            | (nil) -> nil

  # Search the encoding with specified *name*. *name* should be a string.
  #
  #     Encoding.find("US-ASCII")  #=> #<Encoding:US-ASCII>
  #
  # Names which this method accept are encoding names and aliases including
  # following special aliases
  #
  # "external"
  # :   default external encoding
  # "internal"
  # :   default internal encoding
  # "locale"
  # :   locale encoding
  # "filesystem"
  # :   filesystem encoding
  #
  # An ArgumentError is raised when no encoding with *name*. Only
  # `Encoding.find("internal")` however returns nil when no encoding named
  # "internal", in other words, when Ruby has no default internal encoding.
  #
  def self.find: (encoding enc) -> Encoding?

  # Returns the list of loaded encodings.
  #
  #     Encoding.list
  #     #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
  #           #<Encoding:ISO-2022-JP (dummy)>]
  #
  #     Encoding.find("US-ASCII")
  #     #=> #<Encoding:US-ASCII>
  #
  #     Encoding.list
  #     #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
  #           #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
  #
  def self.list: () -> Array[Encoding]

  # Returns the list of available encoding names.
  #
  #     Encoding.name_list
  #     #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
  #           "ISO-8859-1", "Shift_JIS", "EUC-JP",
  #           "Windows-31J",
  #           "BINARY", "CP932", "eucJP"]
  #
  def self.name_list: () -> Array[String]

  # Returns whether ASCII-compatible or not.
  #
  #     Encoding::UTF_8.ascii_compatible?     #=> true
  #     Encoding::UTF_16BE.ascii_compatible?  #=> false
  #
  def ascii_compatible?: () -> bool

  # Returns true for dummy encodings. A dummy encoding is an encoding for which
  # character handling is not properly implemented. It is used for stateful
  # encodings.
  #
  #     Encoding::ISO_2022_JP.dummy?       #=> true
  #     Encoding::UTF_8.dummy?             #=> false
  #
  def dummy?: () -> bool

  # Returns a string which represents the encoding for programmers.
  #
  #     Encoding::UTF_8.inspect       #=> "#<Encoding:UTF-8>"
  #     Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
  #
  def inspect: () -> String

  # Returns the name of the encoding.
  #
  #     Encoding::UTF_8.name      #=> "UTF-8"
  #
  def name: () -> String

  # Returns the list of name and aliases of the encoding.
  #
  #     Encoding::WINDOWS_31J.names  #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
  #
  def names: () -> Array[String]

  # Returns the name of the encoding.
  #
  #     Encoding::UTF_8.name      #=> "UTF-8"
  #
  alias to_s name

  ANSI_X3_4_1968: Encoding

  ASCII: Encoding

  ASCII_8BIT: Encoding

  BIG5: Encoding

  BIG5_HKSCS: Encoding

  BIG5_HKSCS_2008: Encoding

  BIG5_UAO: Encoding

  BINARY: Encoding

  Big5: Encoding

  Big5_HKSCS: Encoding

  Big5_HKSCS_2008: Encoding

  Big5_UAO: Encoding

  CESU_8: Encoding

  CP1250: Encoding

  CP1251: Encoding

  CP1252: Encoding

  CP1253: Encoding

  CP1254: Encoding

  CP1255: Encoding

  CP1256: Encoding

  CP1257: Encoding

  CP1258: Encoding

  CP437: Encoding

  CP50220: Encoding

  CP50221: Encoding

  CP51932: Encoding

  CP65000: Encoding

  CP65001: Encoding

  CP737: Encoding

  CP775: Encoding

  CP850: Encoding

  CP852: Encoding

  CP855: Encoding

  CP857: Encoding

  CP860: Encoding

  CP861: Encoding

  CP862: Encoding

  CP863: Encoding

  CP864: Encoding

  CP865: Encoding

  CP866: Encoding

  CP869: Encoding

  CP874: Encoding

  CP878: Encoding

  CP932: Encoding

  CP936: Encoding

  CP949: Encoding

  CP950: Encoding

  CP951: Encoding

  CSWINDOWS31J: Encoding

  CsWindows31J: Encoding

  EBCDIC_CP_US: Encoding

  EMACS_MULE: Encoding

  EUCCN: Encoding

  EUCJP: Encoding

  EUCJP_MS: Encoding

  EUCKR: Encoding

  EUCTW: Encoding

  EUC_CN: Encoding

  EUC_JISX0213: Encoding

  EUC_JIS_2004: Encoding

  EUC_JP: Encoding

  EUC_JP_MS: Encoding

  EUC_KR: Encoding

  EUC_TW: Encoding

  Emacs_Mule: Encoding

  EucCN: Encoding

  EucJP: Encoding

  EucJP_ms: Encoding

  EucKR: Encoding

  EucTW: Encoding

  GB12345: Encoding

  GB18030: Encoding

  GB1988: Encoding

  GB2312: Encoding

  GBK: Encoding

  IBM037: Encoding

  IBM437: Encoding

  IBM737: Encoding

  IBM720: Encoding

  CP720: Encoding

  IBM775: Encoding

  IBM850: Encoding

  IBM852: Encoding

  IBM855: Encoding

  IBM857: Encoding

  IBM860: Encoding

  IBM861: Encoding

  IBM862: Encoding

  IBM863: Encoding

  IBM864: Encoding

  IBM865: Encoding

  IBM866: Encoding

  IBM869: Encoding

  ISO2022_JP: Encoding

  ISO2022_JP2: Encoding

  ISO8859_1: Encoding

  ISO8859_10: Encoding

  ISO8859_11: Encoding

  ISO8859_13: Encoding

  ISO8859_14: Encoding

  ISO8859_15: Encoding

  ISO8859_16: Encoding

  ISO8859_2: Encoding

  ISO8859_3: Encoding

  ISO8859_4: Encoding

  ISO8859_5: Encoding

  ISO8859_6: Encoding

  ISO8859_7: Encoding

  ISO8859_8: Encoding

  ISO8859_9: Encoding

  ISO_2022_JP: Encoding

  ISO_2022_JP_2: Encoding

  ISO_2022_JP_KDDI: Encoding

  ISO_8859_1: Encoding

  ISO_8859_10: Encoding

  ISO_8859_11: Encoding

  ISO_8859_13: Encoding

  ISO_8859_14: Encoding

  ISO_8859_15: Encoding

  ISO_8859_16: Encoding

  ISO_8859_2: Encoding

  ISO_8859_3: Encoding

  ISO_8859_4: Encoding

  ISO_8859_5: Encoding

  ISO_8859_6: Encoding

  ISO_8859_7: Encoding

  ISO_8859_8: Encoding

  ISO_8859_9: Encoding

  KOI8_R: Encoding

  KOI8_U: Encoding

  MACCENTEURO: Encoding

  MACCROATIAN: Encoding

  MACCYRILLIC: Encoding

  MACGREEK: Encoding

  MACICELAND: Encoding

  MACJAPAN: Encoding

  MACJAPANESE: Encoding

  MACROMAN: Encoding

  MACROMANIA: Encoding

  MACTHAI: Encoding

  MACTURKISH: Encoding

  MACUKRAINE: Encoding

  MacCentEuro: Encoding

  MacCroatian: Encoding

  MacCyrillic: Encoding

  MacGreek: Encoding

  MacIceland: Encoding

  MacJapan: Encoding

  MacJapanese: Encoding

  MacRoman: Encoding

  MacRomania: Encoding

  MacThai: Encoding

  MacTurkish: Encoding

  MacUkraine: Encoding

  PCK: Encoding

  SHIFT_JIS: Encoding

  SJIS: Encoding

  SJIS_DOCOMO: Encoding

  SJIS_DoCoMo: Encoding

  SJIS_KDDI: Encoding

  SJIS_SOFTBANK: Encoding

  SJIS_SoftBank: Encoding

  STATELESS_ISO_2022_JP: Encoding

  STATELESS_ISO_2022_JP_KDDI: Encoding

  Shift_JIS: Encoding

  Stateless_ISO_2022_JP: Encoding

  Stateless_ISO_2022_JP_KDDI: Encoding

  TIS_620: Encoding

  UCS_2BE: Encoding

  UCS_4BE: Encoding

  UCS_4LE: Encoding

  US_ASCII: Encoding

  UTF8_DOCOMO: Encoding

  UTF8_DoCoMo: Encoding

  UTF8_KDDI: Encoding

  UTF8_MAC: Encoding

  UTF8_SOFTBANK: Encoding

  UTF8_SoftBank: Encoding

  UTF_16: Encoding

  UTF_16BE: Encoding

  UTF_16LE: Encoding

  UTF_32: Encoding

  UTF_32BE: Encoding

  UTF_32LE: Encoding

  UTF_7: Encoding

  UTF_8: Encoding

  UTF_8_HFS: Encoding

  UTF_8_MAC: Encoding

  WINDOWS_1250: Encoding

  WINDOWS_1251: Encoding

  WINDOWS_1252: Encoding

  WINDOWS_1253: Encoding

  WINDOWS_1254: Encoding

  WINDOWS_1255: Encoding

  WINDOWS_1256: Encoding

  WINDOWS_1257: Encoding

  WINDOWS_1258: Encoding

  WINDOWS_31J: Encoding

  WINDOWS_874: Encoding

  Windows_1250: Encoding

  Windows_1251: Encoding

  Windows_1252: Encoding

  Windows_1253: Encoding

  Windows_1254: Encoding

  Windows_1255: Encoding

  Windows_1256: Encoding

  Windows_1257: Encoding

  Windows_1258: Encoding

  Windows_31J: Encoding

  Windows_874: Encoding

  # Raised by Encoding and String methods when the source encoding is incompatible
  # with the target encoding.
  #
  class CompatibilityError < EncodingError
  end

  # Raised by transcoding methods when a named encoding does not correspond with a
  # known converter.
  #
  class ConverterNotFoundError < EncodingError
  end

  # Raised by Encoding and String methods when the string being transcoded
  # contains a byte invalid for either the source or target encoding.
  #
  class InvalidByteSequenceError < EncodingError
    # Returns the destination encoding as an encoding object.
    #
    def destination_encoding: () -> Encoding

    # Returns the destination encoding name as a string.
    #
    def destination_encoding_name: () -> String

    # Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
    #
    #     ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
    #     begin
    #       ec.convert("abc\xA1\xFFdef")
    #     rescue Encoding::InvalidByteSequenceError
    #       p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
    #       puts $!.error_bytes.dump          #=> "\xA1"
    #       puts $!.readagain_bytes.dump      #=> "\xFF"
    #     end
    #
    def error_bytes: () -> String

    # Returns true if the invalid byte sequence error is caused by premature end of
    # string.
    #
    #     ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
    #
    #     begin
    #       ec.convert("abc\xA1z")
    #     rescue Encoding::InvalidByteSequenceError
    #       p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
    #       p $!.incomplete_input?    #=> false
    #     end
    #
    #     begin
    #       ec.convert("abc\xA1")
    #       ec.finish
    #     rescue Encoding::InvalidByteSequenceError
    #       p $!      #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
    #       p $!.incomplete_input?    #=> true
    #     end
    #
    def incomplete_input?: () -> bool

    # Returns the bytes to be read again when Encoding::InvalidByteSequenceError
    # occurs.
    #
    def readagain_bytes: () -> String

    # Returns the source encoding as an encoding object.
    #
    # Note that the result may not be equal to the source encoding of the encoding
    # converter if the conversion has multiple steps.
    #
    #     ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
    #     begin
    #       ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
    #     rescue Encoding::UndefinedConversionError
    #       p $!.source_encoding              #=> #<Encoding:UTF-8>
    #       p $!.destination_encoding         #=> #<Encoding:EUC-JP>
    #       p $!.source_encoding_name         #=> "UTF-8"
    #       p $!.destination_encoding_name    #=> "EUC-JP"
    #     end
    #
    def source_encoding: () -> Encoding

    # Returns the source encoding name as a string.
    #
    def source_encoding_name: () -> String
  end

  # Raised by Encoding and String methods when a transcoding operation fails.
  #
  class UndefinedConversionError < EncodingError
    # Returns the destination encoding as an encoding object.
    #
    def destination_encoding: () -> Encoding

    # Returns the destination encoding name as a string.
    #
    def destination_encoding_name: () -> String

    # Returns the one-character string which caused
    # Encoding::UndefinedConversionError.
    #
    #     ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
    #     begin
    #       ec.convert("\xa0")
    #     rescue Encoding::UndefinedConversionError
    #       puts $!.error_char.dump   #=> "\xC2\xA0"
    #       p $!.error_char.encoding  #=> #<Encoding:UTF-8>
    #     end
    #
    def error_char: () -> String

    # Returns the source encoding as an encoding object.
    #
    # Note that the result may not be equal to the source encoding of the encoding
    # converter if the conversion has multiple steps.
    #
    #     ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
    #     begin
    #       ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
    #     rescue Encoding::UndefinedConversionError
    #       p $!.source_encoding              #=> #<Encoding:UTF-8>
    #       p $!.destination_encoding         #=> #<Encoding:EUC-JP>
    #       p $!.source_encoding_name         #=> "UTF-8"
    #       p $!.destination_encoding_name    #=> "EUC-JP"
    #     end
    #
    def source_encoding: () -> Encoding

    # Returns the source encoding name as a string.
    #
    def source_encoding_name: () -> String
  end
end

# Encoding conversion class.
#
class Encoding::Converter < Object
  type encoding = String | Encoding

  type decorator = "universal_newline"
                 | "crlf_newline"
                 | "cr_newline"
                 | "xml_text_escape"
                 | "xml_attr_content_escape"
                 | "xml_attr_quote"

  type conversion_path = Array[[encoding, encoding] | decorator]

  type convert_result = :invalid_byte_sequence
                      | :incomplete_input
                      | :undefined_conversion
                      | :after_output
                      | :destination_buffer_full
                      | :source_buffer_empty
                      | :finished

  # Returns the corresponding ASCII compatible encoding.
  #
  # Returns nil if the argument is an ASCII compatible encoding.
  #
  # "corresponding ASCII compatible encoding" is an ASCII compatible encoding
  # which can represent exactly the same characters as the given ASCII
  # incompatible encoding. So, no conversion undefined error occurs when
  # converting between the two encodings.
  #
  #     Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
  #     Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
  #     Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
  #
  def self.asciicompat_encoding: (encoding enc) -> Encoding?

  # Returns a conversion path.
  #
  #     p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
  #     #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  #     #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
  #
  #     p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
  #     or
  #     p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
  #     #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  #     #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
  #     #    "universal_newline"]
  #
  #     p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
  #     or
  #     p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
  #     #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  #     #    "universal_newline",
  #     #    [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
  #
  def self.search_convpath: (
                              encoding source,
                              encoding destination,
                              ?newline: :universal | :crlf | :cr,
                              ?universal_newline: bool,
                              ?crlf_newline: bool,
                              ?cr_newline: bool,
                              ?xml: :text | :attr
                            ) -> conversion_path

  # Compares this converter with another converter and returns a boolean.
  #
  def ==: (self) -> bool

  # Convert source_string and return destination_string.
  #
  # source_string is assumed as a part of source. i.e.  :partial_input=>true is
  # specified internally. finish method should be used last.
  #
  #     ec = Encoding::Converter.new("utf-8", "euc-jp")
  #     puts ec.convert("\u3042").dump     #=> "\xA4\xA2"
  #     puts ec.finish.dump                #=> ""
  #
  #     ec = Encoding::Converter.new("euc-jp", "utf-8")
  #     puts ec.convert("\xA4").dump       #=> ""
  #     puts ec.convert("\xA2").dump       #=> "\xE3\x81\x82"
  #     puts ec.finish.dump                #=> ""
  #
  #     ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  #     puts ec.convert("\xE3").dump       #=> "".force_encoding("ISO-2022-JP")
  #     puts ec.convert("\x81").dump       #=> "".force_encoding("ISO-2022-JP")
  #     puts ec.convert("\x82").dump       #=> "\e$B$\"".force_encoding("ISO-2022-JP")
  #     puts ec.finish.dump                #=> "\e(B".force_encoding("ISO-2022-JP")
  #
  # If a conversion error occurs, Encoding::UndefinedConversionError or
  # Encoding::InvalidByteSequenceError is raised. Encoding::Converter#convert
  # doesn't supply methods to recover or restart from these exceptions. When you
  # want to handle these conversion errors, use
  # Encoding::Converter#primitive_convert.
  #
  def convert: (String source) -> String

  # Returns the conversion path of ec.
  #
  # The result is an array of conversions.
  #
  #     ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
  #     p ec.convpath
  #     #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
  #     #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
  #     #    "crlf_newline"]
  #
  # Each element of the array is a pair of encodings or a string. A pair means an
  # encoding conversion. A string means a decorator.
  #
  # In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means a
  # converter from ISO-8859-1 to UTF-8. "crlf_newline" means newline converter
  # from LF to CRLF.
  #
  def convpath: () -> conversion_path

  # Returns the destination encoding as an Encoding object.
  #
  def destination_encoding: () -> Encoding

  # Finishes the converter. It returns the last part of the converted string.
  #
  #     ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  #     p ec.convert("\u3042")     #=> "\e$B$\""
  #     p ec.finish                #=> "\e(B"
  #
  def finish: () -> String

  # Inserts string into the encoding converter. The string will be converted to
  # the destination encoding and output on later conversions.
  #
  # If the destination encoding is stateful, string is converted according to the
  # state and the state is updated.
  #
  # This method should be used only when a conversion error occurs.
  #
  #     ec = Encoding::Converter.new("utf-8", "iso-8859-1")
  #     src = "HIRAGANA LETTER A is \u{3042}."
  #     dst = ""
  #     p ec.primitive_convert(src, dst)    #=> :undefined_conversion
  #     puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is ", "."]
  #     ec.insert_output("<err>")
  #     p ec.primitive_convert(src, dst)    #=> :finished
  #     puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is <err>.", ""]
  #
  #     ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
  #     src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
  #     dst = ""
  #     p ec.primitive_convert(src, dst)    #=> :undefined_conversion
  #     puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
  #     ec.insert_output "?"                # state change required to output "?".
  #     p ec.primitive_convert(src, dst)    #=> :finished
  #     puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
  #
  def insert_output: (String) -> nil

  # Returns a printable version of *ec*
  #
  #     ec = Encoding::Converter.new("iso-8859-1", "utf-8")
  #     puts ec.inspect    #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
  #
  def inspect: () -> String

  # Returns an exception object for the last conversion. Returns nil if the last
  # conversion did not produce an error.
  #
  # "error" means that Encoding::InvalidByteSequenceError and
  # Encoding::UndefinedConversionError for Encoding::Converter#convert and
  # :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
  # Encoding::Converter#primitive_convert.
  #
  #     ec = Encoding::Converter.new("utf-8", "iso-8859-1")
  #     p ec.primitive_convert(src="\xf1abcd", dst="")       #=> :invalid_byte_sequence
  #     p ec.last_error      #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
  #     p ec.primitive_convert(src, dst, nil, 1)             #=> :destination_buffer_full
  #     p ec.last_error      #=> nil
  #
  # Declared as a single signature with a union return: both error classes come
  # from the same zero-argument call, so separate overloads could not be told apart.
  #
  def last_error: () -> (Encoding::InvalidByteSequenceError | Encoding::UndefinedConversionError)?

  # possible opt elements:
  #     hash form:
  #       :partial_input => true           # source buffer may be part of larger source
  #       :after_output => true            # stop conversion after output before input
  #     integer form:
  #       Encoding::Converter::PARTIAL_INPUT
  #       Encoding::Converter::AFTER_OUTPUT
  #
  # possible results:
  #     :invalid_byte_sequence
  #     :incomplete_input
  #     :undefined_conversion
  #     :after_output
  #     :destination_buffer_full
  #     :source_buffer_empty
  #     :finished
  #
  # primitive_convert converts source_buffer into destination_buffer.
  #
  # source_buffer should be a string or nil. nil means an empty string.
  #
  # destination_buffer should be a string.
  #
  # destination_byteoffset should be an integer or nil. nil means the end of
  # destination_buffer. If it is omitted, nil is assumed.
  #
  # destination_bytesize should be an integer or nil. nil means unlimited. If it
  # is omitted, nil is assumed.
  #
  # opt should be nil, a hash or an integer. nil means no flags. If it is omitted,
  # nil is assumed.
  #
  # primitive_convert converts the content of source_buffer from beginning and
  # store the result into destination_buffer.
  #
  # destination_byteoffset and destination_bytesize specify the region which the
  # converted result is stored. destination_byteoffset specifies the start
  # position in destination_buffer in bytes. If destination_byteoffset is nil,
  # destination_buffer.bytesize is used for appending the result.
  # destination_bytesize specifies maximum number of bytes. If
  # destination_bytesize is nil, destination size is unlimited. After conversion,
  # destination_buffer is resized to destination_byteoffset + actually produced
  # number of bytes. Also destination_buffer's encoding is set to
  # destination_encoding.
  #
  # primitive_convert drops the converted part of source_buffer. the dropped part
  # is converted in destination_buffer or buffered in Encoding::Converter object.
  #
  # primitive_convert stops conversion when one of following condition met.
  # *   invalid byte sequence found in source buffer (:invalid_byte_sequence)
  #     `primitive_errinfo` and `last_error` methods returns the detail of the
  #     error.
  # *   unexpected end of source buffer (:incomplete_input) this occur only when
  #     :partial_input is not specified. `primitive_errinfo` and `last_error`
  #     methods returns the detail of the error.
  # *   character not representable in output encoding (:undefined_conversion)
  #     `primitive_errinfo` and `last_error` methods returns the detail of the
  #     error.
  # *   after some output is generated, before input is done (:after_output) this
  #     occur only when :after_output is specified.
  # *   destination buffer is full (:destination_buffer_full) this occur only when
  #     destination_bytesize is non-nil.
  # *   source buffer is empty (:source_buffer_empty) this occur only when
  #     :partial_input is specified.
  # *   conversion is finished (:finished)
  #
  # example:
  #     ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
  #     ret = ec.primitive_convert(src="pi", dst="", nil, 100)
  #     p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
  #
  #     ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
  #     ret = ec.primitive_convert(src="pi", dst="", nil, 1)
  #     p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
  #     ret = ec.primitive_convert(src, dst="", nil, 1)
  #     p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
  #     ret = ec.primitive_convert(src, dst="", nil, 1)
  #     p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
  #     ret = ec.primitive_convert(src, dst="", nil, 1)
  #     p [ret, src, dst] #=> [:finished, "", "i"]
  #
  # The integer-form overload also accepts an explicit nil for opt, matching the
  # "opt should be nil, a hash or an integer" contract described above.
  #
  def primitive_convert: (
                           String? source,
                           String destination,
                           ?Integer? destination_byteoffset,
                           ?Integer? destination_bytesize,
                           ?partial_input: bool,
                           ?after_output: bool
                         ) -> convert_result
                       | (
                           String? source,
                           String destination,
                           ?Integer? destination_byteoffset,
                           ?Integer? destination_bytesize,
                           ?Integer? opt
                         ) -> convert_result

  # primitive_errinfo returns important information regarding the last error as a
  # 5-element array:
  #
  #     [result, enc1, enc2, error_bytes, readagain_bytes]
  #
  # result is the last result of primitive_convert.
  #
  # Other elements are only meaningful when result is :invalid_byte_sequence,
  # :incomplete_input or :undefined_conversion.
  #
  # enc1 and enc2 indicate a conversion step as a pair of strings. For example, a
  # converter from EUC-JP to ISO-8859-1 converts a string as follows: EUC-JP ->
  # UTF-8 -> ISO-8859-1. So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or
  # ["UTF-8", "ISO-8859-1"].
  #
  # error_bytes and readagain_bytes indicate the byte sequences which caused the
  # error. error_bytes is discarded portion. readagain_bytes is buffered portion
  # which is read again on next conversion.
  #
  # Example:
  #
  #     # \xff is invalid as EUC-JP.
  #     ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
  #     ec.primitive_convert(src="\xff", dst="", nil, 10)
  #     p ec.primitive_errinfo
  #     #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
  #
  #     # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
  #     # Since this error occurs in UTF-8 to ISO-8859-1 conversion,
  #     # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
  #     ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  #     ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
  #     p ec.primitive_errinfo
  #     #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
  #
  #     # partial character is invalid
  #     ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  #     ec.primitive_convert(src="\xa4", dst="", nil, 10)
  #     p ec.primitive_errinfo
  #     #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
  #
  #     # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
  #     # partial characters.
  #     ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
  #     ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
  #     p ec.primitive_errinfo
  #     #=> [:source_buffer_empty, nil, nil, nil, nil]
  #
  #     # \xd8\x00\x00@ is invalid as UTF-16BE because
  #     # no low surrogate after high surrogate (\xd8\x00).
  #     # It is detected by 3rd byte (\00) which is part of next character.
  #     # So the high surrogate (\xd8\x00) is discarded and
  #     # the 3rd byte is read again later.
  #     # Since the byte is buffered in ec, it is dropped from src.
  #     ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
  #     ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
  #     p ec.primitive_errinfo
  #     #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
  #     p src
  #     #=> "@"
  #
  #     # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
  #     # The problem is detected by 4th byte.
  #     ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
  #     ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
  #     p ec.primitive_errinfo
  #     #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
  #     p src
  #     #=> ""
  #
  def primitive_errinfo: () -> [convert_result, String?, String?, String?, String?]

  # Put back the bytes which will be converted.
  #
  # The bytes are caused by invalid_byte_sequence error. When
  # invalid_byte_sequence error, some bytes are discarded and some bytes are
  # buffered to be converted later. The latter bytes can be put back. It can be
  # observed by Encoding::InvalidByteSequenceError#readagain_bytes and
  # Encoding::Converter#primitive_errinfo.
  #
  #     ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
  #     src = "\x00\xd8\x61\x00"
  #     dst = ""
  #     p ec.primitive_convert(src, dst)   #=> :invalid_byte_sequence
  #     p ec.primitive_errinfo     #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
  #     p ec.putback               #=> "a\x00"
  #     p ec.putback               #=> ""          # no more bytes to put back
  #
  def putback: (?Integer max_numbytes) -> String

  # Returns the replacement string.
  #
  #     ec = Encoding::Converter.new("euc-jp", "us-ascii")
  #     p ec.replacement    #=> "?"
  #
  #     ec = Encoding::Converter.new("euc-jp", "utf-8")
  #     p ec.replacement    #=> "\uFFFD"
  #
  def replacement: () -> String

  # Sets the replacement string.
  #
  #     ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
  #     ec.replacement = "<undef>"
  #     p ec.convert("a \u3042 b")      #=> "a <undef> b"
  #
  def replacement=: (String str) -> String

  # Returns the source encoding as an Encoding object.
  #
  def source_encoding: () -> Encoding

  private

  # possible options elements:
  #     hash form:
  #       :invalid => nil            # raise error on invalid byte sequence (default)
  #       :invalid => :replace       # replace invalid byte sequence
  #       :undef => nil              # raise error on undefined conversion (default)
  #       :undef => :replace         # replace undefined conversion
  #       :replace => string         # replacement string ("?" or "\uFFFD" if not specified)
  #       :newline => :universal     # decorator for converting CRLF and CR to LF
  #       :newline => :lf            # decorator for converting CRLF and CR to LF when writing
  #       :newline => :crlf          # decorator for converting LF to CRLF
  #       :newline => :cr            # decorator for converting LF to CR
  #       :universal_newline => true # decorator for converting CRLF and CR to LF
  #       :crlf_newline => true      # decorator for converting LF to CRLF
  #       :cr_newline => true        # decorator for converting LF to CR
  #       :lf_newline => true        # decorator for converting CRLF and CR to LF when writing
  #       :xml => :text              # escape as XML CharData.
  #       :xml => :attr              # escape as XML AttValue
  #     integer form:
  #       Encoding::Converter::INVALID_REPLACE
  #       Encoding::Converter::UNDEF_REPLACE
  #       Encoding::Converter::UNDEF_HEX_CHARREF
  #       Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
  #       Encoding::Converter::LF_NEWLINE_DECORATOR
  #       Encoding::Converter::CRLF_NEWLINE_DECORATOR
  #       Encoding::Converter::CR_NEWLINE_DECORATOR
  #       Encoding::Converter::XML_TEXT_DECORATOR
  #       Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
  #       Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
  #
  # Encoding::Converter.new creates an instance of Encoding::Converter.
  #
  # Source_encoding and destination_encoding should be a string or Encoding
  # object.
  #
  # opt should be nil, a hash or an integer.
  #
  # convpath should be an array. convpath may contain
  # *   two-element arrays which contain encodings or encoding names, or
  # *   strings representing decorator names.
  #
  # Encoding::Converter.new optionally takes an option. The option should be a
  # hash or an integer. The option hash can contain :invalid => nil, etc. The
  # option integer should be logical-or of constants such as
  # Encoding::Converter::INVALID_REPLACE, etc.
  #
  # :invalid => nil
  # :   Raise error on invalid byte sequence.  This is a default behavior.
  # :invalid => :replace
  # :   Replace invalid byte sequence by replacement string.
  # :undef => nil
  # :   Raise an error if a character in source_encoding is not defined in
  #     destination_encoding.  This is a default behavior.
  # :undef => :replace
  # :   Replace undefined character in destination_encoding with replacement
  #     string.
  # :replace => string
  # :   Specify the replacement string. If not specified, "\uFFFD" is used for
  #     Unicode encodings and "?" for others.
  # :universal_newline => true
  # :   Convert CRLF and CR to LF.
  # :crlf_newline => true
  # :   Convert LF to CRLF.
  # :cr_newline => true
  # :   Convert LF to CR.
  # :lf_newline => true
  # :   Convert CRLF and CR to LF (when writing).
  # :xml => :text
  # :   Escape as XML CharData. This form can be used as an HTML 4.0 #PCDATA.
  #     *   '&' -> '&amp;'
  #     *   '<' -> '&lt;'
  #     *   '>' -> '&gt;'
  #     *   undefined characters in destination_encoding -> hexadecimal CharRef
  #         such as &#xHH;
  #
  # :xml => :attr
  # :   Escape as XML AttValue. The converted result is quoted as "...". This form
  #     can be used as an HTML 4.0 attribute value.
  #     *   '&' -> '&amp;'
  #     *   '<' -> '&lt;'
  #     *   '>' -> '&gt;'
  #     *   '"' -> '&quot;'
  #     *   undefined characters in destination_encoding -> hexadecimal CharRef
  #         such as &#xHH;
  #
  # Examples:
  #     # UTF-16BE to UTF-8
  #     ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
  #
  #     # Usually, decorators such as newline conversion are inserted last.
  #     ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
  #     p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
  #                   #    "universal_newline"]
  #
  #     # But, if the last encoding is ASCII incompatible,
  #     # decorators are inserted before the last conversion.
  #     ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
  #     p ec.convpath #=> ["crlf_newline",
  #                   #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
  #
  #     # Conversion path can be specified directly.
  #     ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
  #     p ec.convpath #=> ["universal_newline",
  #                   #    [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
  #                   #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
  #
  # The keyword overload accepts `newline: :lf` and `lf_newline:` so that every
  # hash-form option listed above is expressible in the signature.
  #
  def initialize: (encoding source, encoding destination) -> void
                | (
                    encoding source,
                    encoding destination,
                    ?invalid: :replace | nil,
                    ?undef: :replace | nil,
                    ?replace: String,
                    ?newline: :universal | :lf | :crlf | :cr,
                    ?universal_newline: bool,
                    ?crlf_newline: bool,
                    ?cr_newline: bool,
                    ?lf_newline: bool,
                    ?xml: :text | :attr
                  ) -> void
                | (encoding source, encoding destination, Integer opts) -> void
                | (conversion_path convpath) -> void
end

# AFTER_OUTPUT
#
# Stop converting after some output is complete but before all of the input was
# consumed. See primitive_convert for an example.
#
Encoding::Converter::AFTER_OUTPUT: Integer

# CRLF_NEWLINE_DECORATOR
#
# Decorator for converting LF to CRLF
#
Encoding::Converter::CRLF_NEWLINE_DECORATOR: Integer

# CR_NEWLINE_DECORATOR
#
# Decorator for converting LF to CR
#
Encoding::Converter::CR_NEWLINE_DECORATOR: Integer

# INVALID_MASK
#
# Mask for invalid byte sequences
#
Encoding::Converter::INVALID_MASK: Integer

# INVALID_REPLACE
#
# Replace invalid byte sequences
#
Encoding::Converter::INVALID_REPLACE: Integer

# PARTIAL_INPUT
#
# Indicates the source may be part of a larger string.  See primitive_convert
# for an example.
#
Encoding::Converter::PARTIAL_INPUT: Integer

# UNDEF_HEX_CHARREF
#
# Replace byte sequences that are undefined in the destination encoding with an
# XML hexadecimal character reference.  This is valid for XML conversion.
#
Encoding::Converter::UNDEF_HEX_CHARREF: Integer

# UNDEF_MASK
#
# Mask for a valid character in the source encoding but no related character(s)
# in destination encoding.
#
Encoding::Converter::UNDEF_MASK: Integer

# UNDEF_REPLACE
#
# Replace byte sequences that are undefined in the destination encoding.
#
Encoding::Converter::UNDEF_REPLACE: Integer

# UNIVERSAL_NEWLINE_DECORATOR
#
# Decorator for converting CRLF and CR to LF
#
Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR: Integer

# XML_ATTR_CONTENT_DECORATOR
#
# Escape as XML AttValue
#
Encoding::Converter::XML_ATTR_CONTENT_DECORATOR: Integer

# XML_ATTR_QUOTE_DECORATOR
#
# Escape as XML AttValue
#
Encoding::Converter::XML_ATTR_QUOTE_DECORATOR: Integer

# XML_TEXT_DECORATOR
#
# Escape as XML CharData
#
Encoding::Converter::XML_TEXT_DECORATOR: Integer