Sha256: d714fe54bd4ca64e67d67325fc3e609f29e35f3dd677d47e047b3dd4f6b350fc

Contents?: true

Size: 1.48 KB

Versions: 6

Compression:

Stored size: 1.48 KB

Contents

##
# This class Reads individual characters from files using UTF-8 encoding
# This deals with two main concerns:
#   - Variable byte length of characters
#   - Reducing the number of read operations by loading bytes in chunks

class FormatParser::UTF8Reader
  READ_CHUNK_SIZE = 128

  class UTF8CharReaderError < StandardError
  end

  def initialize(io)
    @io = io
    @chunk = ""
    @index = 0
    @eof = false
  end

  def read_char
    first_byte = read_byte
    return if first_byte.nil?

    char_length = assess_char_length(first_byte)
    as_bytes = Array.new(char_length) do |i|
      next first_byte if i == 0
      read_byte
    end

    char = as_bytes.pack('c*').force_encoding('UTF-8')
    raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?

    char
  rescue TypeError
    raise UTF8CharReaderError, "Invalid UTF-8 character"
  end

  private

  def read_byte
    manage_data_chunk
    return if @chunk.nil?
    byte = @chunk.bytes[@index]
    @index += 1 unless byte.nil?
    byte
  end

  def manage_data_chunk
    return if @index < @chunk.length
    @chunk = @io.read(READ_CHUNK_SIZE)
    @chunk ||= ""
    @index = 0
    @eof = true if @chunk.nil? or @chunk.length < READ_CHUNK_SIZE
  end

  def assess_char_length(first_byte)
    # 0_______ (1 byte)
    # 110_____ (2 bytes) 192
    # 1110____ (3 bytes) 224
    # 11110___ (4 bytes) 240
    case first_byte
    when 240.. then 4
    when 224..239 then 3
    when 192..223 then 2
    else 1
    end
  end
end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
format_parser-2.10.0 lib/utf8_reader.rb
format_parser-2.9.0 lib/utf8_reader.rb
format_parser-2.8.0 lib/utf8_reader.rb
format_parser-2.7.2 lib/utf8_reader.rb
format_parser-2.7.1 lib/utf8_reader.rb
format_parser-2.7.0 lib/utf8_reader.rb