Sha256: 6659ce7f1d39b5d6b75ab8846cb867ce93d1948459df0f47317bd41e0b124251

Contents?: true

Size: 1.86 KB

Versions: 23

Compression:

Stored size: 1.86 KB

Contents

require 'strscan'

# Byte-level UTF-8 validation and in-place cleaning built on StringScanner.
#
# The checks are structural only: a leader byte announces a 2, 3 or 4 byte
# sequence and every following byte must look like 10xxxxxx. Overlong
# encodings, surrogate code points and leaders 0xF5-0xF7 are NOT rejected.
module UTF8Util
  # Matches any single byte with the high bit set (i.e. any non-ASCII byte).
  # The /n flag makes this a byte-oriented regexp, so the literal is accepted
  # whatever the source encoding is and can be matched against binary strings.
  HIGH_BIT_RANGE = /[\x80-\xff]/n

  # Check if this String is valid UTF-8
  #
  # str - The String to inspect. It is not modified.
  #
  # Returns true or false.
  def self.valid?(str)
    # On encoding-aware Rubies scan a binary view so matching is per byte and
    # never raises on malformed input; Ruby 1.8 strings are already bytes.
    bytes = str.respond_to?(:force_encoding) ? str.dup.force_encoding('BINARY') : str
    sc = StringScanner.new(bytes)

    while sc.skip_until(HIGH_BIT_RANGE)
      # skip_until stops just past the matched byte; step back onto it.
      sc.pos -= 1

      return false unless sequence_length(sc)
    end

    true
  end

  # Replace invalid UTF-8 character sequences with a replacement character
  #
  # str - The String to clean. It is modified in place.
  #
  # NOTE(review): REPLACEMENT_CHAR is not defined in this module body and must
  # be provided elsewhere in UTF8Util. Scanning resumes at pos + 1 after each
  # replacement, so it is presumably a single-byte string - TODO confirm.
  #
  # Returns self as valid UTF-8.
  def self.clean!(str)
    # Work on raw bytes while scanning, then hand the string back in the
    # encoding the caller gave us (no-op on Ruby 1.8).
    original_encoding = str.respond_to?(:encoding) ? str.encoding : nil
    str.force_encoding('BINARY') if original_encoding

    begin
      sc = StringScanner.new(str)
      while sc.skip_until(HIGH_BIT_RANGE)
        pos = sc.pos = sc.pos - 1

        # Only the offending leader byte is replaced; the bytes after it are
        # re-examined on the next iteration.
        str[pos] = REPLACEMENT_CHAR unless sequence_length(sc)
      end
    ensure
      str.force_encoding(original_encoding) if original_encoding
    end

    str
  end

  # Validate the UTF-8 sequence at the current scanner position.
  #
  # scanner - StringScanner instance so we can advance the pointer as we verify.
  #
  # On success the scanner is left just past the whole sequence. On failure it
  # is left just past the leader byte (or untouched at end of input).
  #
  # Returns The length in bytes of this UTF-8 sequence, false if invalid
  # (including ASCII bytes, stray continuation bytes and truncated sequences).
  def self.sequence_length(scanner)
    leader = read_byte(scanner)
    return false unless leader

    length =
      if (leader >> 5) == 0x06      # 110xxxxx
        2
      elsif (leader >> 4) == 0x0e   # 1110xxxx
        3
      elsif (leader >> 3) == 0x1e   # 11110xxx
        4
      end
    return false unless length

    # Remember where to resume so a failed or truncated sequence rewinds to
    # the same place no matter how many continuation bytes were consumed.
    resume_at = scanner.pos
    (length - 1).times do
      next if check_next_sequence(scanner)

      scanner.pos = resume_at
      return false
    end

    length
  end

  # Read another byte off the scanner, moving the scan position forward one
  # place, and test whether it is a continuation byte (10xxxxxx).
  #
  # Internal helper. It stays callable from outside because a bare `private`
  # never applied to `def self.` methods.
  #
  # Returns true for a continuation byte, false otherwise or at end of input.
  def self.check_next_sequence(scanner)
    byte = read_byte(scanner)
    !byte.nil? && (byte >> 6) == 0x2
  end

  # Consume one byte from the scanner.
  #
  # unpack('C') yields an Integer on every Ruby version, unlike String#[]
  # which returns a one-character String on Ruby 1.9+.
  #
  # Returns the byte as an Integer, or nil at end of input.
  def self.read_byte(scanner)
    raw = scanner.get_byte
    raw && raw.unpack('C').first
  end
  private_class_method :read_byte
end

Version data entries

23 entries across 23 versions & 7 rubygems

Version Path
resque_sqs-1.25.2 lib/resque_sqs/vendor/utf8_util/utf8_util_18.rb
enju_leaf-1.2.1 vendor/bundle/ruby/2.3/gems/resque-1.27.4/lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.27.4 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.27.3 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.27.2 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.27.1 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.27.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resqueue-1.0.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-master-0.0.3 lib/resque/vendor/utf8_util/utf8_util_18.rb
ish_lib_manager-0.0.1 test/dummy/vendor/bundle/ruby/2.3.0/gems/resque-1.26.0/lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.26.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.25.2 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.26.pre.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque_signal_from_child-1.25.1 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.25.1 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.25.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.25.0.pre lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.24.1 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.24.0 lib/resque/vendor/utf8_util/utf8_util_18.rb
resque-1.23.1 lib/resque/vendor/utf8_util/utf8_util_18.rb