# encoding: utf-8
# Trims strings so that they fit within a byte budget without ever cutting a
# multi-byte character in half.
#
# NOTE: every method here is defined with `def self.`, and Ruby's bare
# `public` / `private` keywords only affect instance methods. The helper
# methods below are therefore callable from outside the module; they are kept
# that way so existing callers keep working.
module LogStash::Util::UnicodeTrimmer
  # The largest possible UTF-8 encoded character is 4 bytes
  # http://stackoverflow.com/questions/9533258/what-is-the-maximum-number-of-bytes-for-a-utf-8-encoded-character
  # http://tools.ietf.org/html/rfc3629
  MAX_CHAR_BYTES = 4

  # Returns the longest prefix of `orig_str` (cut on a character boundary)
  # whose bytesize does not exceed `desired_bytes`.
  #
  # The cut point is first estimated from the average bytes-per-character of
  # the whole string, then corrected by adding or removing whole characters
  # until the result is as long as the budget allows.
  #
  # orig_str      - the String to trim. It is never modified.
  # desired_bytes - Integer maximum bytesize of the result.
  #
  # Returns `orig_str` itself when it already fits, otherwise a new String.
  # A zero or negative budget yields an empty String.
  def self.trim_bytes(orig_str, desired_bytes)
    return orig_str if orig_str.bytesize <= desired_bytes
    # Nothing can fit in a non-positive budget. slice(0, 0) is used instead of
    # a literal so the empty result keeps the encoding of the input.
    return orig_str.slice(0, 0) if desired_bytes <= 0

    pre_shortened = pre_shorten(orig_str, desired_bytes)

    case pre_shortened.bytesize <=> desired_bytes
    when 0
      pre_shortened
    when 1
      shrink_bytes(pre_shortened, orig_str, desired_bytes)
    when -1
      grow_bytes(pre_shortened, orig_str, desired_bytes)
    end
  end

  # Makes a first guess at the cut point using the average character size.
  #
  # The returned prefix may be slightly over or under `desired_bytes`; the
  # caller is expected to correct it with shrink_bytes / grow_bytes.
  def self.pre_shorten(orig_str, desired_bytes)
    orig_len = orig_str.length
    # An empty string has no average character size (0.0 / 0.0 is NaN).
    return orig_str.slice(0, 0) if orig_len.zero?

    orig_bs = orig_str.bytesize
    avg_size = (orig_bs.to_f / orig_len.to_f)

    # Estimate how many characters account for the surplus bytes.
    orig_extra_bytes = orig_bs - desired_bytes
    pre_shorten_by = (orig_extra_bytes / avg_size).to_i

    # Clamp at zero: String#slice returns nil for a negative length.
    keep_chars = [orig_len - pre_shorten_by, 0].max
    orig_str.slice(0, keep_chars)
  end

  # Extends `pre_shortened` (a prefix of `orig_str`) with further characters
  # from `orig_str` for as long as the result stays within `desired_bytes`.
  #
  # Returns a new String; the arguments are not modified.
  def self.grow_bytes(pre_shortened, orig_str, desired_bytes)
    # dup rather than clone: clone would carry over a frozen state and make
    # the in-place append below raise.
    res_str = pre_shortened.dup

    loop do
      bs = res_str.bytesize
      deficit = desired_bytes - bs
      # deficit / MAX_CHAR_BYTES characters are guaranteed to fit whatever
      # their width; once the deficit is that small, probe one at a time.
      lengthen_by = [deficit / MAX_CHAR_BYTES, 1].max
      append = orig_str.slice(res_str.length, lengthen_by)

      # Source exhausted: nothing left to add, so stop instead of spinning.
      break if append.nil? || append.empty?
      break if (bs + append.bytesize) > desired_bytes

      res_str << append
    end

    res_str
  end

  # Drops characters from the end of `pre_shortened` until its bytesize is at
  # most `desired_bytes`.
  #
  # `orig_str` is not used; it is accepted to keep the signature parallel to
  # grow_bytes. Returns a new String; the arguments are not modified.
  def self.shrink_bytes(pre_shortened, orig_str, desired_bytes)
    res_str = pre_shortened.dup

    loop do
      bs = res_str.bytesize
      # The empty check stops the loop when the budget is negative.
      break if bs <= desired_bytes || res_str.empty?

      extra = bs - desired_bytes
      # Removing extra / MAX_CHAR_BYTES characters can never overshoot, since
      # each one is at most MAX_CHAR_BYTES wide; below that, go one at a time.
      shorten_by = [extra / MAX_CHAR_BYTES, 1].max
      shorten_by = [shorten_by, res_str.length].min

      # Two-argument slice! removes `shorten_by` characters from the tail. The
      # one-argument form would delete a single character at that index.
      res_str.slice!(res_str.length - shorten_by, shorten_by)
    end

    res_str
  end
end