Sha256: eb96e54998ad1b8edce17bea80c512d8301663f40e2a5fad57efe39360be0c32

Contents?: true

Size: 1.57 KB

Versions: 1

Compression:

Stored size: 1.57 KB

Contents

# encoding: utf-8

module Okei

  # Normalizes a free-form unit description into a standardized line of text
  # that consists only of:
  #
  # * cyrillic letters in upper case (А-Я; Ё is folded into Е)
  # * digits, with dots kept only between digits
  # * ^ - a marker of a word break
  # * slashes
  # * single spaces
  # * %
  #
  # The constructor accepts anything responding to +to_s+ (+nil+ gives an
  # empty line). No space is inserted between digits and letters:
  #
  #     Line.new "1000шт в условн.эквивал."
  #     # => "1000ШТ В УСЛОВН ЭКВИВАЛ"
  #
  #     Line.new "m2"
  #     # => "М ^ 2"
  #
  class Line < String

    # Characters replaced by an upper-case cyrillic letter of the А-Я range.
    #
    # NOTE(review): several latin pairs (G, N, Q, R, U) are not visual twins
    # of the upper-case glyphs; presumably they mirror lower-case or
    # handwritten shapes. The mapping is kept exactly as it was.
    #
    # "Ё" lies outside the А-Я code point range used by every pattern below,
    # so it is folded into "Е" instead of being wiped out as an unknown symbol.
    LOOKALIKES = {
      "A" => "А", "B" => "В", "C" => "С", "D" => "Д", "E" => "Е",
      "G" => "Д", "H" => "Н", "K" => "К", "M" => "М", "N" => "П",
      "O" => "О", "P" => "Р", "Q" => "Д", "R" => "Г", "T" => "Т",
      "U" => "И", "X" => "Х", "Y" => "У", "Ё" => "Е"
    }.freeze

    # Matches any single key of LOOKALIKES.
    LOOKALIKE_PATTERN = Regexp.union(LOOKALIKES.keys).freeze

    # Leading multiplier prefixes that are split from the rest of the word.
    PREFIX_PATTERN =
      /\A(НАНО|МИКРО|МИЛЛИ|САНТИ|ПОЛ|ДЕКА|ДЕЦИ|ГЕКТО|КИЛО|МЕГА|ТЕРА)/

    private_constant :LOOKALIKES, :LOOKALIKE_PATTERN, :PREFIX_PATTERN

    # Builds the standardized line. The order of the steps matters: letters
    # must be upper-cased and converted to cyrillic before unknown symbols are
    # wiped out, and breaks are added to the already cleaned text.
    def initialize(str = "")
      super unicode_upcase(str.to_s)
      to_cyrillic
      remove_dots_and_commas
      clear_text
      add_breaks
    end

    private

    # Upper-cases the text including non-ASCII letters.
    #
    # ActiveSupport's +mb_chars+ is used when it is loaded (it is Unicode-aware
    # on every Ruby version); otherwise falls back to String#upcase, which
    # handles cyrillic since Ruby 2.4.
    def unicode_upcase(text)
      return text.mb_chars.upcase.to_s if text.respond_to?(:mb_chars)

      text.upcase
    end

    # Converts latin letters to the corresponding cyrillic ones (and Ё to Е)
    # in a single pass over the line.
    def to_cyrillic
      gsub!(LOOKALIKE_PATTERN, LOOKALIKES)
    end

    # Removes dots and commas except for separators between digits, which
    # all become a single dot ("1,5" and "1.5" both give "1.5").
    def remove_dots_and_commas
      # Any run of dots is treated as one comma first.
      gsub!(/\.+/, ",")
      # Lookarounds do not consume the digits, so every separator in a chain
      # like "1.2.3" is recognized, not only every other one.
      gsub!(/(?<=\d),(?=\d)/, ".")
      gsub!(/,/, " ")
    end

    # Removes unwanted symbols.
    def clear_text
      # A suffix glued to the percent sign ("%-НЫЙ") is dropped.
      gsub!(/%-\S+/, "%")
      # Everything except cyrillic letters, digits, dots, whitespace,
      # slashes and % becomes a space.
      gsub!(%r{[^А-Я\d.\s/%]}, " ")
      # The line has to start and end with a letter or a digit; any other
      # leading or trailing run (including % and /) is trimmed.
      gsub!(/\A[^А-Я\d]+|[^А-Я\d]+\z/, " ")
      # Every whitespace run, even a single tab or newline, is one space.
      gsub!(/\s+/, " ")
      strip!
    end

    # Adds ^ to mark word breaks: after a leading multiplier prefix and
    # between a letter and a following standalone 2 or 3 ("М2" => "М ^ 2").
    def add_breaks
      gsub!(PREFIX_PATTERN, '\1 ^ ')
      # (?!\d) keeps multi-digit numbers starting with 2 or 3 untouched.
      gsub!(/([А-Я])[\s\.^]*(2|3)(?!\d)/, '\1 ^ \2')
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
okei-0.0.2 app/models/okei/line.rb