Sha256: 91c4e1b6ed769ea1cf10f3a0d44b8cc419d89e3770b28e2902b8098e0c85f87d

Contents?: true

Size: 1.58 KB

Versions: 2

Compression:

Stored size: 1.58 KB

Contents

# frozen_string_literal: true

module PragmaticSegmenter
  module Languages
    module Japanese
      include Languages::Common

      class Cleaner < PragmaticSegmenter::Cleaner
        # Rubular: http://rubular.com/r/N4kPuJgle7
        NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')

        def clean
          super
          remove_newline_in_middle_of_word
        end

        private

        def remove_newline_in_middle_of_word
          Rule.apply @text, NewLineInMiddleOfWordRule
        end
      end

      class AbbreviationReplacer < AbbreviationReplacer
        SENTENCE_STARTERS = [].freeze
      end

      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
        # Rubular: http://rubular.com/r/GnjOmry5Z2
        BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/

        # Rubular: http://rubular.com/r/EjHcZn5ZSG
        BETWEEN_PARENS_JA_REGEX = /\u{ff08}(?>[^\u{ff08}\u{ff09}\\]+|\\{2}|\\.)*\u{ff09}/
        private

        def sub_punctuation_between_quotes_and_parens(txt)
          super
          sub_punctuation_between_parens_ja(txt)
          sub_punctuation_between_quotes_ja(txt)
        end

        def sub_punctuation_between_quotes_ja(txt)
          PunctuationReplacer.new(
            matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
            text: txt
          ).replace
        end

        def sub_punctuation_between_parens_ja(txt)
          PunctuationReplacer.new(
            matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
            text: txt
          ).replace
        end
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
pragmatic_segmenter-0.3.24 lib/pragmatic_segmenter/languages/japanese.rb
pragmatic_segmenter-0.3.23 lib/pragmatic_segmenter/languages/japanese.rb