Sha256: 8a344855ea03d00e1929211ec78828d2c92775ca59b5ebe270085faa927fb753

Contents?: true

Size: 1.14 KB

Versions: 5

Compression:

Stored size: 1.14 KB

Contents

# frozen_string_literal: true

module PragmaticSegmenter
  module Languages
    module Chinese
      include Languages::Common

      # Chinese variant of the abbreviation replacer. The only thing it
      # changes is SENTENCE_STARTERS, which it overrides with an empty list.
      # NOTE(review): the superclass is referenced by its bare name, so it is
      # resolved from an enclosing namespace defined outside this file.
      class AbbreviationReplacer < AbbreviationReplacer
        # No sentence-starter words are declared for Chinese.
        SENTENCE_STARTERS = %w[].freeze
      end

      # Chinese variant of BetweenPunctuation. After the inherited
      # quote/parenthesis pass it also hands every 《…》 and 「…」 span found
      # in the text to PunctuationReplacer.
      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
        # A 《…》 span. Inside the span a backslash always consumes the
        # character after it, so an escaped 》 does not terminate the match.
        BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/

        # A 「…」 span, with the same backslash-escape handling as above.
        BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/

        private

        # Runs the inherited pass first, then the two Chinese-specific passes,
        # in that order. The result of the last pass is the return value.
        def sub_punctuation_between_quotes_and_parens(txt)
          super
          sub_punctuation_between_double_angled_quotation_marks(txt)
          sub_punctuation_between_l_bracket(txt)
        end

        # Applies PunctuationReplacer to every 《…》 span in +txt+.
        def sub_punctuation_between_double_angled_quotation_marks(txt)
          replace_within_spans(txt, BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX)
        end

        # Applies PunctuationReplacer to every 「…」 span in +txt+.
        def sub_punctuation_between_l_bracket(txt)
          replace_within_spans(txt, BETWEEN_L_BRACKET_REGEX)
        end

        # Collects all substrings of +txt+ matching +span_regex+ and passes
        # them, together with +txt+, to PunctuationReplacer#replace.
        def replace_within_spans(txt, span_regex)
          spans = txt.scan(span_regex)
          PunctuationReplacer.new(matches_array: spans, text: txt).replace
        end
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygem

Version Path
pragmatic_segmenter-0.3.24 lib/pragmatic_segmenter/languages/chinese.rb
pragmatic_segmenter-0.3.23 lib/pragmatic_segmenter/languages/chinese.rb
pragmatic_segmenter-0.3.22 lib/pragmatic_segmenter/languages/chinese.rb
pragmatic_segmenter-0.3.21 lib/pragmatic_segmenter/languages/chinese.rb
pragmatic_segmenter-0.3.20 lib/pragmatic_segmenter/languages/chinese.rb