lib/pragmatic_segmenter/languages/chinese.rb in pragmatic_segmenter-0.3.19 vs lib/pragmatic_segmenter/languages/chinese.rb in pragmatic_segmenter-0.3.20
- old
+ new
@@ -6,8 +6,34 @@
include Languages::Common
class AbbreviationReplacer < AbbreviationReplacer
SENTENCE_STARTERS = [].freeze
end
+
+ class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
+ BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/
+ BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/
+ private
+
+ def sub_punctuation_between_quotes_and_parens(txt)
+ super
+ sub_punctuation_between_double_angled_quotation_marks(txt)
+ sub_punctuation_between_l_bracket(txt)
+ end
+
+ def sub_punctuation_between_double_angled_quotation_marks(txt)
+ PunctuationReplacer.new(
+ matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX),
+ text: txt
+ ).replace
+ end
+
+ def sub_punctuation_between_l_bracket(txt)
+ PunctuationReplacer.new(
+ matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX),
+ text: txt
+ ).replace
+ end
+ end
end
end
end