Sha256: 6e7cc7fb4ddc5ba07e53dae1c5b557388ef790d97f5ee32c8fd8a41b6a8a0c6b
Contents?: true
Size: 1.54 KB
Versions: 3
Compression:
Stored size: 1.54 KB
Contents
module PragmaticSegmenter
  module Languages
    # Persian-specific overrides for the segmenter: a processing subclass,
    # sentence-boundary splitting, the punctuation list and abbreviation
    # handling.
    class Persian
      # Routes the sentence-boundary and abbreviation steps to the Persian
      # implementations defined below.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ using the Persian SentenceBoundaryPunctuation class.
        def sentence_boundary_punctuation(txt)
          PragmaticSegmenter::Languages::Persian::SentenceBoundaryPunctuation.new(text: txt).split
        end

        # Runs the Persian AbbreviationReplacer over +txt+.
        def replace_abbreviations(txt)
          PragmaticSegmenter::Languages::Persian::AbbreviationReplacer.new(text: txt).replace
        end
      end

      # Splits text into candidate sentences on Persian boundary punctuation.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # A segment is the shortest run ending in one of : . ! ? ؟,
        # otherwise whatever remains up to end of string / end of line.
        SENTENCE_BOUNDARY = /.*?[:\.!\?؟]|.*?\z|.*?$/

        # A colon with a digit on both sides is swapped for the placeholder
        # '♭' before splitting, so it is not matched by SENTENCE_BOUNDARY.
        ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')

        # A Persian comma that is followed by whitespace, a run of
        # non-whitespace characters and another Persian comma is swapped
        # for the placeholder '♬'.
        ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

        # Applies the placeholder rules to +text+ and returns every match
        # of SENTENCE_BOUNDARY as an array of strings.
        def split
          txt = replace_non_sentence_boundary_punctuation(text)
          txt.scan(SENTENCE_BOUNDARY)
        end

        private

        # Applies both placeholder rules, colon rule first.
        def replace_non_sentence_boundary_punctuation(txt)
          txt.apply(ReplaceColonBetweenNumbersRule).
            apply(ReplaceNonSentenceBoundaryCommaRule)
        end
      end

      # Punctuation marks exposed for Persian text.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = ['?', '!', ':', '.', '؟']

        # Returns the PUNCT list.
        def punct
          PUNCT
        end
      end

      # Replaces the period that follows an abbreviation with a placeholder.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Only +txt+ and the matched abbreviation text +am+ are used here;
        # +index+, +character_array+ and +abbr+ are accepted but ignored.
        # NOTE(review): the parameter list presumably mirrors the parent
        # class hook -- confirm against PragmaticSegmenter::AbbreviationReplacer.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Replaces every '.' that directly follows +abbr+ with '∯'.
        #
        # +abbr+ is escaped before interpolation so that it is matched
        # literally: regex metacharacters inside the abbreviation text must
        # not change what the look-behind matches or make the pattern
        # invalid.
        def replace_abbr(txt, abbr)
          txt.gsub(/(?<=#{Regexp.escape(abbr)})\./, '∯')
        end
      end
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems