Sha256: 6e7cc7fb4ddc5ba07e53dae1c5b557388ef790d97f5ee32c8fd8a41b6a8a0c6b

Contents?: true

Size: 1.54 KB

Versions: 3

Compression:

Stored size: 1.54 KB

Contents

module PragmaticSegmenter
  module Languages
    class Persian
      # Persian variant of the segmenter process. It routes sentence splitting
      # and abbreviation handling to the Persian-specific classes defined in
      # this namespace.
      class Process < PragmaticSegmenter::Process
        private

        # Builds a Persian SentenceBoundaryPunctuation for +txt+ and returns
        # the result of its #split.
        def sentence_boundary_punctuation(txt)
          splitter = PragmaticSegmenter::Languages::Persian::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Builds a Persian AbbreviationReplacer for +txt+ and returns the
        # result of its #replace.
        def replace_abbreviations(txt)
          replacer = PragmaticSegmenter::Languages::Persian::AbbreviationReplacer.new(text: txt)
          replacer.replace
        end
      end

      # Sentence splitting for Persian text.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # A segment is the shortest run ending in ':', '.', '!', '?' or the
        # Arabic question mark '؟'; otherwise whatever remains up to the end
        # of the string or the end of the line.
        SENTENCE_BOUNDARY = /.*?[:\.!\?؟]|.*?\z|.*?$/

        # Targets a colon sitting directly between two digits; paired with
        # the placeholder '♭'.
        ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
        # Targets an Arabic comma that is followed by whitespace and a token
        # ending in another Arabic comma; paired with the placeholder '♬'.
        ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

        # Masks non-boundary punctuation in the text, then returns every
        # SENTENCE_BOUNDARY match as an array of strings.
        def split
          replace_non_sentence_boundary_punctuation(text).scan(SENTENCE_BOUNDARY)
        end

        private

        # Applies the colon rule first and the comma rule second, returning
        # the result of the second application.
        # NOTE(review): String#apply is defined outside this file; presumably
        # it substitutes the rule's pattern with its replacement - confirm.
        def replace_non_sentence_boundary_punctuation(txt)
          colon_masked = txt.apply(ReplaceColonBetweenNumbersRule)
          colon_masked.apply(ReplaceNonSentenceBoundaryCommaRule)
        end
      end

      # Punctuation marks used for Persian text.
      class Punctuation < PragmaticSegmenter::Punctuation
        # Latin '?', '!', ':' and '.', plus the Arabic question mark.
        PUNCT = %w[? ! : . ؟]

        # Returns the PUNCT list.
        def punct
          PUNCT
        end
      end

      # Persian abbreviation handling: the period that directly follows an
      # abbreviation is swapped for the placeholder character '∯'.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Replaces the period after the matched abbreviation +am+ in +txt+.
        # +index+, +character_array+ and +abbr+ are accepted but not used here.
        # NOTE(review): presumably this overrides a hook invoked by the parent
        # class with this argument list - confirm against the parent.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Returns a copy of +txt+ in which every '.' immediately preceded by
        # +abbr+ is replaced with '∯'.
        #
        # +abbr+ is escaped before being interpolated so that it is matched
        # literally: without escaping, a metacharacter inside the abbreviation
        # (for example '.') would match unintended text, and an unbalanced
        # '(' or '[' would raise RegexpError.
        def replace_abbr(txt, abbr)
          # to_s keeps non-String values interpolating as they did before.
          literal_abbr = Regexp.escape(abbr.to_s)
          txt.gsub(/(?<=#{literal_abbr})\./, '∯')
        end
      end
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
pragmatic_segmenter-0.0.3 lib/pragmatic_segmenter/languages/persian.rb
pragmatic_segmenter-0.0.2 lib/pragmatic_segmenter/languages/persian.rb
pragmatic_segmenter-0.0.1 lib/pragmatic_segmenter/languages/persian.rb