Sha256: e1b5abd018684289ad7fd014895eff83a6fcb2e613c3d7c28b531a194118a0e6
Contents?: true
Size: 1.08 KB
Versions: 2
Compression:
Stored size: 1.08 KB
Contents
module PragmaticSegmenter
  module Languages
    # Arabic-specific data and overrides: punctuation marks, a sentence
    # boundary pattern, an abbreviation list, two text rules and a customised
    # AbbreviationReplacer. Shared behaviour is mixed in from Languages::Common
    # (defined outside this file).
    module Arabic
      include Languages::Common

      # Punctuation marks for Arabic text, including the Arabic question mark
      # (؟) and the Arabic comma (،). Frozen so the shared list cannot be
      # mutated by accident.
      Punctuations = ['?', '!', ':', '.', '؟', '،'].freeze

      # Lazily matches up to the first of : . ! ? ؟ ، and otherwise up to the
      # end of the string or the end of the current line.
      SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/

      module Abbreviation
        # Known Arabic abbreviations. The literal repeats some entries; `uniq`
        # drops exact repeats (keeping first-occurrence order) so each distinct
        # abbreviation appears only once for code that iterates the list.
        ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د'].uniq.freeze

        # Intentionally empty for Arabic.
        PREPOSITIVE_ABBREVIATIONS = [].freeze
        NUMBER_ABBREVIATIONS = [].freeze
      end

      # Targets a colon that sits directly between two digits (e.g. "10:30");
      # '♭' is passed as the second Rule argument (presumably the placeholder
      # substituted for the colon — confirm against Rule).
      # Rubular: http://rubular.com/r/RX5HpdDIyv
      ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')

      # Targets an Arabic comma that is followed by one whitespace character,
      # a run of non-whitespace characters and another Arabic comma; '♬' is
      # passed as the second Rule argument (presumably the placeholder).
      # Rubular: http://rubular.com/r/kPRgApNHUg
      ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

      # Arabic variant of the abbreviation replacer; the parent class is
      # defined elsewhere in the library.
      class AbbreviationReplacer < AbbreviationReplacer
        # No sentence starters are defined for Arabic.
        SENTENCE_STARTERS = [].freeze

        private

        # Replaces, in place, every period that directly follows +am+ with the
        # placeholder '∯' and returns the same string.
        #
        # +index+ and +character_array+ are accepted but unused here
        # (presumably kept to match the parent's method signature).
        #
        # NOTE(review): +am+ is interpolated unescaped, so a '.' inside it acts
        # as a regex wildcard. That wildcard also matches a '∯' written by an
        # earlier call, so adding Regexp.escape could change which periods get
        # protected — verify against the parent class before changing it.
        def scan_for_replacements(txt, am, index, character_array)
          txt.gsub!(/(?<=#{am})\./, '∯')
          txt
        end
      end
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
pragmatic_segmenter-0.3.7 | lib/pragmatic_segmenter/languages/arabic.rb |
pragmatic_segmenter-0.3.6 | lib/pragmatic_segmenter/languages/arabic.rb |