Sha256: 3f4a283ee9256921d42fdfb412acc73285b88511f8317585d3a2fdf73e03579d

Contents?: true

Size: 1.11 KB

Versions: 9

Compression:

Stored size: 1.11 KB

Contents

module PragmaticSegmenter
  module Languages
    # Arabic-specific punctuation, abbreviation list and replacement rules.
    module Arabic
      include Languages::Common

      # Punctuation characters for Arabic text, including the Arabic
      # question mark (؟) and the Arabic comma (،).
      Punctuations = ['?', '!', ':', '.', '؟', '،'].freeze

      # Matches the shortest run of text that ends in one of : . ! ? ؟ ،
      # or, failing that, the shortest run up to the end of the string/line.
      SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/

      # Abbreviation data for Arabic.
      module Abbreviation
        # Known abbreviations. Stored in a Set, so repeated entries collapse
        # into one.
        ABBREVIATIONS = Set.new(
          [
            'ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب',
            'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب',
            'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د'
          ]
        ).freeze
        # No prepositive or number abbreviations are defined for Arabic.
        PREPOSITIVE_ABBREVIATIONS = [].freeze
        NUMBER_ABBREVIATIONS = [].freeze
      end

      # Pairs a pattern matching a colon that sits directly between two
      # digits (e.g. 10:30) with the placeholder '♭'.
      # Rubular: http://rubular.com/r/RX5HpdDIyv
      ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')

      # Pairs a pattern matching an Arabic comma that is followed by one
      # whitespace character and a run of non-whitespace characters ending
      # in another Arabic comma with the placeholder '♬'.
      # Rubular: http://rubular.com/r/kPRgApNHUg
      ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

      # Arabic variant of the abbreviation replacer.
      # NOTE(review): the superclass name is resolved lexically, so it is
      # presumably the shared AbbreviationReplacer defined outside this
      # module - confirm.
      class AbbreviationReplacer < AbbreviationReplacer
        # No sentence-starter words are defined for Arabic.
        SENTENCE_STARTERS = [].freeze

        private

        # Replaces every period that directly follows +am+ with the
        # placeholder '∯', mutating +txt+ in place.
        #
        # +am+ is escaped before interpolation so that the dots inside an
        # abbreviation such as 'ا.د' match only a literal '.', not any
        # character.
        #
        # @param txt [String] text to modify (mutated in place)
        # @param am [String] text that must immediately precede the period;
        #   presumably the abbreviation match supplied by the parent class -
        #   TODO confirm
        # @param index unused here
        # @param character_array unused here
        # @return [String] +txt+ after replacement
        def scan_for_replacements(txt, am, index, character_array)
          txt.gsub!(/(?<=#{Regexp.escape(am)})\./, '∯')
          txt
        end
      end
    end
  end
end

Version data entries

9 entries across 9 versions & 1 rubygems

Version Path
pragmatic_segmenter-0.3.17 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.16 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.15 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.14 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.13 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.12 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.10 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.9 lib/pragmatic_segmenter/languages/arabic.rb
pragmatic_segmenter-0.3.8 lib/pragmatic_segmenter/languages/arabic.rb