Sha256: a23bcb3375805af75ea6bfb2c99155274d667e22f36d8187ad33e47a6078b68c

Contents?: true

Size: 1.48 KB

Versions: 1

Compression:

Stored size: 1.48 KB

Contents

require "arabic_stemmer/version"

# Light stemmer for Arabic words: normalises the input, then strips at most
# one known prefix and at most one known two-letter suffix.
module ArabicStemmer
  # Words that are returned untouched (after normalisation).
  KEY_WORDS = ["الله"].freeze

  # Punctuation (Latin and Arabic) removed from the word.
  PUNCTUATION = /[._,،\"\':;&?؟()]/

  # Diacritical marks U+064B..U+0652 (ً ٌ ٍ َ ُ ِ ّ ْ).
  DIACRITICS = /[\u064b-\u0652]/

  # Alef variants آ أ إ that are folded into a bare ا.
  ALEF_VARIANTS = /[\u0622\u0623\u0625]/

  # Two-letter prefixes are tried before three-letter ones.
  PREFIXES_2 = ["ال", "سي", "ست", "لل"].freeze
  PREFIXES_3 = ["وال", "كال", "بال"].freeze

  # Two-letter suffixes: ات ان ون كم ين هم هن
  SUFFIXES = ["هم", "ين", "ون", "ان", "كم", "ات", "هن"].freeze

  # Words of this many characters or fewer are never stemmed further.
  SHORT_WORD_MAX = 4

  private_constant :KEY_WORDS, :PUNCTUATION, :DIACRITICS, :ALEF_VARIANTS,
                   :PREFIXES_2, :PREFIXES_3, :SUFFIXES, :SHORT_WORD_MAX

  # Normalises +word+ and reduces it to a stem.
  #
  # @param word [String] the word to stem; it is not modified
  # @return [String] the normalised, stemmed word
  def self.to_arabic_stem(word)
    # 1. Remove surrounding whitespace and punctuation characters.
    word = word.strip.gsub(PUNCTUATION, '')

    # 2. Remove diacritical marks.
    word = word.gsub(DIACRITICS, '')

    # 3. Convert أآإ to ا
    word = word.gsub(ALEF_VARIANTS, 'ا')

    # 4. Check against words that should not be stemmed.
    return word if KEY_WORDS.include?(word) || is_word_short(word)

    # 5. Remove prefixes; stop if what is left is already short.
    word = remove_prefix(word)
    return word if is_word_short(word)

    # 6. Remove suffixes.
    remove_suffix(word)
  end

  # Strips one trailing two-letter suffix (ات ان ون كم ين هم هن), if present.
  #
  # Only the end of the word is touched: an earlier occurrence of the same
  # two letters is left alone, and the argument itself is never mutated.
  #
  # @param word [String]
  # @return [String] +word+ without the suffix, or +word+ itself if none matched
  def self.remove_suffix(word)
    word.end_with?(*SUFFIXES) ? word[0...-2] : word
  end

  # Strips one leading prefix: ال سي ست لل (al, saya, sata, lil) or
  # وال كال بال (wal, kal, bal). Two-letter prefixes are checked first.
  #
  # This method performs no length check; the caller decides whether the
  # word is long enough to be stemmed.
  #
  # @param word [String]
  # @return [String] +word+ without the prefix, or +word+ itself if none matched
  def self.remove_prefix(word)
    return word[2..-1] if word.start_with?(*PREFIXES_2)
    return word[3..-1] if word.start_with?(*PREFIXES_3)

    word
  end

  # @param word [String]
  # @return [Boolean] true when +word+ has four characters or fewer
  def self.is_word_short(word)
    word.size <= SHORT_WORD_MAX
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
arabic_stemmer-0.0.2 lib/arabic_stemmer.rb