require 'sastrawi/stemmer/context/context'
require 'sastrawi/stemmer/context/visitor/visitor_provider'
require 'sastrawi/stemmer/filter/text_normalizer'

##
# Indonesian Stemmer
# Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS

module Sastrawi
  module Stemmer
    ##
    # Stems text word by word. Hyphenated ("plural") words are handled by
    # +stem_plural_word+; everything else is delegated to a stemming
    # +Context+ built from the supplied dictionary and visitor provider.
    class Stemmer
      # Suffixes that may be attached after the final hyphen of a word
      # (e.g. the "nya" in "buku-buku-nya"). A hyphen that only attaches one
      # of these does not by itself make a word plural.
      TRAILING_SUFFIXES = %w[ku mu nya lah kah tah pun].freeze
      private_constant :TRAILING_SUFFIXES

      attr_reader :dictionary, :visitor_provider

      ##
      # +dictionary+ is stored as-is and handed to every stemming context;
      # this class itself only calls +contains?(word)+ on it.
      def initialize(dictionary)
        @dictionary = dictionary
        @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
      end

      ##
      # Stem a string to its base form
      #
      # The text is normalized first (see TextNormalizer.normalize_text),
      # split into words, each word is stemmed independently, and the
      # stems are joined back with single spaces.
      def stem(text)
        normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)

        normalized_text.split(' ').map { |word| stem_word(word) }.join(' ')
      end

      ##
      # Stem a word to its base form
      #
      # Dispatches to the plural or singular routine depending on +plural?+.
      def stem_word(word)
        if plural?(word)
          stem_plural_word(word)
        else
          stem_singular_word(word)
        end
      end

      ##
      # Returns true when +word+ contains a hyphen that is not merely the
      # hyphen attaching one of the trailing suffixes: if the word ends in
      # "-<suffix>", only the part before that suffix is inspected.
      def plural?(word)
        matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
        return matches[1].include?('-') if matches

        word.include?('-')
      end

      ##
      # Stem a plural word to its base form
      # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
      # page 76-77
      #
      # The word is split at its last hyphen. Both halves are stemmed; when
      # they reduce to the same root that root is returned, otherwise the
      # original +word+ is returned unchanged. A word without any hyphen is
      # returned as-is.
      def stem_plural_word(word)
        first_match = /^(.*)-(.*)$/.match(word)
        return word unless first_match

        words = [first_match[1], first_match[2]]
        suffix = words[1]
        second_match = /^(.*)-(.*)$/.match(words[0])

        # "a-b-nya": the last segment is only a suffix, so the real pair is
        # "a" and "b-nya" rather than "a-b" and "nya".
        if TRAILING_SUFFIXES.include?(suffix) && second_match
          words[0] = second_match[1]
          words[1] = "#{second_match[2]}-#{suffix}"
        end

        root_first_word = stem_singular_word(words[0])
        root_second_word = stem_singular_word(words[1])

        # The second half is not a dictionary word and stemming left it
        # untouched: retry with a "me" prefix prepended. A fresh string is
        # built here instead of appending to the 'me' literal, which would
        # fail when string literals are frozen.
        if !@dictionary.contains?(words[1]) && root_second_word == words[1]
          root_second_word = stem_singular_word("me#{words[1]}")
        end

        root_first_word == root_second_word ? root_first_word : word
      end

      ##
      # Stem a singular word to its base form
      #
      # Builds a Context for +word+ with this stemmer's dictionary and
      # visitor provider, executes it, and returns the context's result.
      def stem_singular_word(word)
        context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
        context.execute

        context.result
      end
    end
  end
end