lib/sastrawi/stemmer/stemmer.rb in sastrawi-0.1.0 vs lib/sastrawi/stemmer/stemmer.rb in sastrawi-0.1.1
- old
+ new
@@ -2,20 +2,27 @@
require 'sastrawi/stemmer/context/visitor/visitor_provider'
require 'sastrawi/stemmer/filter/text_normalizer'
+##
+ # Indonesian stemmer.
+ # Algorithms referenced: Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS.
+
module Sastrawi
module Stemmer
class Stemmer
attr_reader :dictionary, :visitor_provider
##
# Create a stemmer around the given +dictionary+.
# Both the dictionary and a newly built VisitorProvider are kept on the
# instance.
def initialize(dictionary)
  @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
  @dictionary = dictionary
end
+ ##
+ # Stem a string to its base form
+
def stem(text)
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
words = normalized_text.split(' ')
stems = []
@@ -25,10 +32,13 @@
end
stems.join(' ')
end
+ ##
+ # Stem a word to its base form
+
def stem_word(word)
if plural?(word)
stem_plural_word(word)
else
stem_singular_word(word)
@@ -41,10 +51,15 @@
return matches[1].include?('-') if matches
return word.include?('-')
end
+ ##
+ # Stem a plural word to its base form
+ # See Asian, J. (2007), "Effective Techniques for Indonesian Text Retrieval",
+ # pp. 76-77.
+
def stem_plural_word(word)
first_match = /^(.*)-(.*)$/.match(word)
return word unless first_match
@@ -69,9 +84,12 @@
root_first_word
else
word
end
end
+
+ ##
+ # Stem a singular word to its base form
def stem_singular_word(word)
context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
context.execute