Sha256: 5b7c3b09dca0c9ebf37e02d3dc596ce767c63d9fbc458edc43c77b164cbdb393
Contents?: true
Size: 1.14 KB
Versions: 1
Compression:
Stored size: 1.14 KB
Contents
module Twkorean
  # Ruby wrapper around the Java class
  # com.twitter.penguin.korean.TwitterKoreanProcessorJava, reached through Rjb.
  #
  # Every public text method forwards to the Java processor built in
  # #initialize and converts the Java result into plain Ruby values by calling
  # +toString+ (and +toArray+ for collections).
  class TwitterKoreanText
    # The built Java processor object that all text methods delegate to.
    attr_accessor :korean_processor

    # Loads the bundled jars into the JVM and builds the Java processor.
    #
    # normalization - when false, +disableNormalizer+ is called on the builder.
    # stemming      - when false, +disableStemmer+ is called on the builder.
    def initialize(normalization = true, stemming = true)
      jar_pattern = File.join(File.dirname(__FILE__), '..', 'jars', '*.jar')
      # The JVM splits the classpath on the platform separator (':' on Unix,
      # ';' on Windows), so a hard-coded ':' breaks jar loading on Windows.
      classpath = Dir.glob(jar_pattern).join(File::PATH_SEPARATOR)
      Rjb::load(classpath, ['-Xmx512M'])

      builder = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder').new
      builder.disableNormalizer unless normalization
      builder.disableStemmer unless stemming
      self.korean_processor = builder.build
    end

    # Returns the +toString+ of the Java processor's +normalize(text)+ result.
    def normalize(text)
      korean_processor.normalize(text).toString
    end

    # Returns an Array with the +toString+ of each element produced by the
    # Java processor's +tokenize(text)+, or [] when that call returns nil.
    def tokenize(text)
      java_strings(korean_processor.tokenize(text))
    end

    # Returns an Array with the +toString+ of each element produced by the
    # Java processor's +tokenizeToStrings(text)+, or [] when it returns nil.
    def tokenize_to_strings(text)
      java_strings(korean_processor.tokenizeToStrings(text))
    end

    # Returns an Array with the +toString+ of each element produced by the
    # Java processor's +extractPhrases(text)+, or [] when it returns nil.
    def extract_phrases(text)
      java_strings(korean_processor.extractPhrases(text))
    end

    private

    # Converts a Java collection into an Array of its elements' +toString+
    # values; a nil/false collection yields an empty Array.
    def java_strings(collection)
      return [] unless collection
      collection.toArray.map { |element| element.toString }
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
twkorean-0.0.1 | lib/twkorean/twitter_korean_text.rb |