Sha256: 4f9c3313ce5e912d04d61367b9bde6277e31337e7783b4af36ab792f4af47d38
Contents?: true
Size: 1.4 KB
Versions: 1
Compression:
Stored size: 1.4 KB
Contents
# @name       twkorean-ruby
# @author     JunSangPil
# @version    0.0.4
# @url        https://github.com/jun85664396/twkorean-ruby
# @license    Apache License 2.0

module Twkorean
  # Thin Ruby wrapper around the Java class
  # com.twitter.penguin.korean.TwitterKoreanProcessorJava, reached through Rjb.
  #
  # Rjb itself is not required in this file; it is expected to have been
  # loaded by the caller before an instance is created.
  class TwitterKoreanText
    # Matches the string form "text(Pos: offset, length)".
    # The /m flag lets the leading text group span newlines, so a token such
    # as "\n(Space: 3, 1)" keeps its text instead of yielding an empty string.
    TOKEN_PATTERN = /(.*)\(([a-zA-Z]*): ([0-9]+), ([0-9]+)\)/m

    # The imported Java processor class (an Rjb proxy object).
    attr_accessor :korean_processor

    # Loads the JVM through Rjb with every jar under ../jars on the classpath
    # and imports the Java processor class.
    #
    # @param normalization [Boolean] accepted for interface compatibility;
    #   not used by this implementation.
    # @param stemming [Boolean] accepted for interface compatibility;
    #   not used by this implementation.
    def initialize(normalization = true, stemming = true)
      jar_glob = File.join(File.dirname(__FILE__), '..', 'jars', '*.jar')
      # Java classpath entries are separated by the platform path separator
      # (':' on Unix, ';' on Windows), not always ':'.
      classpath = Dir.glob(jar_glob).join(File::PATH_SEPARATOR)
      Rjb::load(classpath, ['-Xmx512M'])
      self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
    end

    # Passes +text+ to the Java processor's normalize and returns the result
    # of calling toString on what comes back.
    #
    # @param text [String]
    # @return the value of toString on the Java result
    def normalize(text)
      korean_processor.normalize(text).toString
    end

    # Passes +text+ to the Java processor's tokenize.
    #
    # @param text [String]
    # @return the Java token collection, unmodified, suitable for passing to
    #   the other methods of this class
    def tokenize(text)
      korean_processor.tokenize(text)
    end

    # Converts a Java token collection into a Ruby array with one toString
    # result per element of tokensToJavaStringList.
    #
    # @param tokens the value returned by #tokenize or #stem
    # @return [Array]
    def tokens_to_string_list(tokens)
      java_list = korean_processor.tokensToJavaStringList(tokens)
      java_list.toArray.map { |item| item.toString }
    end

    # Converts a Java token collection into a Ruby array in which each
    # element is the #parser result for one token's string form.
    #
    # @param tokens the value returned by #tokenize or #stem
    # @return [Array<Array<String>>]
    def tokens_to_token_list(tokens)
      java_list = korean_processor.tokensToJavaKoreanTokenList(tokens)
      java_list.toArray.map { |item| parser(item.toString) }
    end

    # Passes +tokens+ to the Java processor's stem.
    #
    # @param tokens the value returned by #tokenize
    # @return the Java result, unmodified
    def stem(tokens)
      korean_processor.stem(tokens)
    end

    # Passes +tokens+ to the Java processor's extractPhrases and returns one
    # toString result per extracted phrase.
    #
    # @param tokens the value returned by #tokenize
    # @param filter_spam [Boolean] second argument forwarded to
    #   extractPhrases (defaults to true, the previously hard-coded value)
    # @param include_hashtags [Boolean] third argument forwarded to
    #   extractPhrases (defaults to true, the previously hard-coded value)
    # @return [Array]
    def extract_phrases(tokens, filter_spam = true, include_hashtags = true)
      phrases = korean_processor.extractPhrases(tokens, filter_spam, include_hashtags)
      phrases.toArray.map { |phrase| phrase.toString }
    end

    # Splits a token string of the form "text(Pos: offset, length)".
    #
    # @param text [String]
    # @return [Array<String>] [whole_match, text, pos, offset, length],
    #   or an empty array when +text+ does not match the pattern
    def parser(text)
      # nil.to_a is [], so a non-matching string yields an empty array.
      text.match(TOKEN_PATTERN).to_a
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
twkorean-0.0.4 | lib/twkorean/twitter_korean_text.rb |