Sha256: a6bfe5307b5bec6933026ac69e7c6257c037df2454c72a1c016c7d3aedbe4af0
Contents?: true
Size: 1.34 KB
Versions: 1
Compression:
Stored size: 1.34 KB
Contents
# @name twkorean-ruby # @author JunSangPil # @version 0.0.2 # @url https://github.com/jun85664396/twkorean-ruby # @license Apache License 2.0 module Twkorean class TwitterKoreanText attr_accessor :korean_processor def initialize(normalization = true, stemming = true) jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(':') Rjb::load(jars, ['-Xmx512M']) korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder').new unless normalization korean_processor.disableNormalizer end unless stemming korean_processor.disableStemmer end self.korean_processor = korean_processor.build end def normalize(text) self.korean_processor.normalize(text).toString end def tokenize(text) tokens = self.korean_processor.tokenize(text) return [] unless tokens tokens.toArray.map{|x| x.toString} end def tokenize_to_strings(text) tokens = self.korean_processor.tokenizeToStrings(text) return [] unless tokens tokens.toArray.map{|x| x.toString} end def extract_phrases(text) phrases = self.korean_processor.extractPhrases(text) return [] unless phrases phrases.toArray.map{|x| x.toString} end end end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
twkorean-0.0.2 | lib/twkorean/twitter_korean_text.rb |