Sha256: 5b7c3b09dca0c9ebf37e02d3dc596ce767c63d9fbc458edc43c77b164cbdb393

Contents?: true

Size: 1.14 KB

Versions: 1

Compression:

Stored size: 1.14 KB

Contents

module Twkorean
  # Thin Ruby wrapper around the Java class
  # com.twitter.penguin.korean.TwitterKoreanProcessorJava, accessed through Rjb.
  #
  # The Rjb constant must already be loaded (this file does not require it).
  # Every jar found in the sibling "jars" directory is placed on the JVM
  # classpath when an instance is created.
  class TwitterKoreanText
    # Absolute glob matching the bundled jars, resolved relative to this file
    # so the classpath stays valid regardless of the current working directory.
    JAR_GLOB = File.expand_path('../jars/*.jar', File.dirname(__FILE__)).freeze

    # Options handed to the JVM on start-up (maximum heap of 512 MB).
    JVM_OPTIONS = ['-Xmx512M'].freeze

    # Fully-qualified name of the Java builder used to configure the processor.
    BUILDER_CLASS = 'com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder'.freeze

    # The built Java processor proxy that every public method delegates to.
    attr_accessor :korean_processor

    # Loads the JVM with the bundled jars and builds the Java processor.
    #
    # @param normalization [Boolean] when falsy, disableNormalizer is called on the builder
    # @param stemming [Boolean] when falsy, disableStemmer is called on the builder
    def initialize(normalization = true, stemming = true)
      # File::PATH_SEPARATOR is ':' on Unix and ';' on Windows, matching what
      # the JVM expects for classpath entries on each platform.
      classpath = Dir.glob(JAR_GLOB).sort.join(File::PATH_SEPARATOR)
      Rjb.load(classpath, JVM_OPTIONS)

      builder = Rjb.import(BUILDER_CLASS).new
      builder.disableNormalizer unless normalization
      builder.disableStemmer unless stemming
      self.korean_processor = builder.build
    end

    # Runs the processor's normalize on +text+.
    #
    # @param text [String] input text
    # @return [String] toString of the Java result
    def normalize(text)
      korean_processor.normalize(text).toString
    end

    # Runs the processor's tokenize on +text+.
    #
    # @param text [String] input text
    # @return [Array<String>] toString of each returned token; [] when the
    #   processor returns nil
    def tokenize(text)
      strings_from(korean_processor.tokenize(text))
    end

    # Runs the processor's tokenizeToStrings on +text+.
    #
    # @param text [String] input text
    # @return [Array<String>] toString of each returned element; [] when the
    #   processor returns nil
    def tokenize_to_strings(text)
      strings_from(korean_processor.tokenizeToStrings(text))
    end

    # Runs the processor's extractPhrases on +text+.
    #
    # @param text [String] input text
    # @return [Array<String>] toString of each returned phrase; [] when the
    #   processor returns nil
    def extract_phrases(text)
      strings_from(korean_processor.extractPhrases(text))
    end

    private

    # Converts a Java collection proxy (anything answering toArray) into a
    # Ruby array of strings, treating nil as an empty result.
    def strings_from(java_collection)
      return [] unless java_collection

      java_collection.toArray.map(&:toString)
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
twkorean-0.0.1 lib/twkorean/twitter_korean_text.rb