Sha256: d48995e5193a9266b5f175411636d5b0c0a0919cca19590c0495dab30de4413a

Contents?: true

Size: 1.43 KB

Versions: 1

Compression:

Stored size: 1.43 KB

Contents

require 'rjb'

module TwitterKorean
  # Ruby interface to Scala TwitterKoreanProcessor
  #
  # Every text-processing call is forwarded to the JVM-side processor object
  # obtained from TwitterKorean::JvmBridge. List results are read back through
  # their string form and rebuilt as TwitterKorean::KoreanToken objects.
  class Processor
    # Matches one rendered token inside the string form of a Scala List:
    # `text(Pos: offset, length)`, preceded by either `List(` or `, `.
    # Both numeric fields accept any number of digits, so tokens that are ten
    # or more characters long are still recognised as individual tokens.
    TOKEN_PATTERN = /(?<=List\(|,\s).*?\([a-zA-Z]+:\s[0-9]+,\s[0-9]+\)/.freeze

    # NOTE: java_convertor is never assigned anywhere in this class, so it
    # always reads as nil; the reader is kept only for backward compatibility.
    attr_reader :jvm_processor, :java_convertor

    # @param jvmargs [Array] arguments handed unchanged to TwitterKorean::JvmBridge
    def initialize(*jvmargs)
      bridge = TwitterKorean::JvmBridge.new(jvmargs)
      @jvm_processor = bridge.scala_twitter_korean_processor
    end

    # @param text [String, nil] text to normalize
    # @return [String, nil] string form of the processor's result, or nil
    #   when text is nil/false
    def normalize(text)
      return unless text

      jvm_processor.normalize(text).toString
    end

    # @param text [String, nil] text to tokenize
    # @return [Array, nil] one KoreanToken-built object per token, or nil
    #   when text is nil/false
    def tokenize(text)
      return unless text

      convert_to_korean_tokens { jvm_processor.tokenize(text) }
    end

    # Tokenizes the text and passes the tokens through the processor's stemmer.
    #
    # @param text [String, nil] text to stem
    # @return [Array, nil] one KoreanToken-built object per stemmed token, or
    #   nil when text is nil/false
    def stem(text)
      return unless text

      convert_to_korean_tokens do
        jvm_processor.stem(jvm_processor.tokenize(text))
      end
    end

    # Tokenizes the text and asks the processor to extract phrases from it.
    #
    # @param text [String, nil] text to extract phrases from
    # @param options [Hash]
    # @option options [Boolean] :filter_spam (false)
    # @option options [Boolean] :including_hashtags (true)
    # @return [Array, nil] one KoreanToken-built object per phrase, or nil
    #   when text is nil/false
    def extract_phrases(text, options = {})
      return unless text

      filter_spam = options[:filter_spam] || false
      # `value || true` can never yield false, so an explicit `false` from the
      # caller has to be detected directly; a missing or nil value keeps the
      # default of true.
      including_hashtags = options[:including_hashtags] != false

      convert_to_korean_tokens do
        jvm_processor.extractPhrases(jvm_processor.tokenize(text), filter_spam, including_hashtags)
      end
    end

    private

    # Calls the block, renders its result with toString and builds one
    # KoreanToken-built object per token found in that string.
    def convert_to_korean_tokens
      scala_list_to_array(yield.toString).map do |formed_token_str|
        TwitterKorean::KoreanToken.build_by_formed_str(formed_token_str)
      end
    end

    # @param result [String] string form of a Scala List of tokens
    # @return [Array<String>] each `text(Pos: offset, length)` entry, in order
    def scala_list_to_array(result)
      result.scan(TOKEN_PATTERN)
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
twitter-korean-text-ruby-0.9.1 lib/twitter_korean/processor.rb