Sha256: efe6c19d58ecbea5423e49e3768221232bfd0beb913a74b8ca1487545df6dfd5

Contents?: true

Size: 1.57 KB

Versions: 2

Compression:

Stored size: 1.57 KB

Contents

#!/usr/bin/env ruby
# coding: utf-8
require 'rjb'
# Reopens Rjb's Java proxy class, so the method below becomes available on
# every object of this proxy class, not only on tokenizer results.
class Rjb::Rjb_JavaProxy
  # Collects several accessors of the wrapped Java object into a plain Ruby
  # Hash with String keys.
  #
  # NOTE(review): presumably only meaningful when the proxy wraps a Kuromoji
  # token; the accessors called here are resolved on the Java side.
  #
  # @return [Hash] the accessor results keyed by 'parts_of_speech',
  #   'reading', 'base_form', 'surface_form', 'position', 'is_known' and
  #   'is_user'
  def to_kuromoji_hash
    {
      # The key is plural although the accessor name is singular.
      'parts_of_speech' => part_of_speech,
      'reading' => getReading,
      'base_form' => base_form,
      'surface_form' => surface_form,
      'position' => position,
      'is_known' => isKnown,
      'is_user' => isUser
    }
  end
end

module Kuromoji
  # Ruby front end for the Kuromoji Java tokenizer, accessed through Rjb.
  class Core
    # Loads the Kuromoji jar through Rjb and builds the tokenizer used by
    # every other method of this class.
    #
    # @param user_dictionary [Object, nil] when not nil, forwarded to the
    #   Java builder's +userDictionary+ call before building
    #   (presumably a dictionary path -- confirm against the Kuromoji API)
    def initialize(user_dictionary = nil)
      jar = File.expand_path('../../../vendor/kuromoji-0.7.7/lib/kuromoji-0.7.7.jar', __FILE__)
      Rjb.load(jar)
      builder = Rjb.import('org.atilika.kuromoji.Tokenizer').builder
      # Keep the builder call's return value so the call chain matches
      # builder.userDictionary(...).build exactly.
      builder = builder.userDictionary(user_dictionary) unless user_dictionary.nil?
      @tokenizer = builder.build
    end

    # Maps each token's surface form to the result of its +all_features+ call.
    #
    # @param sentence [String, nil] text to tokenize
    # @return [Hash] empty when +sentence+ is nil
    def tokenize(sentence)
      process(:all_features, sentence)
    end

    # Maps each token's surface form to the result of its +getReading+ call.
    #
    # @param sentence [String, nil] text to tokenize
    # @return [Hash] empty when +sentence+ is nil
    def reading(sentence)
      process(:getReading, sentence)
    end

    # Returns one +to_kuromoji_hash+ result per token, in tokenizer order.
    #
    # @param sentence [String, nil] text to tokenize
    # @return [Array] empty when +sentence+ is nil
    def tokenize_with_hash(sentence)
      hashes = []
      each_token(sentence) { |token| hashes << token.to_kuromoji_hash }
      hashes
    end

    # Calls +method+ on every token and collects the results keyed by the
    # token's surface form. A surface form that occurs more than once keeps
    # only the value of its last occurrence, because the result is a Hash.
    #
    # @param method [Symbol, String] name of a public method to call on each
    #   token
    # @param sentence [String, nil] text to tokenize
    # @return [Hash] empty when +sentence+ is nil
    def process(method, sentence)
      tokenized = {}
      each_token(sentence) do |token|
        # public_send respects method visibility, so a caller-supplied name
        # cannot reach private or Kernel methods on the token.
        tokenized[token.surface_form] = token.public_send(method)
      end
      tokenized
    end

    private

    # Yields every token the tokenizer produces for +sentence+, walking the
    # Java-style iterator (+has_next+ / +next+) of the returned list.
    # Yields nothing when +sentence+ is nil.
    def each_token(sentence)
      return if sentence.nil?

      iterator = @tokenizer.tokenize(sentence).iterator
      yield iterator.next while iterator.has_next
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
kuromoji-ruby-0.0.4 lib/kuromoji/core.rb
kuromoji-ruby-0.0.3 lib/kuromoji/core.rb