Sha256: f001ae77718cee328fc717aaee4f0f66ec33d75b7d63bff7139ca00629d6677e

Contents?: true

Size: 857 Bytes

Versions: 1

Compression:

Stored size: 857 Bytes

Contents

# -*- encoding: utf-8 -*-

require 'natto'

module RNlp
  # Splits text into [surface, part-of-speech tag] pairs using MeCab through
  # the natto gem. It copes only with Japanese.
  class Tokenize
    # First feature field that marks a node as an auxiliary verb.
    AUXILIARY_VERB_TAG = '助動詞'.freeze
    # Surfaces treated as sentence-ending periods; they are left out of the result.
    PERIOD_SURFACES = ['。', '.'].freeze

    # Runs morphological analysis on +input+ and collects the resulting words.
    #
    # * An auxiliary verb is appended to the surface of the preceding token
    #   instead of becoming a token of its own. When there is no preceding
    #   token it is kept as a separate token.
    # * Nodes whose surface is '。' or '.' are skipped.
    # * A node with a nil surface (presumably the end-of-sentence node; confirm
    #   against the natto version in use) is recorded as a ['。', '記号']
    #   placeholder, and a trailing placeholder is removed before returning.
    #
    # @param input [String] text passed to Natto::MeCab#parse
    # @return [Array<Array>] list of [surface, tag] pairs; empty when nothing
    #   was collected
    def tokenize(input)
      tokens = []
      Natto::MeCab.new.parse(input) do |node|
        surface = node.surface
        # The tag is the first comma-separated field of the feature string.
        tag = node.feature.split(',')[0]

        if tag == AUXILIARY_VERB_TAG && !tokens.empty?
          # `+=` builds a new string, so the surface handed over by natto for
          # the preceding node is never mutated.
          tokens.last[0] += surface
        elsif PERIOD_SURFACES.include?(surface)
          next
        elsif surface.nil?
          tokens.push(['。', '記号'])
        else
          tokens.push([surface, tag])
        end
      end
      # Guard against an empty list: there may be nothing to inspect or pop.
      tokens.pop if !tokens.empty? && tokens.last[0] == '。'
      tokens
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
r_nlp-0.1.8 lib/r_nlp/tokenize.rb