Sha256: f001ae77718cee328fc717aaee4f0f66ec33d75b7d63bff7139ca00629d6677e
Contents?: true
Size: 857 Bytes
Versions: 1
Compression:
Stored size: 857 Bytes
Contents
# -*- encoding: utf-8 -*-
require 'natto'

module RNlp
  # Tokenizer built on MeCab through the natto gem.
  # It copes only with Japanese.
  class Tokenize
    # Runs morphological analysis on +input+ and returns the resulting tokens.
    #
    # Each token is a two-element array: [surface, tag], where tag is the
    # first comma-separated field of the node's feature string.
    #
    # Rules applied while walking the parsed nodes:
    # * A node tagged '助動詞' (auxiliary verb) is not emitted on its own;
    #   its surface is appended to the surface of the previous token. When
    #   there is no previous token it is handled like any other node.
    # * Nodes whose surface is '。' or '.' are skipped.
    # * A node with a nil surface is recorded as ['。', '記号'].
    #   NOTE(review): presumably this is MeCab's end-of-sentence node --
    #   confirm against natto.
    # * After parsing, a final token whose surface is '。' is dropped.
    #
    # @param input [String] text handed to Natto::MeCab#parse
    # @return [Array<Array>] list of [surface, tag] pairs, possibly empty
    def tokenize(input)
      natto = Natto::MeCab.new
      tokens = []

      natto.parse(input) do |node|
        surface = node.surface
        tag = node.feature.split(',')[0]

        # Glue an auxiliary verb onto the preceding word. The emptiness guard
        # prevents indexing into nil when the very first node is an auxiliary.
        if tag == '助動詞' && !tokens.empty?
          tokens.last[0] += surface
          next
        end

        # Sentence-ending punctuation is not kept as a token.
        next if surface == '。' || surface == '.'

        tokens.push(surface.nil? ? ['。', '記号'] : [surface, tag])
      end

      # Remove the trailing sentence marker, if any; an empty result is left as is.
      tokens.pop if !tokens.empty? && tokens.last[0] == '。'
      tokens
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
r_nlp-0.1.8 | lib/r_nlp/tokenize.rb |