Sha256: 7cd9775e62906a67949f084b1e6570e176d5adcb35d23688fe901edbf2b9e595
Contents?: true
Size: 1.69 KB
Versions: 2
Compression:
Stored size: 1.69 KB
Contents
module OpenNlp
  # Chunker backed by OpenNLP's ChunkerME. It owns a Tokenizer and a
  # POSTagger so that a raw string can be tokenized, tagged and chunked
  # in a single call to #chunk.
  class Chunker < Tool
    self.java_class = Java::opennlp.tools.chunker.ChunkerME

    # @param model chunker model, passed unchanged to the superclass initializer
    # @param [Model::Tokenizer] token_model model used to build the tokenizer
    # @param [Model::POSTagger] pos_model model used to build the POS tagger
    # @raise [ArgumentError] if token_model or pos_model is not of the expected class
    def initialize(model, token_model, pos_model)
      super(model)

      # The messages interpolate the very constants being checked so they can
      # never drift away from the actual expectation.
      unless token_model.is_a?(Model::Tokenizer)
        fail ArgumentError, "token model must be an #{Model::Tokenizer}"
      end

      unless pos_model.is_a?(Model::POSTagger)
        fail ArgumentError, "pos model must be an #{Model::POSTagger}"
      end

      @tokenizer = Tokenizer.new(token_model)
      @pos_tagger = POSTagger.new(pos_model)
    end

    # Chunks a string into groups of tokens annotated with part-of-speech tags
    #
    # @param [String] str string to chunk
    # @return [Array<Array<Hash>>] array of chunks; each chunk is an array of
    #   single-pair hashes of the form { token => pos_tag }
    # @raise [ArgumentError] if str is not a String
    def chunk(str)
      fail ArgumentError, 'str must be a String' unless str.is_a?(String)

      tokens = tokenizer.tokenize(str)
      pos_tags = pos_tagger.tag(tokens).to_ary

      chunks = j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary

      build_chunks(chunks, tokens, pos_tags)
    end

    private

    attr_reader :tokenizer, :pos_tagger

    # Groups tokens into chunks according to the first character of each
    # chunk tag: 'B' opens a new chunk, 'I' extends the most recent chunk.
    # Tokens whose tag starts with anything else are left out of the result.
    #
    # @param [Array<String>] chunks chunk tags, one per token
    # @param [Array<String>] tokens tokens of the chunked string
    # @param [Array<String>] pos_tags part-of-speech tags, one per token
    # @return [Array<Array<Hash>>] chunks of { token => pos_tag } hashes
    def build_chunks(chunks, tokens, pos_tags)
      tokens.zip(pos_tags, chunks).each_with_object([]) do |(token, pos_tag, chunk_tag), acc|
        entry = { token => pos_tag }

        case chunk_tag[0]
        when 'B'
          acc << [entry]
        when 'I'
          if acc.empty?
            # A continuation tag with nothing to continue starts a new chunk.
            acc << [entry]
          else
            acc.last << entry
          end
        end
      end
    end

    # Returns `j_instance.probs` converted to a Ruby array.
    #
    # NOTE(review): this method is declared under `private` and is not called
    # anywhere in this class - confirm whether it was meant to be public.
    def get_last_probabilities
      j_instance.probs.to_ary
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
open_nlp-0.2.0-java | lib/open_nlp/chunker.rb |
open_nlp-0.1.0-java | lib/open_nlp/chunker.rb |