Sha256: 68221c053f7bf326a796a885b9a1aa50bc0d2528dfeb292b5d534fc2511d4ace
Contents?: true
Size: 1.49 KB
Versions: 3
Compression:
Stored size: 1.49 KB
Contents
module OpenNlp class Chunker < Tool self.java_class = Java::opennlp.tools.chunker.ChunkerME def initialize(model, token_model, pos_model) super(model) raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer) raise ArgumentError, "model must be an OpenNlp::POSTagger::Model" unless pos_model.is_a?(Model::POSTagger) @tokenizer = Tokenizer.new(token_model) @pos_tagger = POSTagger.new(pos_model) end def chunk(str) raise ArgumentError, "str must be a String" unless str.is_a?(String) tokens = @tokenizer.tokenize(str) pos_tags = @pos_tagger.tag(tokens).to_ary chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary build_chunks(chunks, tokens, pos_tags) end private def build_chunks(chunks, tokens, pos_tags) # data[i] = [token, pos_tag, chunk_val] data = tokens.zip(pos_tags, chunks) data.inject([]) do |acc, val| chunk = val[2] acc << [{val[0] => val[1]}] if chunk[0] == 'B' # add token to chunk if it is a start of chunk if chunk[0] == 'I' if acc.last acc.last << {val[0] => val[1]} # add token to chunk if it is a continuation of chunk else acc << [{val[0] => val[1]}] # add token to new chunk if no chunks exists end end acc end end def get_last_probabilities @j_instance.probs.to_ary end end end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
open_nlp-0.0.7-java | lib/open_nlp/chunker.rb |
open_nlp-0.0.6-java | lib/open_nlp/chunker.rb |
open_nlp-0.0.5-java | lib/open_nlp/chunker.rb |