Sha256: 67ac8018015231a4e210853fcce3d101381d26e761ac188bb0b34e0af1febdf7

Contents?: true

Size: 1.5 KB

Versions: 1

Compression:

Stored size: 1.5 KB

Contents

require 'rbbt'
require 'rjb'
require 'rbbt/ner/annotations'
require 'rbbt/ner/NER'

# Offers a Ruby interface to ABNER, a Named Entity Recognition package
# written in Java: Abner[http://www.cs.wisc.edu/~bsettles/abner/].
#
# The Java classes are reached through Rjb and are imported when the class
# body is evaluated.
class Abner < NER

  # Registers the ABNER install definition with Rbbt.
  # NOTE(review): presumably this lets Rbbt install the Java package on
  # demand -- confirm against the Rbbt resource documentation.
  Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find

  # Java classes imported through Rjb. They are kept as class variables so
  # that existing code reopening this class keeps working.
  @@JFile   = Rjb::import('java.io.File')
  @@Tagger  = Rjb::import('abner.Tagger')
  @@Trainer = Rjb::import('abner.Trainer')

  # Builds the underlying ABNER tagger.
  #
  # modelfile:: optional path to a custom trained model. When it is nil the
  #             default BioCreative model is used.
  def initialize(modelfile = nil)
    @tagger =
      if modelfile.nil?
        @@Tagger.new(@@Tagger.BIOCREATIVE)
      else
        @@Tagger.new(@@JFile.new(modelfile))
      end
  end

  # Given a chunk of text, finds all the mentions appearing in it. All the
  # mentions found are returned, regardless of type, to be coherent with
  # the rest of the NER packages in Rbbt.
  #
  # text:: the text to tag; nil or empty text yields an empty Array.
  #
  # Returns an Array with one String per mention, each one passed through
  # NamedEntity.annotate together with its offset in +text+ and its type.
  # The offset is nil when the mention, exactly as reported by the tagger,
  # cannot be found in the part of +text+ that follows the previously
  # located mention.
  def match(text)
    return [] if text.nil? || text.empty?

    entities = @tagger.getEntities(text)
    strings  = entities[0]
    types    = entities[1]

    # Position in +text+ from which the next mention is searched. Moving it
    # past each located mention makes repeated mentions resolve to
    # successive occurrences, without re-slicing +text+ on every step.
    cursor = 0
    strings.zip(types).map do |raw_mention, type|
      mention  = raw_mention.to_s
      position = text.index(mention, cursor)

      NamedEntity.annotate(mention, position, type.to_s)

      # Only advance when the mention was located; otherwise the following
      # mentions are still searched from the same place.
      cursor = position + mention.length if position

      mention
    end
  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
rbbt-text-0.5.0 lib/rbbt/ner/abner.rb