# Entry point for the Stanford CoreNLP bindings.
#
# Holds the module-level configuration (JAR location, JVM flags, log file,
# model files) and builds a CoreNLP pipeline object through Rjb.
#
# Example:
#
#   StanfordCoreNLP.jvm_args = ['-Xms512M', '-Xmx1024M']
#   pipeline = StanfordCoreNLP.load(:tokenize, :ssplit)
module StanfordCoreNLP

  VERSION = '0.1.1'

  require 'stanford-core-nlp/jar-loader.rb'
  require 'stanford-core-nlp/java-wrapper.rb'

  class << self
    # The path in which to look for the Stanford JAR files.
    # This is passed to JarLoader and is also used as the base
    # directory for the model files.
    attr_accessor :jar_path

    # The flags used when starting the JVM.
    # Parser and named entity recognizer are very memory consuming.
    attr_accessor :jvm_args

    # A file to redirect JVM output to (nil disables logging).
    attr_accessor :log_file

    # The model files (property name => path relative to jar_path).
    # Use #set_model to modify these.
    attr_accessor :model_files
  end

  # The default JAR path is the gem's bin folder.
  self.jar_path = File.dirname(__FILE__) + '/../bin/'

  # Load the JVM with a minimum heap size of 512MB and a
  # maximum heap size of 1024MB.
  self.jvm_args = ['-Xms512M', '-Xmx1024M']

  # Turn logging off by default.
  self.log_file = nil

  # Default model files.
  self.model_files = {
    'pos.model'               => 'taggers/english-left3words-distsim.tagger',
    'ner.model.3class'        => 'classifiers/all.3class.distsim.crf.ser.gz',
    'ner.model.7class'        => 'classifiers/muc.7class.distsim.crf.ser.gz',
    'ner.model.MISCclass'     => 'classifiers/conll.4class.distsim.crf.ser.gz',
    'parser.model'            => 'grammar/englishPCFG.ser.gz',
    'dcoref.demonym'          => 'dcoref/demonyms.txt',
    'dcoref.animate'          => 'dcoref/animate.unigrams.txt',
    'dcoref.female'           => 'dcoref/female.unigrams.txt',
    'dcoref.inanimate'        => 'dcoref/inanimate.unigrams.txt',
    'dcoref.male'             => 'dcoref/male.unigrams.txt',
    'dcoref.neutral'          => 'dcoref/neutral.unigrams.txt',
    'dcoref.plural'           => 'dcoref/plural.unigrams.txt',
    'dcoref.singular'         => 'dcoref/singular.unigrams.txt',
    'dcoref.states'           => 'dcoref/state-abbreviations.txt',
    # FIXME: placeholder path - can somebody provide this file?
    'dcoref.countries'        => 'dcoref/unknown.txt',
    # FIXME: placeholder path - can somebody provide this file?
    'dcoref.states.provinces' => 'dcoref/unknown.txt',
    'dcoref.extra.gender'     => 'dcoref/namegender.combine.txt'
  }

  # Set a model file.
  #
  # name - the property name (a key of model_files).
  # file - the model path, relative to jar_path.
  #
  # Returns the file that was stored.
  def self.set_model(name, file)
    model_files[name] = file
  end

  # Load a StanfordCoreNLP pipeline using the configured JVM flags,
  # JAR path, log file and model files.
  #
  # annotators - the annotator names (strings or symbols) to enable;
  #              they are joined into the 'annotators' property.
  #
  # Returns a new CoreNLP pipeline object.
  def self.load(*annotators)
    load_jars(jvm_args, jar_path, log_file)
    create_classes

    # Prepend the JAR path to the model files. File.join is used so the
    # result is correct whether or not jar_path ends with a separator; an
    # empty jar_path keeps the model paths relative, as plain concatenation
    # did.
    base = jar_path
    properties = model_files.each_with_object({}) do |(name, file), props|
      props[name] = base.empty? ? file : File.join(base, file)
    end
    properties['annotators'] = annotators.map(&:to_s).join(', ')

    CoreNLP.new(get_properties(properties))
  end

  # Configure JarLoader and load the JARs.
  #
  # jvm_args - the JVM flags handed to JarLoader.
  # jar_path - the directory handed to JarLoader.
  # log_file - the file handed to JarLoader.log; skipped when nil/false.
  def self.load_jars(jvm_args, jar_path, log_file)
    JarLoader.jvm_args = jvm_args
    JarLoader.jar_path = jar_path
    JarLoader.log(log_file) if log_file

    %w[joda-time.jar xom.jar stanford-corenlp.jar bridge.jar].each do |jar|
      JarLoader.load(jar)
    end
  end

  # Create the Ruby classes corresponding to the StanfordNLP
  # core classes.
  #
  # Constants that already exist are left untouched, so calling this (or
  # .load) more than once neither re-imports the Java classes nor triggers
  # "already initialized constant" warnings.
  def self.create_classes
    {
      :CoreNLP          => 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
      :Annotation       => 'edu.stanford.nlp.pipeline.Annotation',
      :Properties       => 'java.util.Properties',
      :AnnotationBridge => 'AnnotationBridge'
    }.each do |name, java_class|
      # inherit=false: only look at this module, not at Object.
      const_set(name, Rjb::import(java_class)) unless const_defined?(name, false)
    end

    # A more intuitive alias for Annotation.
    const_set(:Text, Annotation) unless const_defined?(:Text, false)
  end

  # Create a java.util.Properties object from a hash.
  #
  # properties - a hash of property name => value. Keys and values are
  #              converted with to_s because Properties stores strings.
  #
  # Returns the populated Properties object.
  def self.get_properties(properties)
    props = Properties.new
    properties.each do |property, value|
      props.set_property(property.to_s, value.to_s)
    end
    props
  end

end