module StanfordCoreNLP

  VERSION = '0.2.0'

  require 'stanford-core-nlp/jar_loader'
  require 'stanford-core-nlp/java_wrapper'
  require 'stanford-core-nlp/config'

  class << self

    # The path in which to look for the Stanford JAR files,
    # with a trailing slash.
    #
    # The structure of the JAR folder must be as follows:
    #
    # Files:
    #
    #   /stanford-corenlp.jar
    #   /joda-time.jar
    #   /xom.jar
    #   /bridge.jar*
    #
    # Folders:
    #
    #   /classifiers     Models for the NER system.
    #   /dcoref          Models for the coreference resolver.
    #   /taggers         Models for the POS tagger.
    #   /grammar         Models for the parser.
    #
    # *The file bridge.jar is a thin Java wrapper around the
    # Stanford CoreNLP get() function, which makes it possible
    # to retrieve annotations using static classes as names.
    # This works around one of the limitations of Rjb.
    attr_accessor :jar_path

    # The path to the main folder containing the folders
    # with the individual models inside. By default, this
    # is the same as the JAR path.
    attr_accessor :model_path

    # The flags passed to the JVM on startup. The parser
    # and the named entity recognizer need a lot of memory.
    attr_accessor :jvm_args

    # A file to redirect JVM output to.
    attr_accessor :log_file

    # The model files for a given language.
    attr_accessor :model_files

  end

  # The default JAR path is the gem's bin folder.
  self.jar_path = File.dirname(__FILE__) + '/../bin/'
  # The default model path is the same as the JAR path.
  self.model_path = self.jar_path
  # Load the JVM with a minimum heap size of 512MB and a
  # maximum heap size of 1024MB.
  self.jvm_args = ['-Xms512M', '-Xmx1024M']
  # Turn logging off by default.
  self.log_file = nil

  # Use the models for a given language. The language can be
  # supplied as a full-length name or as a two- or three-letter
  # ISO 639 code (e.g. :english, :eng and :en all work).
  def self.use(language)
    lang = nil
    self.model_files = {}
    Config::LanguageCodes.each do |l, codes|
      lang = codes[2] if codes.include?(language)
    end
    Config::Models.each do |n, languages|
      models = languages[lang]
      folder = Config::ModelFolders[n]
      if models.is_a?(Hash)
        n = n.to_s
        n += '.model' if n == 'ner'
        models.each do |m, file|
          self.model_files["#{n}.#{m}"] = folder + file
        end
      elsif models.is_a?(String)
        self.model_files["#{n}.model"] = folder + models
      end
    end
  end

  # Use English by default.
  self.use(:english)

  # Set a model file. Here are the default models for English:
  #
  #   'pos.model'           => 'english-left3words-distsim.tagger',
  #   'ner.model.3class'    => 'all.3class.distsim.crf.ser.gz',
  #   'ner.model.7class'    => 'muc.7class.distsim.crf.ser.gz',
  #   'ner.model.MISCclass' => 'conll.4class.distsim.crf.ser.gz',
  #   'parser.model'        => 'englishPCFG.ser.gz',
  #   'dcoref.demonym'      => 'demonyms.txt',
  #   'dcoref.animate'      => 'animate.unigrams.txt',
  #   'dcoref.female'       => 'female.unigrams.txt',
  #   'dcoref.inanimate'    => 'inanimate.unigrams.txt',
  #   'dcoref.male'         => 'male.unigrams.txt',
  #   'dcoref.neutral'      => 'neutral.unigrams.txt',
  #   'dcoref.plural'       => 'plural.unigrams.txt',
  #   'dcoref.singular'     => 'singular.unigrams.txt',
  #   'dcoref.states'       => 'state-abbreviations.txt',
  #   'dcoref.extra.gender' => 'namegender.combine.txt'
  #
  def self.set_model(name, file)
    n = name.split('.')[0].intern
    self.model_files[name] = Config::ModelFolders[n] + file
  end

  # Whether the classes are initialized or not.
  @@initialized = false

  # Load the JARs and create the default classes.
  def self.init
    unless @@initialized
      self.load_jars
      self.load_default_classes
    end
    @@initialized = true
  end

  # Load a StanfordCoreNLP pipeline with the
  # specified JVM flags and StanfordCoreNLP
  # properties.
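  #
  # A minimal usage sketch (assuming the JARs and the default
  # English models are present under jar_path and model_path;
  # the annotator symbols and the sample sentence are purely
  # illustrative):
  #
  #   pipeline = StanfordCoreNLP.load(:tokenize, :ssplit, :pos)
  #   text = StanfordCoreNLP::Text.new('Stanford is in California.')
  #   pipeline.annotate(text)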
  def self.load(*annotators)
    self.init unless @@initialized
    # Prepend the model path to the model files
    # required by the requested annotators.
    properties = {}
    self.model_files.each do |k, v|
      found = false
      annotators.each do |annotator|
        found = true if k.index(annotator.to_s)
        break if found
      end
      next unless found
      f = self.model_path + v
      unless File.readable?(f)
        raise "Model file #{f} could not be found. " +
              "You may need to download this file manually " +
              "and/or set the paths properly."
      end
      properties[k] = f
    end
    properties['annotators'] = annotators.map { |x| x.to_s }.join(', ')
    CoreNLP.new(get_properties(properties))
  end

  # Once a specific annotator model has been loaded, the JVM keeps
  # using that same model for any new pipeline that requests the
  # annotator again, ignoring changes to the model configuration.
  #
  # This function would kill the JVM and reload everything, which
  # is needed to create a new pipeline with different models for
  # the same annotators.
  #
  # def self.reload
  #   raise 'Not implemented.'
  # end

  # Load the JARs.
  def self.load_jars
    JarLoader.log(self.log_file)
    JarLoader.jvm_args = self.jvm_args
    JarLoader.jar_path = self.jar_path
    JarLoader.load('joda-time.jar')
    JarLoader.load('xom.jar')
    JarLoader.load('stanford-corenlp.jar')
    JarLoader.load('bridge.jar')
  end

  # Create the Ruby classes corresponding to the Stanford CoreNLP
  # core classes.
  def self.load_default_classes
    const_set(:CoreNLP,
      Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
    self.load_klass 'Annotation'
    self.load_klass 'Word', 'edu.stanford.nlp.ling'
    self.load_klass 'MaxentTagger', 'edu.stanford.nlp.tagger.maxent'
    self.load_klass 'CRFClassifier', 'edu.stanford.nlp.ie.crf'
    self.load_klass 'Properties', 'java.util'
    self.load_klass 'ArrayList', 'java.util'
    self.load_klass 'AnnotationBridge', ''
    const_set(:Text, Annotation)
  end

  # Load a class (e.g. PTBTokenizerAnnotator) from a specific
  # class path (the default is 'edu.stanford.nlp.pipeline').
  # The class is then accessible under the StanfordCoreNLP
  # namespace, e.g. StanfordCoreNLP::PTBTokenizerAnnotator.
  #
  # List of annotators:
  #
  # - PTBTokenizerAnnotator - tokenizes the text following Penn Treebank conventions.
  # - WordToSentenceAnnotator - splits a sequence of words into a sequence of sentences.
  # - POSTaggerAnnotator - annotates the text with part-of-speech tags.
  # - MorphaAnnotator - morphological normalizer (generates lemmas).
  # - NERAnnotator - annotates the text with named-entity labels.
  # - NERCombinerAnnotator - combines several NER models (use this instead of NERAnnotator!).
  # - TrueCaseAnnotator - detects the true case of words in free text (useful for all-uppercase or all-lowercase text).
  # - ParserAnnotator - generates constituent and dependency trees.
  # - NumberAnnotator - recognizes numerical entities such as numbers, money, times, and dates.
  # - TimeWordAnnotator - recognizes common temporal expressions, such as "teatime".
  # - QuantifiableEntityNormalizingAnnotator - normalizes the content of all numerical entities.
  # - SRLAnnotator - annotates predicates and their semantic roles.
  # - CorefAnnotator - implements pronominal anaphora resolution using a statistical model (deprecated!).
  # - DeterministicCorefAnnotator - implements anaphora resolution using a deterministic model (newer model, use this!).
  # - NFLAnnotator - implements entity and relation mention extraction for the NFL domain.
  def self.load_class(klass, base = 'edu.stanford.nlp.pipeline')
    self.init unless @@initialized
    self.load_klass(klass, base)
  end

  # Create a java.util.Properties object from a hash.
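  #
  # A rough sketch (init must have run first, since the Properties
  # class is only imported at that point; the property names below
  # are purely illustrative):
  #
  #   StanfordCoreNLP.init
  #   props = StanfordCoreNLP.get_properties('annotators' => 'tokenize, ssplit')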
  def self.get_properties(properties)
    props = Properties.new
    properties.each do |property, value|
      props.set_property(property, value)
    end
    props
  end

  # Get a Java ArrayList binding to pass lists
  # of tokens to the Stanford Core NLP process.
  def self.get_list(tokens)
    list = StanfordCoreNLP::ArrayList.new
    tokens.each do |t|
      list.add(StanfordCoreNLP::Word.new(t.to_s))
    end
    list
  end

  # under_case -> CamelCase.
  def self.camel_case(text)
    text.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
  end

  private

  def self.load_klass(klass, base = 'edu.stanford.nlp.pipeline')
    base += '.' unless base == ''
    const_set(klass.intern, Rjb::import("#{base}#{klass}"))
  end

end
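
# Example use of the helper methods above (a sketch; it assumes
# StanfordCoreNLP.init has been called so that the Java classes
# have already been imported):
#
#   StanfordCoreNLP.get_list(%w[Hello world])      # => java.util.ArrayList of Word objects
#   StanfordCoreNLP.camel_case(:part_of_speech)    # => "PartOfSpeech"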