require 'rbconfig'
require 'stanford-core-nlp/config'

# Ruby bindings for the Stanford CoreNLP Java tools. The module is
# extended with BindIt::Binding (JVM / JAR / class binding) and with
# StanfordCoreNLP::Bridge, and exposes module-level configuration
# (JAR path, model path, language, model files) plus the `load`
# entry point that builds a pipeline.
module StanfordCoreNLP

  VERSION = '0.5.1'

  require 'bind-it'
  extend BindIt::Binding

  # ############################ #
  # BindIt Configuration Options #
  # ############################ #

  # The default path for the JAR files
  # is the gem's bin folder.
  self.jar_path = File.dirname(__FILE__).gsub(/\/lib\z/, '') + '/bin/'

  # Default namespace is the Stanford pipeline namespace.
  self.default_namespace = 'edu.stanford.nlp.pipeline'

  # Load the JVM with a minimum heap size of 512MB,
  # and a maximum heap size of 1024MB.
  StanfordCoreNLP.jvm_args = ['-Xms512M', '-Xmx1024M']

  # Turn logging off by default.
  StanfordCoreNLP.log_file = nil

  # Default JAR files to load.
  StanfordCoreNLP.default_jars = [
    'joda-time.jar',
    'xom.jar',
    'stanford-parser.jar',
    'stanford-corenlp.jar',
    'stanford-segmenter.jar',
    'jollyday.jar',
    'bridge.jar'
  ]

  # Default classes to load.
  StanfordCoreNLP.default_classes = [
    ['StanfordCoreNLP', 'edu.stanford.nlp.pipeline', 'CoreNLP'],
    ['Annotation', 'edu.stanford.nlp.pipeline'],
    ['Word', 'edu.stanford.nlp.ling'],
    ['CoreLabel', 'edu.stanford.nlp.ling'],
    ['MaxentTagger', 'edu.stanford.nlp.tagger.maxent'],
    ['CRFClassifier', 'edu.stanford.nlp.ie.crf'],
    ['LexicalizedParser', 'edu.stanford.nlp.parser.lexparser'],
    ['Options', 'edu.stanford.nlp.parser.lexparser'],
    ['Properties', 'java.util'],
    ['ArrayList', 'java.util'],
    ['AnnotationBridge', '']
  ]

  # ########################### #
  # Stanford Core NLP bindings  #
  # ########################### #

  require 'stanford-core-nlp/bridge'
  extend StanfordCoreNLP::Bridge

  class << self
    # The model file names for a given language.
    attr_accessor :model_files
    # The folder in which to look for models.
    attr_accessor :model_path
    # Store the language currently being used.
    attr_accessor :language
  end

  # The path to the main folder containing the folders
  # with the individual models inside. By default, this
  # is the same as the JAR path.
  self.model_path = self.jar_path

  # ########################### #
  # Public configuration params #
  # ########################### #

  # Use models for a given language. Language can be
  # supplied as full-length, or ISO-639 2 or 3 letter
  # code (e.g. :english, :eng or :en will work).
  #
  # Resets +model_files+ and repopulates it from Config::Models
  # for the resolved language. If +language+ matches no entry in
  # Config::LanguageCodes, +self.language+ is set to nil and the
  # model files are looked up under a nil language.
  def self.use(language)
    lang = nil
    self.model_files = {}

    # The third entry of each code list is stored as the language.
    Config::LanguageCodes.each do |_, codes|
      lang = codes[2] if codes.include?(language)
    end
    self.language = lang

    Config::Models.each do |n, languages|
      models = languages[lang]
      folder = Config::ModelFolders[n]
      if models.is_a?(Hash)
        # Several model files for one annotator: "<annotator>.<key>".
        models.each do |m, file|
          self.model_files["#{n}.#{m}"] = folder + file
        end
      elsif models.is_a?(String)
        # A single model file: "<annotator>.model".
        self.model_files["#{n}.model"] = folder + models
      end
    end
  end

  # Use english by default.
  self.use :english

  # Set a model file.
  #
  # +name+ is a property key such as "pos.model"; the part before
  # the first dot selects the folder in Config::ModelFolders.
  # +file+ is the file name inside that folder. A Symbol +name+ is
  # accepted and stored under its String form.
  def self.set_model(name, file)
    key = name.to_s
    n = key.split('.')[0].intern
    self.model_files[key] = Config::ModelFolders[n] + file
  end

  # ########################### #
  # Public API methods          #
  # ########################### #

  # Bind the Java classes (via BindIt's +bind+) and inject the
  # bridge's get method into each default class.
  def self.bind
    # Take care of Windows users. The paths are reassigned rather
    # than mutated in place so that frozen strings supplied by the
    # caller do not raise.
    if self.running_on_windows?
      self.jar_path = self.jar_path.gsub('/', '\\')
      self.model_path = self.model_path.gsub('/', '\\')
    end

    # Make the bindings.
    super

    # Bind annotation bridge.
    # NOTE(review): only the first element of each entry is used for
    # the constant lookup; entries may carry a third element (e.g.
    # 'CoreNLP') -- confirm against BindIt's handling of that element.
    self.default_classes.each do |info|
      klass = const_get(info.first)
      self.inject_get_method(klass)
    end
  end

  # Load a StanfordCoreNLP pipeline for the given annotators
  # (e.g. :tokenize, :ssplit, :pos). Binds the Java classes first
  # if that has not been done yet.
  #
  # Raises a RuntimeError if a model file needed by one of the
  # requested annotators is not readable.
  #
  # Returns the result of AnnotationBridge.getPipelineWithProperties.
  def self.load(*annotators)
    self.bind unless self.bound

    annotator_names = annotators.map { |x| x.to_s }

    # Prepend the model path to the model files, keeping only the
    # files whose property key mentions a requested annotator.
    properties = {}
    self.model_files.each do |k, v|
      next unless annotator_names.any? { |name| k.index(name) }
      f = self.model_path + v
      unless File.readable?(f)
        raise "Model file #{f} could not be found. " +
              "You may need to download this file manually " +
              "and/or set paths properly."
      end
      properties[k] = f
    end

    properties['annotators'] = annotator_names.join(', ')

    unless self.language == :english
      # Bug fix for French/German parsers.
      # Otherwise throws "IllegalArgumentException:
      # Unknown option: -retainTmpSubcategories"
      properties['parse.flags'] = ''
      # Bug fix for French/German parsers.
      # Otherwise throws java.lang.NullPointerException: null.
      properties['parse.buildgraphs'] = 'false'
    end

    # Bug fix for NER system. Otherwise throws:
    # Error initializing binder 1 at edu.stanford.
    # nlp.time.Options.(Options.java:88)
    properties['sutime.binders'] = '0'

    # Manually include SUTime models. Compared on the string form so
    # that both :ner and 'ner' enable the rules.
    if annotator_names.include?('ner')
      properties['sutime.rules'] =
        self.model_path + 'sutime/defs.sutime.txt, ' +
        self.model_path + 'sutime/english.sutime.txt'
    end

    props = get_properties(properties)

    # Hack for Java7 compatibility.
    bridge = const_get(:AnnotationBridge)
    bridge.getPipelineWithProperties(props)
  end

  # Hack in order not to break backwards compatibility:
  # StanfordCoreNLP::Text resolves to Annotation, with a warning.
  def self.const_missing(const)
    if const == :Text
      puts 'WARNING: StanfordCoreNLP::Text has been deprecated. ' +
           'Please use StanfordCoreNLP::Annotation instead.'
      Annotation
    else
      super(const)
    end
  end

  # Helper methods. These are singleton methods, which a bare
  # `private` keyword does not affect, so they are (and remain)
  # callable from outside the module.

  # Create a java.util.Properties object from a hash.
  def self.get_properties(properties)
    props = Properties.new
    properties.each do |property, value|
      props.set_property(property, value)
    end
    props
  end

  # Get a Java ArrayList binding to pass lists
  # of tokens to the Stanford Core NLP process.
  # Each token is converted with +to_s+ and wrapped in a Word.
  def self.get_list(tokens)
    list = StanfordCoreNLP::ArrayList.new
    tokens.each do |t|
      list.add(Word.new(t.to_s))
    end
    list
  end

  # Returns true if we're running on Windows. Uses the host OS
  # reported by RbConfig so that mswin and mingw builds are both
  # detected.
  def self.running_on_windows?
    !(RbConfig::CONFIG['host_os'] =~ /mswin|mingw/i).nil?
  end

  # camel_case which also support dot as separator
  def self.camel_case(s)
    s = s.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }
    s.gsub(/(?:^|_|\.)(.)/) { $1.upcase }
  end

end