module StanfordCoreNLP
  # Static lookup tables used to configure the Stanford CoreNLP bindings:
  # language codes, model folders and file names, tag sets, and the known
  # annotation classes grouped by the Java class that declares them.
  class Config
    # Language => accepted codes, listed as
    # [2-letter ISO639, 3-letter ISO639, humanized name].
    LanguageCodes = {
      english: %i[en eng english],
      german: %i[de ger german],
      french: %i[fr fre french]
    }

    # Model type => folder (inside the JAR path) holding its model files.
    ModelFolders = {
      pos: 'taggers/',
      parse: 'grammar/',
      ner: 'classifiers/',
      dcoref: 'dcoref/'
    }

    # Language => tag set used by Stanford for that language.
    TagSets = {
      english: :penn,
      german: :stutgart,
      french: :paris7
    }

    # Model type => language => default model. A language maps either to a
    # single file name (String) or to a Hash of property suffix => file
    # name; an empty Hash means no default models are listed for it.
    Models = {
      pos: {
        english: 'english-left3words-distsim.tagger',
        german: 'german-fast.tagger',
        french: 'french.tagger'
      },

      parse: {
        english: 'englishPCFG.ser.gz',
        german: 'germanPCFG.ser.gz',
        french: 'frenchFactored.ser.gz'
      },

      ner: {
        english: {
          '3class' => 'all.3class.distsim.crf.ser.gz',
          '7class' => 'muc.7class.distsim.crf.ser.gz',
          'MISCclass' => 'conll.4class.distsim.crf.ser.gz'
        },
        german: {},
        french: {}
      },

      dcoref: {
        english: {
          'demonym' => 'demonyms.txt',
          'animate' => 'animate.unigrams.txt',
          'female' => 'female.unigrams.txt',
          'inanimate' => 'inanimate.unigrams.txt',
          'male' => 'male.unigrams.txt',
          'neutral' => 'neutral.unigrams.txt',
          'plural' => 'plural.unigrams.txt',
          'singular' => 'singular.unigrams.txt',
          'states' => 'state-abbreviations.txt',
          'countries' => 'countries',
          'states.provinces' => 'statesandprovinces',
          'extra.gender' => 'namegender.combine.txt'
        },
        german: {},
        french: {}
      }

      # Models still to be added:
      #   "truecase.model"         - path to the true-casing model;
      #                              default: StanfordCoreNLPModels/truecase/noUN.ser.gz
      #   "truecase.bias"          - class bias of the true-case model;
      #                              default: INIT_UPPER:-0.7,UPPER:-0.7,O:0
      #   "truecase.mixedcasefile" - path to the mixed-case file;
      #                              default: StanfordCoreNLPModels/truecase/MixDisambiguation.list
      #   "nfl.gazetteer"          - path to the gazetteer for the NFL domain
      #   "nfl.relation.model"     - path to the NFL relation extraction model
    }

    # Java class path => names of the annotation classes listed under it.
    Annotations = {
      'nlp.dcoref.CoNLL2011DocumentReader' => %w[
        CorefMentionAnnotation
        NamedEntityAnnotation
      ],

      'nlp.ling.CoreAnnotations' => %w[
        AbbrAnnotation
        AbgeneAnnotation
        AbstrAnnotation
        AfterAnnotation
        AnswerAnnotation
        AnswerObjectAnnotation
        AntecedentAnnotation
        ArgDescendentAnnotation
        ArgumentAnnotation
        BagOfWordsAnnotation
        BeAnnotation
        BeforeAnnotation
        BeginIndexAnnotation
        BestCliquesAnnotation
        BestFullAnnotation
        CalendarAnnotation
        CategoryAnnotation
        CategoryFunctionalTagAnnotation
        CharacterOffsetBeginAnnotation
        CharacterOffsetEndAnnotation
        CharAnnotation
        ChineseCharAnnotation
        ChineseIsSegmentedAnnotation
        ChineseOrigSegAnnotation
        ChineseSegAnnotation
        ChunkAnnotation
        CoarseTagAnnotation
        CommonWordsAnnotation
        CoNLLDepAnnotation
        CoNLLDepParentIndexAnnotation
        CoNLLDepTypeAnnotation
        CoNLLPredicateAnnotation
        CoNLLSRLAnnotation
        ContextsAnnotation
        CopyAnnotation
        CostMagnificationAnnotation
        CovertIDAnnotation
        D2_LBeginAnnotation
        D2_LEndAnnotation
        D2_LMiddleAnnotation
        DayAnnotation
        DependentsAnnotation
        DictAnnotation
        DistSimAnnotation
        DoAnnotation
        DocDateAnnotation
        DocIDAnnotation
        DomainAnnotation
        EndIndexAnnotation
        EntityClassAnnotation
        EntityRuleAnnotation
        EntityTypeAnnotation
        FeaturesAnnotation
        FemaleGazAnnotation
        FirstChildAnnotation
        ForcedSentenceEndAnnotation
        FreqAnnotation
        GazAnnotation
        GazetteerAnnotation
        GenericTokensAnnotation
        GeniaAnnotation
        GoldAnswerAnnotation
        GovernorAnnotation
        GrandparentAnnotation
        HaveAnnotation
        HeadWordStringAnnotation
        HeightAnnotation
        IDAnnotation
        IDFAnnotation
        INAnnotation
        IndexAnnotation
        InterpretationAnnotation
        IsDateRangeAnnotation
        IsURLAnnotation
        LabelAnnotation
        LastGazAnnotation
        LastTaggedAnnotation
        LBeginAnnotation
        LeftChildrenNodeAnnotation
        LeftTermAnnotation
        LemmaAnnotation
        LEndAnnotation
        LengthAnnotation
        LMiddleAnnotation
        MaleGazAnnotation
        MarkingAnnotation
        MonthAnnotation
        MorphoCaseAnnotation
        MorphoGenAnnotation
        MorphoNumAnnotation
        MorphoPersAnnotation
        NamedEntityTagAnnotation
        NeighborsAnnotation
        NERIDAnnotation
        NormalizedNamedEntityTagAnnotation
        NotAnnotation
        NumericCompositeObjectAnnotation
        NumericCompositeTypeAnnotation
        NumericCompositeValueAnnotation
        NumericObjectAnnotation
        NumericTypeAnnotation
        NumericValueAnnotation
        NumerizedTokensAnnotation
        NumTxtSentencesAnnotation
        OriginalAnswerAnnotation
        OriginalCharAnnotation
        OriginalTextAnnotation
        ParagraphAnnotation
        ParagraphsAnnotation
        ParaPositionAnnotation
        ParentAnnotation
        PartOfSpeechAnnotation
        PercentAnnotation
        PhraseWordsAnnotation
        PhraseWordsTagAnnotation
        PolarityAnnotation
        PositionAnnotation
        PossibleAnswersAnnotation
        PredictedAnswerAnnotation
        PrevChildAnnotation
        PriorAnnotation
        ProjectedCategoryAnnotation
        ProtoAnnotation
        RoleAnnotation
        SectionAnnotation
        SemanticHeadTagAnnotation
        SemanticHeadWordAnnotation
        SemanticTagAnnotation
        SemanticWordAnnotation
        SentenceIDAnnotation
        SentenceIndexAnnotation
        SentencePositionAnnotation
        SentencesAnnotation
        ShapeAnnotation
        SpaceBeforeAnnotation
        SpanAnnotation
        SpeakerAnnotation
        SRL_ID
        SRLIDAnnotation
        SRLInstancesAnnotation
        StackedNamedEntityTagAnnotation
        StateAnnotation
        StemAnnotation
        SubcategorizationAnnotation
        TagLabelAnnotation
        TextAnnotation
        TokenBeginAnnotation
        TokenEndAnnotation
        TokensAnnotation
        TopicAnnotation
        TrueCaseAnnotation
        TrueCaseTextAnnotation
        TrueTagAnnotation
        UBlockAnnotation
        UnaryAnnotation
        UnknownAnnotation
        UtteranceAnnotation
        UTypeAnnotation
        ValueAnnotation
        VerbSenseAnnotation
        WebAnnotation
        WordFormAnnotation
        WordnetSynAnnotation
        WordPositionAnnotation
        WordSenseAnnotation
        XmlContextAnnotation
        XmlElementAnnotation
        YearAnnotation
      ],

      'nlp.dcoref.CorefCoreAnnotations' => %w[
        CorefAnnotation
        CorefChainAnnotation
        CorefClusterAnnotation
        CorefClusterIdAnnotation
        CorefDestAnnotation
        CorefGraphAnnotation
      ],

      'nlp.ling.CoreLabel' => %w[
        GenericAnnotation
      ],

      'nlp.trees.EnglishGrammaticalRelations' => %w[
        AbbreviationModifierGRAnnotation
        AdjectivalComplementGRAnnotation
        AdjectivalModifierGRAnnotation
        AdvClauseModifierGRAnnotation
        AdverbialModifierGRAnnotation
        AgentGRAnnotation
        AppositionalModifierGRAnnotation
        ArgumentGRAnnotation
        AttributiveGRAnnotation
        AuxModifierGRAnnotation
        AuxPassiveGRAnnotation
        ClausalComplementGRAnnotation
        ClausalPassiveSubjectGRAnnotation
        ClausalSubjectGRAnnotation
        ComplementGRAnnotation
        ComplementizerGRAnnotation
        ConjunctGRAnnotation
        ControllingSubjectGRAnnotation
        CoordinationGRAnnotation
        CopulaGRAnnotation
        DeterminerGRAnnotation
        DirectObjectGRAnnotation
        ExpletiveGRAnnotation
        IndirectObjectGRAnnotation
        InfinitivalModifierGRAnnotation
        MarkerGRAnnotation
        ModifierGRAnnotation
        MultiWordExpressionGRAnnotation
        NegationModifierGRAnnotation
        NominalPassiveSubjectGRAnnotation
        NominalSubjectGRAnnotation
        NounCompoundModifierGRAnnotation
        NpAdverbialModifierGRAnnotation
        NumberModifierGRAnnotation
        NumericModifierGRAnnotation
        ObjectGRAnnotation
        ParataxisGRAnnotation
        ParticipialModifierGRAnnotation
        PhrasalVerbParticleGRAnnotation
        PossessionModifierGRAnnotation
        PossessiveModifierGRAnnotation
        PreconjunctGRAnnotation
        PredeterminerGRAnnotation
        PredicateGRAnnotation
        PrepositionalComplementGRAnnotation
        PrepositionalModifierGRAnnotation
        PrepositionalObjectGRAnnotation
        PunctuationGRAnnotation
        PurposeClauseModifierGRAnnotation
        QuantifierModifierGRAnnotation
        ReferentGRAnnotation
        RelativeClauseModifierGRAnnotation
        RelativeGRAnnotation
        SemanticDependentGRAnnotation
        SubjectGRAnnotation
        TemporalModifierGRAnnotation
        XClausalComplementGRAnnotation
      ],

      'nlp.trees.GrammaticalRelation' => %w[
        DependentGRAnnotation
        GovernorGRAnnotation
        GrammaticalRelationAnnotation
        KillGRAnnotation
        Language
        RootGRAnnotation
      ],

      'nlp.ie.machinereading.structure.MachineReadingAnnotations' => %w[
        DependencyAnnotation
        DocumentDirectoryAnnotation
        DocumentIdAnnotation
        EntityMentionsAnnotation
        EventMentionsAnnotation
        GenderAnnotation
        RelationMentionsAnnotation
        TriggerAnnotation
      ],

      'nlp.parser.lexparser.ParserAnnotations' => %w[
        ConstraintAnnotation
      ],

      'nlp.trees.semgraph.SemanticGraphCoreAnnotations' => %w[
        BasicDependenciesAnnotation
        CollapsedCCProcessedDependenciesAnnotation
        CollapsedDependenciesAnnotation
      ],

      'nlp.time.TimeAnnotations' => %w[
        TimexAnnotation
        TimexAnnotations
      ],

      'nlp.time.TimeExpression' => %w[
        Annotation
        ChildrenAnnotation
      ],

      'nlp.trees.TreeCoreAnnotations' => %w[
        TreeHeadTagAnnotation
        TreeHeadWordAnnotation
        TreeAnnotation
      ]
    }

    # Inverted index of Annotations: annotation class name => array of
    # the Java class paths that list it, in the order the paths appear
    # in Annotations.
    AnnotationsByName =
      Annotations.each_with_object({}) do |(java_path, class_names), by_name|
        class_names.each do |class_name|
          (by_name[class_name] ||= []) << java_path
        end
      end
  end
end