#! /local/ruby/bin/ruby

$LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
require 'rubygems'
require 'kconv'
require 'porter'

# Use Hpricot, if available, for extracting English text from docs with XML-like tags
begin
  require 'hpricot'
rescue LoadError
  $no_hpricot = true
end

# File paths
$lexpath = File.join(File.dirname(__FILE__), 'engtagger')
$word_path = File.join($lexpath, "pos_words.hash")
$tag_path = File.join($lexpath, "pos_tags.hash")

# For memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
class Module
  def memoize(method)
    # alias_method is faster than define_method + old.bind(self).call
    alias_method "__memoized__#{method}", method
    module_eval <<-EOF
      def #{method}(*a, &b)
        # assumes the block won't change the result if the args are the same
        (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
      end
    EOF
  end
end

# English part-of-speech tagger class
class EngTagger

  VERSION = '0.1.1'

  #################
  # Class methods #
  #################

  # Return a class variable that holds probability data
  def self.hmm
    return @@hmm
  end

  # Return a class variable that holds lexical data
  def self.lexicon
    return @@lexicon
  end

  # Return a regexp from a string argument that matches an XML-style pos tag
  def self.get_ext(tag = nil)
    return nil unless tag
    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
  end

  # Regexps to match XML-style part-of-speech tags
  NUM   = get_ext('cd')
  GER   = get_ext('vbg')
  ADJ   = get_ext('jj[rs]*')
  PART  = get_ext('vbn')
  NN    = get_ext('nn[sp]*')
  NNP   = get_ext('nnp')
  PREP  = get_ext('in')
  DET   = get_ext('det')
  PAREN = get_ext('[lr]rb')
  QUOT  = get_ext('ppr')
  SEN   = get_ext('pp')
  WORD  = get_ext('\w+')

  # Convert a Treebank-style, abbreviated tag into verbose definitions
  def self.explain_tag(tag)
    if TAGS[tag]
      return TAGS[tag]
    else
      return tag
    end
  end

  # The following is to make a hash to convert a pos tag to its definition,
  # used by the explain_tag method
  tags = [
    "CC",   "Conjunction, coordinating",
    "CD",   "Adjective, cardinal number",
    "DET",  "Determiner",
    "EX",   "Pronoun, existential there",
    "FW",   "Foreign words",
    "IN",   "Preposition / Conjunction",
    "JJ",   "Adjective",
    "JJR",  "Adjective, comparative",
    "JJS",  "Adjective, superlative",
    "LS",   "Symbol, list item",
    "MD",   "Verb, modal",
    "NN",   "Noun",
    "NNP",  "Noun, proper",
    "NNPS", "Noun, proper, plural",
    "NNS",  "Noun, plural",
    "PDT",  "Determiner, prequalifier",
    "POS",  "Possessive",
    "PRP",  "Determiner, possessive second",
    "PRPS", "Determiner, possessive",
    "RB",   "Adverb",
    "RBR",  "Adverb, comparative",
    "RBS",  "Adverb, superlative",
    "RP",   "Adverb, particle",
    "SYM",  "Symbol",
    "TO",   "Preposition",
    "UH",   "Interjection",
    "VB",   "Verb, infinitive",
    "VBD",  "Verb, past tense",
    "VBG",  "Verb, gerund",
    "VBN",  "Verb, past/passive participle",
    "VBP",  "Verb, base present form",
    "VBZ",  "Verb, present 3SG -s form",
    "WDT",  "Determiner, question",
    "WP",   "Pronoun, question",
    "WPS",  "Determiner, possessive & question",
    "WRB",  "Adverb, question",
    "PP",   "Punctuation, sentence ender",
    "PPC",  "Punctuation, comma",
    "PPD",  "Punctuation, dollar sign",
    "PPL",  "Punctuation, quotation mark left",
    "PPR",  "Punctuation, quotation mark right",
    "PPS",  "Punctuation, colon, semicolon, ellipsis",
    "LRB",  "Punctuation, left bracket",
    "RRB",  "Punctuation, right bracket"
  ]
  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
  TAGS = Hash[*tags]
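
  # For illustration, a sketch of how the TAGS lookup behaves (values derived
  # from the table above; "xyz" is a made-up tag):
  #
  #   EngTagger.explain_tag("nnp")  #=> "noun_proper"
  #   EngTagger.explain_tag("vbz")  #=> "verb_present_3sg_s_form"
  #   EngTagger.explain_tag("xyz")  #=> "xyz" (unknown tags pass through unchanged)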

  # Hash storing config values:
  #
  # * :unknown_word_tag
  #      => (String) Tag to assign to unknown words
  # * :stem
  #      => (Boolean) Stem single words using the Porter module
  # * :weight_noun_phrases
  #      => (Boolean) When returning occurrence counts for a noun phrase,
  #      multiply the value by the number of words in the NP.
  # * :longest_noun_phrase
  #      => (Integer) Ignore noun phrases longer than this threshold. This
  #      affects only the get_words() and get_nouns() methods.
  # * :relax
  #      => (Boolean) Relax the Hidden Markov Model: this may improve accuracy
  #      for uncommon words, particularly words used polysemously
  # * :tag_lex
  #      => (String) Name of the YAML file containing a hash of adjacent part-of-speech
  #      tags and the probability of each
  # * :word_lex
  #      => (String) Name of the YAML file containing a hash of words and
  #      corresponding parts of speech
  # * :unknown_lex
  #      => (String) Name of the YAML file containing a hash of tags for unknown
  #      words and corresponding parts of speech
  # * :tag_path
  #      => (String) Directory path of tag_lex
  # * :word_path
  #      => (String) Directory path of word_lex and unknown_lex
  # * :debug
  #      => (Boolean) Print debug messages
  attr_accessor :conf

  ###############
  # Constructor #
  ###############

  # Take a hash of parameters that override default values.
  # See above for details.
  def initialize(params = {})
    @conf = Hash.new
    @conf[:unknown_word_tag] = ''
    @conf[:stem] = false
    @conf[:weight_noun_phrases] = false
    @conf[:longest_noun_phrase] = 5
    @conf[:relax] = false
    @conf[:tag_lex] = 'tags.yml'
    @conf[:word_lex] = 'words.yml'
    @conf[:unknown_lex] = 'unknown.yml'
    @conf[:word_path] = $word_path
    @conf[:tag_path] = $tag_path
    @conf[:debug] = false
    # assuming that we start analyzing from the beginning of a new sentence...
    @conf[:current_tag] = 'pp'
    @conf.merge!(params) if params
    unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
      print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
      @@hmm = Hash.new
      @@lexicon = Hash.new
    else
      lexf = File.open(@conf[:word_path], 'r')
      @@lexicon = Marshal.load(lexf)
      lexf.close
      hmmf = File.open(@conf[:tag_path], 'r')
      @@hmm = Marshal.load(hmmf)
      hmmf.close
    end
    @@mnp = get_max_noun_regex
  end

  ##################
  # Public methods #
  ##################

  # Examine the string provided and return it fully tagged in XML style
  def add_tags(text, verbose = false)
    return nil unless valid_text(text)
    tagged = []
    words = clean_text(text)
    words.each do |word|
      cleaned_word = clean_word(word)
      tag = assign_tag(@conf[:current_tag], cleaned_word)
      @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
      tag = EngTagger.explain_tag(tag) if verbose
      tagged << '<' + tag + '>' + word + '</' + tag + '>'
    end
    reset
    return tagged.join(' ')
  end

  # Given a text string, return as many nouns and noun phrases as possible.
  # Applies add_tags and involves three stages:
  #
  # * Tag the text
  # * Extract all the maximal noun phrases
  # * Recursively extract all noun phrases from the MNPs
  #
  def get_words(text)
    return false unless valid_text(text)
    tagged = add_tags(text)
    if(@conf[:longest_noun_phrase] <= 1)
      return get_nouns(tagged)
    else
      return get_noun_phrases(tagged)
    end
  end

  # Return an easy-on-the-eyes tagged version of a text string.
  # Applies add_tags and reformats to be easier to read.
  def get_readable(text, verbose = false)
    return nil unless valid_text(text)
    tagged = add_tags(text, verbose)
    tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
      $1 + '/' + $2.upcase
    end
    return tagged
  end
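
  # A minimal usage sketch (the tag output shown is inferred from the methods
  # above; actual tags depend on the installed lexicon):
  #
  #   tgr = EngTagger.new
  #   tgr.add_tags("The dog runs.")
  #   #=> "<det>The</det> <nn>dog</nn> <vbz>runs</vbz> <pp>.</pp>"
  #   tgr.get_readable("The dog runs.")
  #   #=> "The/DET dog/NN runs/VBZ ./PP"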

  # Return an array of sentences (without POS tags) from a text.
  def get_sentences(text)
    return nil unless valid_text(text)
    tagged = add_tags(text)
    sentences = Array.new
    tagged.split(/<\/pp>/).each do |line|
      sentences << strip_tags(line)
    end
    sentences = sentences.map do |sentence|
      sentence = sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
      sentence = sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
      sentence = sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
      sentence = sentence.gsub(Regexp.new(" (\W+)$")){$1}
      sentence = sentence.gsub(Regexp.new("^(`+) ")){$1}
      sentence
    end
    return sentences
  end

  # Given a POS-tagged text, this method returns a hash of all proper nouns
  # and their occurrence frequencies. The method is greedy and will
  # return multi-word phrases, if possible, so it would find ``Linguistic
  # Data Consortium'' as a single unit, rather than as three individual
  # proper nouns. This method does not stem the found words.
  def get_proper_nouns(tagged)
    return nil unless valid_text(tagged)
    trimmed = tagged.scan(NNP).map do |n|
      strip_tags(n)
    end
    nnp = Hash.new(0)
    trimmed.each do |n|
      next unless n.length < 100 # sanity check on word length
      nnp[n] += 1 unless n =~ /\A\s*\z/
    end
    # Now for some fancy resolution stuff...
    nnp.keys.each do |key|
      words = key.split(/\s/)
      # Let's say this is an organization's name --
      # (and it's got at least three words)
      # is there a corresponding acronym in this hash?
      if words.length > 2
        # Make a (naive) acronym out of this name
        acronym = words.map do |word|
          /\A([a-z])[a-z]*\z/i =~ word
          $1
        end.join('').upcase
        # If that acronym has been seen,
        # remove it and add the values to the full name
        if nnp.key?(acronym)
          nnp[key] += nnp[acronym]
          nnp.delete(acronym)
        end
      end
    end
    return nnp
  end

  # Given a POS-tagged text, this method returns all nouns and their
  # occurrence frequencies.
  def get_nouns(tagged)
    return nil unless valid_text(tagged)
    trimmed = tagged.scan(NN).map do |n|
      strip_tags(n)
    end
    ret = Hash.new(0)
    trimmed.each do |n|
      n = stem(n)
      next unless n.length < 100 # sanity check on word length
      ret[n] += 1 unless n =~ /\A\s*\z/
    end
    return ret
  end

  # Given a POS-tagged text, this method returns only the maximal noun phrases.
  # May be called directly, but is also used by get_noun_phrases
  def get_max_noun_phrases(tagged)
    return unless valid_text(tagged)
    mn_phrases = tagged.scan(@@mnp).map do |m|
      strip_tags(m)
    end
    ret = Hash.new(0)
    mn_phrases.each do |p|
      p = stem(p) unless p =~ /\s/ # stem single words
      ret[p] += 1 unless p =~ /\A\s*\z/
    end
    return ret
  end
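
  # An illustrative sketch of the frequency hashes returned above (the input
  # sentence and resulting counts are hypothetical and depend on the lexicon):
  #
  #   tgr = EngTagger.new
  #   tagged = tgr.add_tags("The Linguistic Data Consortium distributes corpora.")
  #   tgr.get_proper_nouns(tagged)  #=> {"Linguistic Data Consortium" => 1}
  #   tgr.get_nouns(tagged)         #=> {"corpora" => 1, ...}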

  # Similar to get_words, but requires a POS-tagged text as an argument.
  def get_noun_phrases(tagged)
    return nil unless valid_text(tagged)
    found = Hash.new(0)
    phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
    # Find MNPs in the text, one sentence at a time
    scanned = tagged.scan(@@mnp)
    # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
    mn_phrases = []
    scanned.each do |m|
      found[m] += 1 if phrase_ext =~ m
      mn_phrases += m.split(phrase_ext)
    end
    mn_phrases.each do |mnp|
      # Split the phrase into an array of words, and loop once for each word,
      # shortening the phrase by removing the word in the first position.
      # Record the phrase and any single nouns that are found
      words = mnp.split
      words.length.times do |i|
        found[words.join(' ')] += 1 if words.length > 1
        w = words.shift
        found[w] += 1 if w =~ /#{NN}/
      end
    end
    ret = Hash.new(0)
    found.keys.each do |f|
      k = strip_tags(f)
      v = found[f]
      # We weight by the word count to favor long noun phrases
      space_count = k.scan(/\s+/)
      word_count = space_count.length + 1
      # Throttle MNPs if necessary
      next if word_count > @conf[:longest_noun_phrase]
      k = stem(k) unless word_count > 1 # stem single words
      multiplier = 1
      multiplier = word_count if @conf[:weight_noun_phrases]
      ret[k] += multiplier * v
    end
    return ret
  end

  # Reads some included corpus data and saves it in a stored hash on the
  # local file system. This is called automatically if the tagger can't
  # find the stored lexicon.
  def install
    puts "Creating part-of-speech lexicon" if @conf[:debug]
    load_tags(@conf[:tag_lex])
    load_words(@conf[:word_lex])
    load_words(@conf[:unknown_lex])
    File.open(@conf[:word_path], 'w') do |f|
      Marshal.dump(@@lexicon, f)
    end
    File.open(@conf[:tag_path], 'w') do |f|
      Marshal.dump(@@hmm, f)
    end
  end

  ###################
  # Private methods #
  ###################

  private

  # Downcase the first letter of word
  def lcfirst(word)
    word.split(//)[0].downcase + word.split(//)[1..-1].join
  end

  # Upcase the first letter of word
  def ucfirst(word)
    word.split(//)[0].upcase + word.split(//)[1..-1].join
  end

  # Return the word stem as given by the Stemmable module. This can be
  # turned off with the class parameter @conf[:stem] => false.
  def stem(word)
    return word unless @conf[:stem]
    return word.stem
  end

  # This method will reset the preceding tag to a sentence ender (PP).
  # This prepares the first word of a new sentence to be tagged correctly.
  def reset
    @conf[:current_tag] = 'pp'
  end

  # Check whether the text is a valid string
  def valid_text(text)
    if !text
      # there's nothing to parse
      print "method call on uninitialized variable" if @conf[:debug]
      return false
    elsif /\A\s*\z/ =~ text
      # text is an empty string, nothing to parse
      return false
    else
      # the text is valid
      return true
    end
  end

  # Return a text string with the part-of-speech tags removed
  def strip_tags(tagged, downcase = false)
    return nil unless valid_text(tagged)
    text = tagged.gsub(/<[^>]+>/m, "")
    text = text.gsub(/\s+/m, " ")
    text = text.gsub(/\A\s*/, "")
    text = text.gsub(/\s*\z/, "")
    if downcase
      return text.downcase
    else
      return text
    end
  end

  # Strip the provided text of HTML-style tags and separate off any punctuation
  # in preparation for tagging
  def clean_text(text)
    return false unless valid_text(text)
    text = text.toutf8
    unless $no_hpricot
      # Strip out any markup and convert entities to their proper form
      cleaned_text = Hpricot(text).inner_text
    else
      cleaned_text = text
    end
    tokenized = []
    # Tokenize the text (splitting on punctuation as you go)
    cleaned_text.split(/\s+/).each do |line|
      tokenized += split_punct(line)
    end
    words = split_sentences(tokenized)
    return words
  end
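
  # For illustration, the tokenization performed by clean_text on a small
  # input, traced from the rules above (assumes Hpricot is available to strip
  # the markup; clean_text is private, so call via send if experimenting):
  #
  #   tgr.send(:clean_text, "<b>Mr. Smith</b> arrived, didn't he?")
  #   #=> ["Mr.", "Smith", "arrived", ",", "did", "n't", "he", "?"]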

  # This handles all of the trailing periods, keeping those that
  # belong on abbreviations and removing those that seem to be
  # at the end of sentences. This method makes some assumptions
  # about the use of capitalization in the incoming text
  def split_sentences(array)
    tokenized = array
    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys supt
                det mssrs rev)
    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
    inst = %w(dept univ assn bros ph.d)
    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
               hwy hway la pde pd plz pl rd st tce)
    comp = %w(mfg inc ltd co corp)
    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
    misc = %w(vs etc no esp)
    abbr = Hash.new
    [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
      abbr[i] = true
    end
    words = Array.new
    tokenized.each_with_index do |t, i|
      if tokenized[i + 1] and tokenized[i + 1] =~ /[A-Z\W]/ and
         tokenized[i] =~ /\A(.+)\.\z/
        w = $1
        # Don't separate the period off words that
        # meet any of the following conditions:
        #
        # 1. It is defined in one of the lists above
        # 2. It is only one letter long: Alfred E. Sloan
        # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
        unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
          words << w
          words << '.'
          next
        end
      end
      words << tokenized[i]
    end
    # If the final word ends in a period...
    if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
      words[-1] = $1
      words.push '.'
    end
    return words
  end

  # Separate punctuation from words, where appropriate. This leaves trailing
  # periods in place to be dealt with later. Called by the clean_text method.
  def split_punct(text)
    # If there's no punctuation, return immediately
    return [text] if /\A\w+\z/ =~ text

    # Sanity checks
    text = text.gsub(/\W{10,}/o, " ")

    # Put quotes into a standard format
    text = text.gsub(/`(?!`)(?=.*\w)/o, "` ")             # Shift left quotes off text
    text = text.gsub(/"(?=.*\w)/o, " `` ")                # Convert left quotes to ``
    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
    text = text.gsub(/"/, " '' ")                         # Convert (remaining) quotes to ''
    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "}   # Separate right single quotes

    # Handle all other punctuation
    text = text.gsub(/--+/o, " - ")                       # Convert and separate dashes
    text = text.gsub(/,(?!\d)/o, " , ")                   # Shift commas off everything but numbers
    text = text.gsub(/:/o, " :")                          # Shift colons off
    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "}        # Shift ellipses off
    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
    text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "}  # Shift off other ``standard'' punctuation

    # English-specific contractions
    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
    text = text.gsub(/n't\b/o, " n't")                    # Separate off n't
    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}         # Separate off 've, 'll, 're

    result = text.split(' ')
    return result
  end
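
  # A few illustrative inputs and outputs for split_punct, traced from the
  # rules above (it is private, so call via send if experimenting):
  #
  #   tgr.send(:split_punct, "didn't")   #=> ["did", "n't"]
  #   tgr.send(:split_punct, "(hello)")  #=> ["(", "hello", ")"]
  #   tgr.send(:split_punct, "U.S.A.")   #=> ["U.S.A."]  (trailing period handled later)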

  # Given a preceding tag, assign a tag to the current word. Called by the
  # add_tags method. This method is a modified version of the Viterbi
  # algorithm for part-of-speech tagging.
  def assign_tag(prev_tag, word)
    if word == "-unknown-"
      # classify unknown words accordingly
      return @conf[:unknown_word_tag]
    elsif word == "-sym-"
      # If this is a symbol, tag it as a symbol
      return "sym"
    end
    best_so_far = 0
    w = @@lexicon[word]
    t = @@hmm

    # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
    # which is used in most POS taggers
    best_tag = ""
    t[prev_tag].keys.each do |tag|
      # With @conf[:relax] set, this method
      # will also include any `open classes' of POS tags
      pw = 0
      if w[tag]
        pw = w[tag]
      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
        pw = 0
      else
        next
      end

      # Bayesian logic:
      # P = P( tag | prev_tag ) * P( tag | word )
      probability = t[prev_tag][tag] * (pw + 1)
      # Set the tag with maximal probability
      if probability > best_so_far
        best_so_far = probability
        best_tag = tag
      end
    end
    return best_tag
  end

  # This method determines whether a word should be considered in its
  # lower or upper case form. This is useful in considering proper nouns
  # and words that begin sentences. Called by add_tags.
  def clean_word(word)
    lcf = lcfirst(word)
    if @@lexicon[word]
      # seen this word as it appears (lower or upper case)
      return word
    elsif @@lexicon[lcf]
      # seen this word only as lower case
      return lcf
    else
      # never seen this word. guess.
      return classify_unknown_word(word)
    end
  end

  # This changes any word not appearing in the lexicon to identifiable
  # classes of words handled by a simple unknown word classification
  # metric. Called by the clean_word method.
  def classify_unknown_word(word)
    if /[\(\{\[]/ =~ word               # Left brackets
      classified = "*LRB*"
    elsif /[\)\}\]]/ =~ word            # Right brackets
      classified = "*RRB*"
    elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating-point number
      classified = "*NUM*"
    elsif /\A\d+[\d\/:-]+\d\z/ =~ word  # Other number constructs
      classified = "*NUM*"
    elsif /\A-?\d+\w+\z/o =~ word       # Ordinal number
      classified = "*ORD*"
    elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
      classified = "-abr-"
    elsif /\w-\w/o =~ word              # Hyphenated word
      /-([^-]+)\z/ =~ word
      h_suffix = $1
      if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
        # last part of this is defined as an adjective
        classified = "-hyp-adj-"
      else
        # last part of this is not defined as an adjective
        classified = "-hyp-"
      end
    elsif /\A\W+\z/o =~ word
      classified = "-sym-"              # Symbol
    elsif word == ucfirst(word)
      classified = "-cap-"              # Capitalized word
    elsif /ing\z/o =~ word
      classified = "-ing-"              # Ends in 'ing'
    elsif /s\z/o =~ word
      classified = "-s-"                # Ends in 's'
    elsif /tion\z/o =~ word
      classified = "-tion-"             # Ends in 'tion'
    elsif /ly\z/o =~ word
      classified = "-ly-"               # Ends in 'ly'
    elsif /ed\z/o =~ word
      classified = "-ed-"               # Ends in 'ed'
    else
      classified = "-unknown-"          # Completely unknown
    end
    return classified
  end

  # This returns a compiled regexp for extracting maximal noun phrases
  # from a POS-tagged text.
  def get_max_noun_regex
    regex = /
      # Optional number, gerund - adjective - participle
      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
      # Followed by one or more nouns
      (?:#{NN})+
      (?:
        # Optional preposition, determiner, cardinal
        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
        # Optional gerund - adjective - participle
        (?:#{GER}|#{ADJ}|#{PART})*
        # One or more nouns
        (?:#{NN})+
      )*
    /xo
    return regex
  end
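
  # Some illustrative classifications, traced from the rules above
  # (classify_unknown_word is private, so call via send if experimenting):
  #
  #   tgr.send(:classify_unknown_word, "12.5")     #=> "*NUM*"
  #   tgr.send(:classify_unknown_word, "NASA")     #=> "-abr-"
  #   tgr.send(:classify_unknown_word, "worldly")  #=> "-ly-"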

  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
  # YAML data parser. It will load a YAML document with a collection of key:
  # value entries ( {pos tag}: {probability} ) mapped onto single keys
  # ( {tag} ). Each map is expected to be on a single line; i.e.,
  # det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
  def load_tags(lexicon)
    path = File.join($lexpath, lexicon)
    fh = File.open(path, 'r')
    while line = fh.gets
      /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
      next unless $1 and $2
      key, data = $1, $2
      items = data.split(/,\s+/)
      pairs = {}
      items.each do |i|
        /([^:]+):\s*(.+)/ =~ i
        pairs[$1] = $2.to_f
      end
      @@hmm[key] = pairs
    end
    fh.close
  end

  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
  # YAML data parser. It will load a YAML document with a collection of key:
  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
  # Each map is expected to be on a single line; i.e.,
  # key: { jj: 103, nn: 34, vb: 1 }
  def load_words(lexicon)
    path = File.join($lexpath, lexicon)
    fh = File.open(path, 'r')
    while line = fh.gets
      /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
      next unless $1 and $2
      key, data = $1, $2
      items = data.split(/,\s+/)
      pairs = {}
      items.each do |i|
        /([^:]+):\s*(.+)/ =~ i
        pairs[$1] = $2.to_f
      end
      @@lexicon[key] = pairs
    end
    fh.close
  end

  # Memoize the stem and assign_tag methods
  memoize("stem")
  memoize("assign_tag")
end
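
# A minimal end-to-end sketch, run only when this file is executed directly.
# It assumes the marshaled lexicon files exist (run #install once otherwise);
# the sample sentence and the tags shown are illustrative, not guaranteed.
if __FILE__ == $0
  tgr = EngTagger.new
  sample = "The Linguistic Data Consortium distributes tagged corpora."
  puts tgr.get_readable(sample)  # e.g. "The/DET Linguistic/NNP Data/NNP ..."
end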