require "rley"
require "engtagger"
require "pp"

# Regex that pulls [tag, word] pairs out of EngTagger's XML-like output,
# e.g. "<nnp>John</nnp>" yields the captures "nnp" and "John".
GET_TAG = /<(.+?)>(.*?)<.+?>/

# Lower-cased abbreviations whose trailing period must stay attached to the
# word when sentences are split. Built once instead of on every call to
# split_sentences. Duplicated entries are harmless because this is a lookup
# table.
SENTENCE_ABBREVIATIONS = [
  # people
  %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys supt det mssrs rev],
  # army
  %w[col gen lt cmdr adm capt sgt cpl maj brig],
  # institutions
  %w[dept univ assn bros ph.d],
  # places
  %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy hwy hway la pde pd plz pl rd st tce],
  # companies
  %w[mfg inc ltd co corp],
  # states / provinces
  %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill ind ia kans kan ken ky la me md is
     mass mich minn miss mo mont neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt va wash
     wis wisc wy wyo usafa alta man ont que sask yuk],
  # months
  %w[jan feb mar apr may jun jul aug sep sept oct nov dec],
  # miscellaneous
  %w[vs etc no esp]
].flatten.each_with_object({}) { |word, known| known[word] = true }.freeze

# Text tokenizer.
# Taken directly from EngTagger so that token indexes here line up with the
# tagger's own tokenization while parsing.
#
# @param text [String, nil] raw input text
# @return [Array<String>, false] the tokens, or false when there is nothing
#   to tokenize (nil or whitespace-only input)
def clean_text(text)
  return false unless valid_text(text)

  # NOTE(review): String#toutf8 comes from the kconv library, which this file
  # does not require itself - presumably engtagger loads it; verify.
  # Tokenize the text (splitting on punctuation as you go).
  tokenized = text.toutf8.split(/\s+/).flat_map { |chunk| split_punct(chunk) }
  split_sentences(tokenized)
end

# Tells whether there is anything to tokenize.
#
# The EngTagger original consulted @conf[:debug] here; that instance variable
# does not exist at the top level of this script, so the lookup raised
# NoMethodError for nil input. The (no-op) debug string has been dropped.
#
# @param text [String, nil]
# @return [Boolean] false for nil/false or whitespace-only text, true otherwise
def valid_text(text)
  return false unless text                  # nothing to parse
  return false if /\A\s*\z/ =~ text         # empty or whitespace-only string

  true
end

# Splits sentence-ending periods off the tokens they are glued to.
#
# A trailing period is kept on a token when the token (minus the period):
#   1. is one of SENTENCE_ABBREVIATIONS,
#   2. is only one letter long: Alfred E. Sloan,
#   3. has a repeating letter-dot: U.S.A. or J.C. Penney.
#
# @param array [Array<String>] tokens produced by split_punct
# @return [Array<String>] tokens with sentence-final periods as separate items
def split_sentences(array)
  words = []

  array.each_with_index do |token, i|
    following = array[i + 1]
    # Only consider "word." tokens that are followed by a token containing an
    # upper-case letter or a non-word character.
    if following && following =~ /[A-Z\W]/ && token =~ /\A(.+)\.\z/
      stem = Regexp.last_match(1)
      keep_period = SENTENCE_ABBREVIATIONS[stem.downcase] ||
                    stem =~ /\A[a-z]\z/i ||
                    stem =~ /[a-z](?:\.[a-z])+\z/i
      unless keep_period
        words << stem << "."
        next
      end
    end
    words << token
  end

  # If the final word ends in a period, always split it off.
  if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
    words[-1] = Regexp.last_match(1)
    words.push(".")
  end

  words
end

# Separates punctuation from words, where appropriate. This leaves trailing
# periods in place to be dealt with later. Called by the clean_text method.
#
# @param text [String] one whitespace-free chunk of the input
# @return [Array<String>] the chunk split into word and punctuation tokens
def split_punct(text)
  # If there's no punctuation, return immediately
  return [text] if /\A\w+\z/ =~ text

  # Sanity check: collapse long runs of non-word characters
  text = text.gsub(/\W{10,}/, " ")

  # Put quotes into a standard format
  text = text.gsub(/`(?!`)(?=.*\w)/, "` ")     # Shift left quotes off text
  text = text.gsub(/"(?=.*\w)/, " `` ")        # Convert left quotes to ``
  # Convert left single quotes to `; the captured prefix (possibly empty) is kept
  text = text.gsub(/(\W|^)'(?=.*\w)/) { "#{Regexp.last_match(1)} ` " }
  text = text.gsub(/"/, " '' ")                # Convert (remaining) quotes to ''
  # Separate right single quotes
  text = text.gsub(/(\w)'(?!')(?=\W|$)/) { "#{Regexp.last_match(1)} ' " }

  # Handle all other punctuation
  text = text.gsub(/--+/, " - ")               # Convert and separate dashes
  text = text.gsub(/,(?!\d)/, " , ")           # Shift commas off everything but numbers
  text = text.gsub(/:/, " :")                  # Shift colons off (semicolons are handled below)
  text = text.gsub(/(\.\.\.+)/) { " #{Regexp.last_match(1)} " }           # Shift ellipses off
  text = text.gsub(/([\(\[\{\}\]\)])/) { " #{Regexp.last_match(1)} " }    # Shift off brackets
  # Shift off other ``standard'' punctuation
  text = text.gsub(/([\!\?#\$%;~|])/) { " #{Regexp.last_match(1)} " }

  # English-specific contractions
  # Separate off 'd 'm 's
  text = text.gsub(/([A-Za-z])'([dms])\b/) { "#{Regexp.last_match(1)} '#{Regexp.last_match(2)}" }
  text = text.gsub(/n't\b/, " n't")            # Separate off n't
  text = text.gsub(/'(ve|ll|re)\b/) { " '#{Regexp.last_match(1)}" }       # Separate off 've, 'll, 're

  text.split(' ')
end

# Converts the tokenized text into a sequence of Rley token objects.
#
# @param lexicon [Array<String>] tokens from clean_text
# @param grammar [Object] the built grammar; must respond to name2symbol
# @param tokens [Array<Array(String, String)>] [word, POS] pairs, index-aligned
#   with lexicon (extra trailing pairs are ignored)
# @return [Array<Rley::Tokens::Token>]
# @raise [ArgumentError] when a lexicon entry has no matching POS pair
def tokenizer(lexicon, grammar, tokens)
  lexicon.each_with_index.map do |word, i|
    tagged_pair = tokens[i]
    unless tagged_pair
      raise ArgumentError,
            "no POS tag available for token ##{i} (#{word.inspect}): " \
            "got #{tokens.size} tags for #{lexicon.size} tokens"
    end

    # NOTE(review): name2symbol yields nil for a tag the grammar does not
    # declare as a terminal; how the parser treats such a token is not
    # visible here - confirm.
    terminal = grammar.name2symbol[tagged_pair.last]
    Rley::Tokens::Token.new(word, terminal)
  end
end

# Instantiate a builder object that will build the grammar for us
builder = Rley::Syntax::GrammarBuilder.new do
  add_terminals('NN', 'NNP')
  add_terminals('DET', 'IN', 'VBD')

  # Here we define the productions (= grammar rules)
  rule 'S' => %w[NP VP]
  rule 'NP' => 'NNP'
  rule 'NP' => %w[DET NN]
  rule 'NP' => %w[DET NN PP]
  rule 'VP' => %w[VBD NP]
  rule 'VP' => %w[VBD NP PP]
  rule 'PP' => %w[IN NP]
end

# And now, let's build the grammar...
grammar = builder.grammar
parser = Rley::Parser::GFGEarleyParser.new(grammar)

# text = "Yo I'm not done with you"
text = "John saw Mary with a telescope"
pp "Input text --> #{text}"

tgr = EngTagger.new

# Generate POS-tagged text
tagged = tgr.add_tags(text)

# Generate the tokenized lexicon of the input text.
# Instead of creating a lexicon dictionary, we simply generate one on the fly
# each time, for the current text only.
lexicon = clean_text(text)

# Generate POS tokens in [[word, pos], ..] format
tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }

# Convert input text into a sequence of rley token objects...
rley_tokens = tokenizer(lexicon, grammar, tokens)

result = parser.parse(rley_tokens)

pp "Parsing successful? #{result.success?}" # => Parsing successful? true

if result.success?
  ptree = result.parse_tree

  visitor = Rley::ParseTreeVisitor.new(ptree)

  renderer = Rley::Formatter::Asciitree.new($stdout)

  # Subscribe the formatter to the visitor's event and launch the visit
  # NOTE(review): the formatter is handed $stdout, so pp presumably only adds
  # render's return value to the output - confirm whether pp is wanted.
  pp renderer.render(visitor)
else
  # Only report the failure; the original went on to build and render a
  # parse tree from the failed parse.
  pp result.failure_reason.message
end