#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :tokenized, :markov

    # Build a new model directly from a text file
    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    # Load a previously saved model from disk
    def self.load(path)
      data = Marshal.load(File.read(path))
      Model.new.deserialize(data)
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mentions"

      lines = text.split("\n")
      keeping = []

      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end

      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences of 140 characters or less"

      @sentences = NLP.sentences(text).reject do |s|
        s.length > 140 || s.count('"') % 2 != 0
      end

      log "Tokenizing #{@sentences.length} sentences"
      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }

      log "Building markov model (this may take a while)"
      @markov = MarkovModel.new.consume(@tokenized)

      self
    end

    # Produces a hash with the data needed to quickly
    # reconstruct this corpus object
    def serialize
      { 'hash' => @hash,
        'tokenized' => @tokenized,
        'tokensets' => @tokensets,
        'markov' => @markov.serialize }
    end

    def save(path)
      data = self.serialize
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(data))
      end
      self
    end

    def deserialize(data)
      @hash = data['hash']
      @tokenized = data['tokenized']
      @tokensets = data['tokensets']
      @markov = MarkovModel.new.deserialize(data['markov'])
      self
    end

    # Swap a randomly chosen noun in the sentence for a noun from the corpus,
    # preserving plurality and indefinite articles
    def replace_noun(sent)
      tagged = NLP.tagger.add_tags(sent)

      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
      return sent if to_replace.nil?

      replacement = NLP.nouns.sample
      if to_replace.en.plural.length <= to_replace.length
        replacement = replacement.en.plural(1)
      end
      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
    end

    # Clean up grammar and spelling, then decode any HTML entities
    def fix(tweet)
      # This seems to require an external api call
      begin
        fixer = NLP.gingerice.parse(tweet)
        log fixer if fixer['corrections']
        tweet = fixer['result']
      rescue Exception => e
        log e.message
        log e.backtrace
      end

      NLP.htmlentities.decode tweet
    end

    # Generate a statement from the markov model, rejecting candidates
    # that are too long or have unmatched quotes/brackets
    def markov_statement(limit=140, markov=nil)
      markov ||= @markov
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      inputset = NLP.tokenset(input)
      log "Input tokenset: #{inputset.to_a}"

      if inputset.empty?
        # Very uninteresting input; no relevant response possible
        return markov_statement(limit)
      end

      # Let's find all the sentences that might be relevant
      relevant = []
      @tokensets.each_with_index do |set, i|
        if inputset.intersection(set).length > 0
          relevant << @tokenized[i]
        end
      end

      log "Found #{relevant.length} relevant tokenset matches"
      if relevant.length < 3
        return markov_statement(limit)
      end

      markov = MarkovModel.new.consume(relevant.sample(100))
      markov_statement(limit, markov)
    end
  end
end
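
# Usage sketch (assumes the rest of the Ebooks gem -- NLP, MarkovModel and the
# `log` helper -- is loaded; the corpus and model paths below are placeholders):
#
#   model = Ebooks::Model.consume("corpus/source.txt")   # hash, tokenize and build the markov model
#   model.save("model/source.model")                     # cache the serialized model with Marshal
#
#   model = Ebooks::Model.load("model/source.model")
#   puts model.markov_statement(140)                     # random statement within the length limit
#   puts model.markov_response("how is the weather?")    # statement seeded by related corpus sentences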