lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.3 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.4
- removed (2.0.3 only)
+ added (2.0.4 only)
@@ -5,101 +5,73 @@
require 'set'
require 'digest/md5'
module Ebooks
class Model
- attr_accessor :hash, :sentences, :tokenized, :markov
+ attr_accessor :hash, :sentences, :markov, :keywords
def self.consume(txtpath)
Model.new.consume(txtpath)
end
def self.load(path)
- data = Marshal.load(File.read(path))
- Model.new.deserialize(data)
+ Marshal.load(File.read(path))
end
def consume(txtpath)
# Record hash of source file so we know to update later
@hash = Digest::MD5.hexdigest(File.read(txtpath))
text = File.read(txtpath)
- log "Removing commented lines and mentions"
+ log "Removing commented lines and mention tokens"
lines = text.split("\n")
keeping = []
lines.each do |l|
next if l.start_with?('#') || l.include?('RT')
processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
keeping << processed.join(' ')
end
text = NLP.normalize(keeping.join("\n"))
- log "Segmenting text into sentences of 140 characters or less"
- @sentences = NLP.sentences(text).reject do |s|
- s.length > 140 || s.count('"')%2 != 0
- end
+ log "Segmenting text into sentences"
- log "Tokenizing #{@sentences.length} sentences"
- @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
- @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+ sentences = NLP.sentences(text)
- log "Building markov model (this may take a while)"
- @markov = MarkovModel.new.consume(@tokenized)
+ log "Tokenizing #{sentences.length} sentences"
+ @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+ log "Building markov model"
+ @markov = MarkovModel.build(@sentences)
+
+ log "Ranking keywords"
+ require 'benchmark'
+ puts Benchmark.measure {
+ @keywords = NLP.keywords(@sentences)
+ p @keywords.top(100)
+ }
+
self
end
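
A minimal sketch of what the 2.0.4 consume pipeline now yields, assuming a local corpus file (the path and sample values are illustrative; NLP.keywords appears to return a ranked keyword set, given the .top(100) call above):

    model = Ebooks::Model.consume('corpus.txt')
    model.sentences.first   # now an array of tokens, e.g. ["hello", "world", "!"],
                            # rather than the <=140-char strings kept in 2.0.3
    model.keywords.top(10)  # keywords ranked while consuming the corpus
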
- # Produces a hash with the data needed to quickly
- # reconstruct this corpus object
- def serialize
- return { 'hash' => @hash,
- 'tokenized' => @tokenized,
- 'tokensets' => @tokensets,
- 'markov' => @markov.serialize }
- end
-
def save(path)
- data = self.serialize
File.open(path, 'w') do |f|
- f.write(Marshal.dump(data))
+ f.write(Marshal.dump(self))
end
self
end
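
With the explicit serialize/deserialize pair dropped (the removed deserialize follows just below), persistence is a straight Marshal round-trip of the whole object; a rough usage sketch (file names are illustrative):

    model = Ebooks::Model.consume('corpus.txt')
    model.save('corpus.model')                  # Marshal.dump(self)
    model = Ebooks::Model.load('corpus.model')  # returns the Model itself

One consequence: files written by 2.0.3 hold a marshalled Hash from serialize, not a Model, so existing .model files presumably need regenerating under 2.0.4.
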
- def deserialize(data)
- @hash = data['hash']
- @tokenized = data['tokenized']
- @tokensets = data['tokensets']
- @markov = MarkovModel.new.deserialize(data['markov'])
- self
- end
-
- def replace_noun(sent)
- tagged = NLP.tagger.add_tags(sent)
-
- nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
- to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
- return sent if to_replace.nil?
- replacement = NLP.nouns.sample
- if to_replace.en.plural.length <= to_replace.length
- replacement = replacement.en.plural(1)
- end
- sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
- sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
- end
-
def fix(tweet)
# This seems to require an external api call
- begin
- fixer = NLP.gingerice.parse(tweet)
- log fixer if fixer['corrections']
- tweet = fixer['result']
- rescue Exception => e
- log e.message
- log e.backtrace
- end
+ #begin
+ # fixer = NLP.gingerice.parse(tweet)
+ # log fixer if fixer['corrections']
+ # tweet = fixer['result']
+ #rescue Exception => e
+ # log e.message
+ # log e.backtrace
+ #end
NLP.htmlentities.decode tweet
end
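
With the Gingerice call commented out, fix reduces to HTML entity decoding; a small illustration, assuming NLP.htmlentities wraps the htmlentities gem's decoder:

    model.fix("I&#39;m a bot &amp; proud of it")
    # => "I'm a bot & proud of it"
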
def markov_statement(limit=140, markov=nil)
@@ -113,35 +85,46 @@
end
fix tweet
end
- # Generates a response by looking for related sentences
- # in the corpus and building a smaller markov model from these
- def markov_response(input, limit=140)
- inputset = NLP.tokenset(input)
- log "Input tokenset: #{inputset.to_a}"
+ # Finds all relevant tokenized sentences to given input by
+ # comparing non-stopword token overlaps
+ def relevant_sentences(input)
+ relevant = []
+ slightly_relevant = []
- if inputset.empty?
- # Very uninteresting input; no relevant response possible
- return markov_statement(limit)
- end
+ tokenized = NLP.tokenize(input)
- # Let's find all the sentences that might be relevant
- relevant = []
- @tokensets.each_with_index.map do |set, i|
- if inputset.intersection(set).length > 0
- relevant << @tokenized[i]
+ @sentences.each do |sent|
+ tokenized.each do |token|
+ if sent.include?(token)
+ relevant << sent unless NLP.stopword?(token)
+ slightly_relevant << sent
+ end
end
end
- log "Found #{relevant.length} relevant tokenset matches"
+ [relevant, slightly_relevant]
+ end
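
A rough illustration of the two relevance tiers, assuming NLP.tokenize and NLP.stopword? behave as their names suggest (the exact behaviour lives in NLP):

    relevant, slightly_relevant = model.relevant_sentences("do you like robots?")
    # relevant:          sentences sharing a non-stopword token such as "robots"
    # slightly_relevant: sentences sharing any token at all, e.g. "do" or "you"

Note that a sentence matching a non-stopword lands in both lists, and can appear more than once if it matches several input tokens.
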
- if relevant.length < 3
- return markov_statement(limit)
- end
+ # Generates a response by looking for related sentences
+ # in the corpus and building a smaller markov model from these
+ def markov_response(input, limit=140)
+ # First try
+ relevant, slightly_relevant = relevant_sentences(input)
- markov = MarkovModel.new.consume(relevant.sample(100))
- markov_statement(limit, markov)
+ p relevant
+ p slightly_relevant.length
+
+ if relevant.length >= 3
+ markov = MarkovModel.new.consume(relevant)
+ markov_statement(limit, markov)
+ elsif slightly_relevant.length > 5
+ markov = MarkovModel.new.consume(slightly_relevant)
+ markov_statement(limit, markov)
+ else
+ markov_statement(limit)
+ end
end
end
end
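
Putting it together, markov_response in 2.0.4 degrades in tiers: three or more strongly relevant sentences build a focused model, more than five slightly relevant ones serve as a fallback, and anything less produces a generic statement. A usage sketch (the prompt string is illustrative):

    model.markov_response("tell me about robots", 140)
    # => a tweet-length reply built from sentences related to "robots",
    #    or a plain markov_statement when too few related sentences exist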