lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.3 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.4

- old
+ new

@@ -5,101 +5,73 @@
 require 'set'
 require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :tokenized, :markov
+    attr_accessor :hash, :sentences, :markov, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
     end
 
     def self.load(path)
-      data = Marshal.load(File.read(path))
-      Model.new.deserialize(data)
+      Marshal.load(File.read(path))
     end
 
     def consume(txtpath)
       # Record hash of source file so we know to update later
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mentions"
+      log "Removing commented lines and mention tokens"
 
       lines = text.split("\n")
       keeping = []
       lines.each do |l|
         next if l.start_with?('#') || l.include?('RT')
         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
         keeping << processed.join(' ')
       end
       text = NLP.normalize(keeping.join("\n"))
 
-      log "Segmenting text into sentences of 140 characters or less"
-      @sentences = NLP.sentences(text).reject do |s|
-        s.length > 140 || s.count('"')%2 != 0
-      end
+      log "Segmenting text into sentences"
 
-      log "Tokenizing #{@sentences.length} sentences"
-      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
-      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+      sentences = NLP.sentences(text)
 
-      log "Building markov model (this may take a while)"
-      @markov = MarkovModel.new.consume(@tokenized)
+      log "Tokenizing #{sentences.length} sentences"
+      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Building markov model"
+      @markov = MarkovModel.build(@sentences)
+
+      log "Ranking keywords"
+      require 'benchmark'
+      puts Benchmark.measure {
+        @keywords = NLP.keywords(@sentences)
+        p @keywords.top(100)
+      }
+
       self
     end
 
-    # Produces a hash with the data needed to quickly
-    # reconstruct this corpus object
-    def serialize
-      return { 'hash' => @hash,
-               'tokenized' => @tokenized,
-               'tokensets' => @tokensets,
-               'markov' => @markov.serialize }
-    end
-
     def save(path)
-      data = self.serialize
       File.open(path, 'w') do |f|
-        f.write(Marshal.dump(data))
+        f.write(Marshal.dump(self))
       end
       self
     end
 
-    def deserialize(data)
-      @hash = data['hash']
-      @tokenized = data['tokenized']
-      @tokensets = data['tokensets']
-      @markov = MarkovModel.new.deserialize(data['markov'])
-      self
-    end
-
-    def replace_noun(sent)
-      tagged = NLP.tagger.add_tags(sent)
-
-      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
-      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
-      return sent if to_replace.nil?
-
-      replacement = NLP.nouns.sample
-      if to_replace.en.plural.length <= to_replace.length
-        replacement = replacement.en.plural(1)
-      end
-      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
-      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
-    end
-
     def fix(tweet)
       # This seems to require an external api call
-      begin
-        fixer = NLP.gingerice.parse(tweet)
-        log fixer if fixer['corrections']
-        tweet = fixer['result']
-      rescue Exception => e
-        log e.message
-        log e.backtrace
-      end
+      #begin
+      #  fixer = NLP.gingerice.parse(tweet)
+      #  log fixer if fixer['corrections']
+      #  tweet = fixer['result']
+      #rescue Exception => e
+      #  log e.message
+      #  log e.backtrace
+      #end
 
       NLP.htmlentities.decode tweet
     end
 
     def markov_statement(limit=140, markov=nil)
@@ -113,35 +85,46 @@
       end
 
       fix tweet
     end
 
-    # Generates a response by looking for related sentences
-    # in the corpus and building a smaller markov model from these
-    def markov_response(input, limit=140)
-      inputset = NLP.tokenset(input)
-      log "Input tokenset: #{inputset.to_a}"
+    # Finds all relevant tokenized sentences to given input by
+    # comparing non-stopword token overlaps
+    def relevant_sentences(input)
+      relevant = []
+      slightly_relevant = []
 
-      if inputset.empty?
-        # Very uninteresting input; no relevant response possible
-        return markov_statement(limit)
-      end
+      tokenized = NLP.tokenize(input)
 
-      # Let's find all the sentences that might be relevant
-      relevant = []
-      @tokensets.each_with_index.map do |set, i|
-        if inputset.intersection(set).length > 0
-          relevant << @tokenized[i]
+      @sentences.each do |sent|
+        tokenized.each do |token|
+          if sent.include?(token)
+            relevant << sent unless NLP.stopword?(token)
+            slightly_relevant << sent
+          end
         end
       end
 
-      log "Found #{relevant.length} relevant tokenset matches"
+      [relevant, slightly_relevant]
+    end
 
-      if relevant.length < 3
-        return markov_statement(limit)
-      end
+    # Generates a response by looking for related sentences
+    # in the corpus and building a smaller markov model from these
+    def markov_response(input, limit=140)
+      # First try
+      relevant, slightly_relevant = relevant_sentences(input)
 
-      markov = MarkovModel.new.consume(relevant.sample(100))
-      markov_statement(limit, markov)
+      p relevant
+      p slightly_relevant.length
+
+      if relevant.length >= 3
+        markov = MarkovModel.new.consume(relevant)
+        markov_statement(limit, markov)
+      elsif slightly_relevant.length > 5
+        markov = MarkovModel.new.consume(slightly_relevant)
+        markov_statement(limit, markov)
+      else
+        markov_statement(limit)
+      end
     end
   end
 end
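
For orientation, here is a minimal usage sketch of the 2.0.4 Model API as it appears in this diff. It is not taken from the gem itself: the corpus and model file paths are hypothetical placeholders, and the snippet assumes the gem is loaded via require 'twitter_ebooks'.

    require 'twitter_ebooks'

    # Build a model from a plain-text corpus and persist it.
    # save now Marshal.dumps the whole Model, so load is just Marshal.load.
    model = Ebooks::Model.consume('corpus.txt')   # hypothetical path
    model.save('corpus.model')                    # hypothetical path

    model = Ebooks::Model.load('corpus.model')

    # Unprompted tweet from the full markov model (default limit of 140 chars).
    puts model.markov_statement

    # Reply-style generation: relevant_sentences splits the corpus into
    # strongly and slightly relevant sentences by token overlap with the
    # input, and markov_response builds a smaller markov model from
    # whichever tier has enough matches, falling back to markov_statement.
    puts model.markov_response("I love trains")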