model.rb in twitter_ebooks-3.0.0

- old
+ new

@@ -6,16 +6,45 @@
 require 'digest/md5'
 require 'csv'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens
 
-    def self.consume(txtpath)
-      Model.new.consume(txtpath)
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
     end
 
+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
+    end
+
+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
         props = Marshal.load(File.open(path, 'rb') { |f| f.read })
         @tokens = props[:tokens]
@@ -24,10 +53,12 @@
         @keywords = props[:keywords]
       end
       model
     end
 
+    # Save model to a file
+    # @param path [String]
     def save(path)
       File.open(path, 'wb') do |f|
         f.write(Marshal.dump({
           tokens: @tokens,
           sentences: @sentences,
@@ -37,23 +68,26 @@
       end
       self
     end
 
     # Create an empty model: no tokens yet and an empty token-to-index lookup.
     # Only @tokens and @tikis are set up here; @sentences, @mentions and
     # @keywords stay unset until something assigns them (consume_lines does).
     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       # Token strings, in the order tikify first appends them
       @tokens = []
 
       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end
 
+    # Look up the index ("tiki") of a token, registering the token first if it is new
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       # Reuse the index already assigned to this token, if there is one
       known_tiki = @tikis[token]
       return known_tiki if known_tiki
       # First sighting: append the token and remember where it landed
       @tokens << token
       @tikis[token] = @tokens.length - 1
     end
 
+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
     def mass_tikify(text)
       sentences = NLP.sentences(text)
 
       sentences.map do |s|
         tokens = NLP.tokenize(s).reject do |t|
@@ -63,13 +97,14 @@
 
         tokens.map { |t| tikify(t) }
       end
     end
 
+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
-      @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
         lines = JSON.parse(content).map do |tweet|
           tweet['text']
@@ -85,10 +120,16 @@
       else
         log "Reading plaintext corpus from #{path}"
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    # Consume a sequence of lines
+    # @param lines [Array<String>]
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
       statements = []
       mentions = []
       lines.each do |l|
@@ -111,34 +152,66 @@
 
       @sentences = mass_tikify(text)
       @mentions = mass_tikify(mention_text)
 
       log "Ranking keywords"
-      @keywords = NLP.keywords(text)
+      @keywords = NLP.keywords(text).top(200).map(&:to_s)
 
       self
     end
 
-    def fix(tweet)
-      # This seems to require an external api call
-      #begin
-      #  fixer = NLP.gingerice.parse(tweet)
-      #  log fixer if fixer['corrections']
-      #  tweet = fixer['result']
-      #rescue Exception => e
-      #  log e.message
-      #  log e.backtrace
-      #end
+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
+    def consume_all(paths)
       # Reads every file in paths as UTF-8, extracts its lines according to the
       # text after the last '.' in the path, and accumulates them in one array.
       # The combined array is passed to consume_lines once, and that call's
       # result is the return value.
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
 
-      NLP.htmlentities.decode tweet
         # "json": each element of the parsed document contributes its 'text' entry
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
         # "csv": the first row is taken as the header and the column named
         # 'text' supplies the lines.
         # NOTE(review): header.index('text') is nil when no such column exists
         # (and header itself is nil for an empty file); the lookups below would
         # then fail -- confirm whether a clearer error is wanted here.
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
         # Anything else is treated as plaintext, split on "\n"
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
     end
 
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
+    end
+
+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit [Integer] how many characters we have left
+    # @return [Boolean]
     def valid_tweet?(tikis, limit)
       candidate = NLP.reconstruct(tikis, @tokens)
       # Over the character budget: reject without running the encloser check
       return false unless candidate.length <= limit
       !NLP.unmatched_enclosers?(candidate)
     end
 
+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
 
       retries = 0
@@ -169,16 +242,21 @@
 
       fix tweet
     end
 
     # Test if a sentence has been copied verbatim from original
-    def verbatim?(tokens)
-      @sentences.include?(tokens) || @mentions.include?(tokens)
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end
 
-    # Finds all relevant tokenized sentences to given input by
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
       tokenized = NLP.tokenize(input).map(&:downcase)
@@ -195,9 +273,13 @@
       [relevant, slightly_relevant]
     end
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3