lib/twitter_ebooks/model.rb in twitter_ebooks-2.3.2 vs lib/twitter_ebooks/model.rb in twitter_ebooks-3.0.0

- old
+ new

@@ -6,16 +6,45 @@ require 'digest/md5' require 'csv' module Ebooks class Model - attr_accessor :hash, :tokens, :sentences, :mentions, :keywords + # @return [Array<String>] + # An array of unique tokens. This is the main source of actual strings + # in the model. Manipulation of a token is done using its index + # in this array, which we call a "tiki" + attr_accessor :tokens - def self.consume(txtpath) - Model.new.consume(txtpath) + # @return [Array<Array<Integer>>] + # Sentences represented by arrays of tikis + attr_accessor :sentences + + # @return [Array<Array<Integer>>] + # Sentences derived from Twitter mentions + attr_accessor :mentions + + # @return [Array<String>] + # The top 200 most important keywords, in descending order + attr_accessor :keywords + + # Generate a new model from a corpus file + # @param path [String] + # @return [Ebooks::Model] + def self.consume(path) + Model.new.consume(path) end + # Generate a new model from multiple corpus files + # @param paths [Array<String>] + # @return [Ebooks::Model] + def self.consume_all(paths) + Model.new.consume_all(paths) + end + + # Load a saved model + # @param path [String] + # @return [Ebooks::Model] def self.load(path) model = Model.new model.instance_eval do props = Marshal.load(File.open(path, 'rb') { |f| f.read }) @tokens = props[:tokens] @@ -24,10 +53,12 @@ @keywords = props[:keywords] end model end + # Save model to a file + # @param path [String] def save(path) File.open(path, 'wb') do |f| f.write(Marshal.dump({ tokens: @tokens, sentences: @sentences, @@ -37,23 +68,26 @@ end self end def initialize - # This is the only source of actual strings in the model. It is - # an array of unique tokens. Manipulation of a token is mostly done - # using its index in this array, which we call a "tiki" @tokens = [] # Reverse lookup tiki by token, for faster generation @tikis = {} end + # Reverse lookup a token index from a token + # @param token [String] + # @return [Integer] def tikify(token) @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1) end + # Convert a body of text into arrays of tikis + # @param text [String] + # @return [Array<Array<Integer>>] def mass_tikify(text) sentences = NLP.sentences(text) sentences.map do |s| tokens = NLP.tokenize(s).reject do |t| @@ -63,13 +97,14 @@ tokens.map { |t| tikify(t) } end end + # Consume a corpus into this model + # @param path [String] def consume(path) content = File.read(path, :encoding => 'utf-8') - @hash = Digest::MD5.hexdigest(content) if path.split('.')[-1] == "json" log "Reading json corpus from #{path}" lines = JSON.parse(content).map do |tweet| tweet['text'] @@ -85,10 +120,16 @@ else log "Reading plaintext corpus from #{path}" lines = content.split("\n") end + consume_lines(lines) + end + + # Consume a sequence of lines + # @param lines [Array<String>] + def consume_lines(lines) log "Removing commented lines and sorting mentions" statements = [] mentions = [] lines.each do |l| @@ -111,34 +152,66 @@ @sentences = mass_tikify(text) @mentions = mass_tikify(mention_text) log "Ranking keywords" - @keywords = NLP.keywords(text) + @keywords = NLP.keywords(text).top(200).map(&:to_s) self end - def fix(tweet) - # This seems to require an external api call - #begin - # fixer = NLP.gingerice.parse(tweet) - # log fixer if fixer['corrections'] - # tweet = fixer['result'] - #rescue Exception => e - # log e.message - # log e.backtrace - #end + # Consume multiple corpuses into this model + # @param paths [Array<String>] + def consume_all(paths) + lines = [] + paths.each do |path| + content = File.read(path, :encoding => 'utf-8') - NLP.htmlentities.decode tweet + if path.split('.')[-1] == "json" + log "Reading json corpus from #{path}" + l = JSON.parse(content).map do |tweet| + tweet['text'] + end + lines.concat(l) + elsif path.split('.')[-1] == "csv" + log "Reading CSV corpus from #{path}" + content = CSV.parse(content) + header = content.shift + text_col = header.index('text') + l = content.map do |tweet| + tweet[text_col] + end + lines.concat(l) + else + log "Reading plaintext corpus from #{path}" + l = content.split("\n") + lines.concat(l) + end + end + consume_lines(lines) end + # Correct encoding issues in generated text + # @param text [String] + # @return [String] + def fix(text) + NLP.htmlentities.decode text + end + + # Check if an array of tikis comprises a valid tweet + # @param tikis [Array<Integer>] + # @param limit Integer how many chars we have left def valid_tweet?(tikis, limit) tweet = NLP.reconstruct(tikis, @tokens) tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) end + # Generate some text + # @param limit [Integer] available characters + # @param generator [SuffixGenerator, nil] + # @param retry_limit [Integer] how many times to retry on duplicates + # @return [String] def make_statement(limit=140, generator=nil, retry_limit=10) responding = !generator.nil? generator ||= SuffixGenerator.build(@sentences) retries = 0 @@ -169,16 +242,21 @@ fix tweet end # Test if a sentence has been copied verbatim from original - def verbatim?(tokens) - @sentences.include?(tokens) || @mentions.include?(tokens) + # @param tikis [Array<Integer>] + # @return [Boolean] + def verbatim?(tikis) + @sentences.include?(tikis) || @mentions.include?(tikis) end - # Finds all relevant tokenized sentences to given input by + # Finds relevant and slightly relevant tokenized sentences to input # comparing non-stopword token overlaps + # @param sentences [Array<Array<Integer>>] + # @param input [String] + # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>] def find_relevant(sentences, input) relevant = [] slightly_relevant = [] tokenized = NLP.tokenize(input).map(&:downcase) @@ -195,9 +273,13 @@ [relevant, slightly_relevant] end # Generates a response by looking for related sentences # in the corpus and building a smaller generator from these + # @param input [String] + # @param limit [Integer] characters available for response + # @param sentences [Array<Array<Integer>>] + # @return [String] def make_response(input, limit=140, sentences=@mentions) # Prefer mentions relevant, slightly_relevant = find_relevant(sentences, input) if relevant.length >= 3