lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.9 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.3.0

- old
+ new

@@ -16,22 +16,35 @@
     def self.load(path)
       Marshal.load(File.open(path, 'rb') { |f| f.read })
     end
 
-    def mass_tokenize(text)
+    def initialize
+      # This is the only source of actual strings in the model. It is
+      # an array of unique tokens. Manipulation of a token is mostly done
+      # using its index in this array, which we call a "tiki"
+      @tokens = []
+
+      # Reverse lookup tiki by token, for faster generation
+      @tikis = {}
+    end
+
+    def tikify(token)
+      @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
+    end
+
+    def mass_tikify(text)
       sentences = NLP.sentences(text)
-      tokens = []
 
-      sentences.each do |s|
-        tokens << NLP.tokenize(s).reject do |t|
+      sentences.map do |s|
+        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end
-      end
 
-      tokens
+        tokens.map { |t| tikify(t) }
+      end
     end
 
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
       @hash = Digest::MD5.hexdigest(content)
@@ -61,28 +74,28 @@
       lines.each do |l|
         next if l.start_with?('#') # Remove commented lines
         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
 
         if l.include?('@')
-          statements << NLP.normalize(l)
-        else
           mentions << NLP.normalize(l)
+        else
+          statements << NLP.normalize(l)
         end
       end
 
       text = statements.join("\n")
       mention_text = mentions.join("\n")
 
       lines = nil; statements = nil; mentions = nil # Allow garbage collection
 
       log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
 
-      @sentences = mass_tokenize(text)
-      @mentions = mass_tokenize(mention_text)
+      @sentences = mass_tikify(text)
+      @mentions = mass_tikify(mention_text)
 
       log "Ranking keywords"
-      @keywords = NLP.keywords(@sentences)
+      @keywords = NLP.keywords(text)
 
       self
     end
 
     def save(path)
@@ -104,40 +117,40 @@
       #end
 
       NLP.htmlentities.decode tweet
     end
 
-    def valid_tweet?(tokens, limit)
-      tweet = NLP.reconstruct(tokens)
+    def valid_tweet?(tikis, limit)
+      tweet = NLP.reconstruct(tikis, @tokens)
 
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
 
       retries = 0
       tweet = ""
 
-      while (tokens = generator.generate(3, :bigrams)) do
-        next if tokens.length <= 3 && !responding
-        break if valid_tweet?(tokens, limit)
+      while (tikis = generator.generate(3, :bigrams)) do
+        next if tikis.length <= 3 && !responding
+        break if valid_tweet?(tikis, limit)
 
         retries += 1
         break if retries >= retry_limit
       end
 
-      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
-        while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+        while (tikis = generator.generate(3, :unigrams)) do
+          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
 
           retries += 1
           break if retries >= retry_limit
         end
       end
 
-      tweet = NLP.reconstruct(tokens)
+      tweet = NLP.reconstruct(tikis, @tokens)
 
       if retries >= retry_limit
         log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
       end
@@ -157,10 +170,10 @@
       tokenized = NLP.tokenize(input).map(&:downcase)
 
       sentences.each do |sent|
         tokenized.each do |token|
-          if sent.map(&:downcase).include?(token)
+          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end
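
The central change in 2.3.0 is that the model no longer stores sentences as arrays of token strings: each unique string lives exactly once in @tokens, sentences become arrays of integer indices ("tikis"), and @tikis provides the reverse lookup. A minimal standalone sketch of that mapping follows, reusing the tikify expression from the diff; the TikiSketch class and its helper methods are illustrative names, not part of the gem.

    # Illustrative sketch of the token/tiki mapping introduced in 2.3.0
    class TikiSketch
      def initialize
        @tokens = []   # each unique token string, stored once
        @tikis  = {}   # reverse lookup: token string => index in @tokens
      end

      # Return the existing index for a token, or append it and return the
      # new index (same expression as Model#tikify in the diff above).
      def tikify(token)
        @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
      end

      # A sentence of token strings becomes a sentence of integer tikis.
      def tikify_sentence(tokens)
        tokens.map { |t| tikify(t) }
      end

      # The inverse lookup that NLP.reconstruct(tikis, @tokens) relies on in 2.3.0.
      def detikify(tikis)
        tikis.map { |tiki| @tokens[tiki] }
      end
    end

    m = TikiSketch.new
    m.tikify_sentence(%w[the cat sat on the mat])  # => [0, 1, 2, 3, 0, 4] -- "the" stored once
    m.detikify([0, 1, 2, 3, 0, 4])                 # => ["the", "cat", "sat", "on", "the", "mat"]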