word_cloud.rb in birdwatcher-0.4.0

- old
+ new

@@ -93,12 +93,10 @@
               :required    => true
             },
           }
         }
 
-        DEFAULT_EXCLUDED_WORDS = %w(rt via oh)
-
         def self.info
 <<-INFO
 The Word Cloud module can generate a classic weighted word cloud from words used
 in statuses across all or specific users and between different times.
 
@@ -131,74 +129,43 @@
           statuses = statuses.where("DATE(posted_at) >= DATE(?) AND DATE(posted_at) <= DATE(?)", since, before).all
           if statuses.count.zero?
             error("There are no statuses to process")
             return false
           end
-          prepare_exclusion_list
-          words        = {}
-          sorted_words = []
+          word_list = make_word_list(
+            :min_word_count       => option_setting("MIN_WORD_COUNT"),
+            :min_word_length      => option_setting("MIN_WORD_LENGTH"),
+            :exclude_words        => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
+            :exclude_stopwords    => option_setting("EXCLUDE_STOPWORDS"),
+            :exclude_common_words => option_setting("EXCLUDE_COMMON"),
+            :exclude_hashtags     => option_setting("EXCLUDE_HASHTAGS"),
+            :exclude_mentions     => option_setting("EXCLUDE_MENTIONS"),
+            :word_cap             => option_setting("WORD_CAP"),
+            :stopwords_file       => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
+            :common_words_file    => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
+          )
           task("Processing #{statuses.count.to_s.bold} statuses...") do
             statuses.each do |status|
-              split_into_words(status.text).each do |word|
-                next if exclude_word?(word)
-                words.key?(word) ? words[word] += 1 : words[word] = 1
-              end
+              word_list.add_to_corpus(status.text)
               if option_setting("INCLUDE_PAGE_TITLES")
                 status.urls_dataset
-                .where("title IS NOT NULL")
-                .where("final_url NOT LIKE 'https://twitter.com/%'")
-                .map(&:title).each do |page_title|
-                  split_into_words(page_title).each do |word|
-                    next if exclude_word?(word)
-                    words.key?(word) ? words[word] += 1 : words[word] = 1
-                  end
+                  .where("title IS NOT NULL")
+                  .where("final_url NOT LIKE 'https://twitter.com/%'")
+                  .map(&:title).each do |page_title|
+                  word_list.add_to_corpus(page_title)
                 end
               end
             end
-            if option_setting("MIN_WORD_COUNT")
-              words.delete_if { |word, count| count < option_setting("MIN_WORD_COUNT").to_i }
-            end
-            sorted_words = words.sort_by { |word, count| count}.reverse
-            if option_setting("WORD_CAP")
-              sorted_words = sorted_words.take(option_setting("WORD_CAP").to_i)
-            end
+            word_list.process
           end
           task("Generating word cloud, patience please...") do
-            cloud = MagicCloud::Cloud.new(sorted_words,
+            cloud = MagicCloud::Cloud.new(word_list.word_list,
               :rotate  => :none,
               :palette => option_setting("PALETTE").split(" ").map(&:strip)
             ).draw(option_setting("IMAGE_WIDTH").to_i, option_setting("IMAGE_HEIGHT").to_i).to_blob { self.format = "png" }
             File.open(option_setting("DEST"), "wb") { |f| f.write(cloud) }
           end
           info("Word cloud written to #{option_setting('DEST').bold}")
-        end
-
-        private
-
-        def prepare_exclusion_list
-          @exclusion_list = DEFAULT_EXCLUDED_WORDS
-          if option_setting("EXCLUDE_WORDS")
-            @exclusion_list += option_setting("EXCLUDE_WORDS").split(" ").map { |w| w.strip.downcase }
-          end
-          if option_setting("EXCLUDE_STOPWORDS")
-            @exclusion_list += read_data_file("english_stopwords.txt").split("\n").map { |w| w.strip.downcase }
-          end
-          if option_setting("EXCLUDE_COMMON")
-            @exclusion_list += read_data_file("top100Kenglishwords.txt").split("\n").map(&:strip)
-          end
-        end
-
-        def exclude_word?(word)
-          return true if word.empty?
-          return true if option_setting("MIN_WORD_LENGTH") && word.length < option_setting("MIN_WORD_LENGTH").to_i
-          return true if option_setting("EXCLUDE_HASHTAGS") && word.start_with?("#")
-          return true if option_setting("EXCLUDE_MENTIONS") && word.start_with?("@")
-          return true if @exclusion_list.include?(word)
-        end
-
-        def split_into_words(text)
-          text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
-          text.split(" ").map(&:strip)
         end
       end
     end
   end
 end