lib/birdwatcher/modules/statuses/word_cloud.rb in birdwatcher-0.3.1 vs lib/birdwatcher/modules/statuses/word_cloud.rb in birdwatcher-0.4.0

- old
+ new

@@ -93,12 +93,10 @@ :required => true }, } } - DEFAULT_EXCLUDED_WORDS = %w(rt via oh) - def self.info <<-INFO The Word Cloud module can generate a classic weighted word cloud from words used in statuses across all or specific users and between different times. @@ -131,74 +129,43 @@ statuses = statuses.where("DATE(posted_at) >= DATE(?) AND DATE(posted_at) <= DATE(?)", since, before).all if statuses.count.zero? error("There are no statuses to process") return false end - prepare_exclusion_list - words = {} - sorted_words = [] + word_list = make_word_list( + :min_word_count => option_setting("MIN_WORD_COUNT"), + :min_word_length => option_setting("MIN_WORD_LENGTH"), + :exclude_words => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip), + :exclude_stopwords => option_setting("EXCLUDE_STOPWORDS"), + :exclude_common_words => option_setting("EXCLUDE_COMMON"), + :exclude_hashtags => option_setting("EXCLUDE_HASHTAGS"), + :exclude_mentions => option_setting("EXCLUDE_MENTIONS"), + :word_cap => option_setting("WORD_CAP"), + :stopwords_file => File.join(DATA_DIRECTORY, "english_stopwords.txt"), + :common_words_file => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt") + ) task("Processing #{statuses.count.to_s.bold} statuses...") do statuses.each do |status| - split_into_words(status.text).each do |word| - next if exclude_word?(word) - words.key?(word) ? words[word] += 1 : words[word] = 1 - end + word_list.add_to_corpus(status.text) if option_setting("INCLUDE_PAGE_TITLES") status.urls_dataset - .where("title IS NOT NULL") - .where("final_url NOT LIKE 'https://twitter.com/%'") - .map(&:title).each do |page_title| - split_into_words(page_title).each do |word| - next if exclude_word?(word) - words.key?(word) ? words[word] += 1 : words[word] = 1 - end + .where("title IS NOT NULL") + .where("final_url NOT LIKE 'https://twitter.com/%'") + .map(&:title).each do |page_title| + word_list.add_to_corpus(page_title) end end end - if option_setting("MIN_WORD_COUNT") - words.delete_if { |word, count| count < option_setting("MIN_WORD_COUNT").to_i } - end - sorted_words = words.sort_by { |word, count| count}.reverse - if option_setting("WORD_CAP") - sorted_words = sorted_words.take(option_setting("WORD_CAP").to_i) - end + word_list.process end task("Generating word cloud, patience please...") do - cloud = MagicCloud::Cloud.new(sorted_words, + cloud = MagicCloud::Cloud.new(word_list.word_list, :rotate => :none, :palette => option_setting("PALETTE").split(" ").map(&:strip) ).draw(option_setting("IMAGE_WIDTH").to_i, option_setting("IMAGE_HEIGHT").to_i).to_blob { self.format = "png" } File.open(option_setting("DEST"), "wb") { |f| f.write(cloud) } end info("Word cloud written to #{option_setting('DEST').bold}") - end - - private - - def prepare_exclusion_list - @exclusion_list = DEFAULT_EXCLUDED_WORDS - if option_setting("EXCLUDE_WORDS") - @exclusion_list += option_setting("EXCLUDE_WORDS").split(" ").map { |w| w.strip.downcase } - end - if option_setting("EXCLUDE_STOPWORDS") - @exclusion_list += read_data_file("english_stopwords.txt").split("\n").map { |w| w.strip.downcase } - end - if option_setting("EXCLUDE_COMMON") - @exclusion_list += read_data_file("top100Kenglishwords.txt").split("\n").map(&:strip) - end - end - - def exclude_word?(word) - return true if word.empty? - return true if option_setting("MIN_WORD_LENGTH") && word.length < option_setting("MIN_WORD_LENGTH").to_i - return true if option_setting("EXCLUDE_HASHTAGS") && word.start_with?("#") - return true if option_setting("EXCLUDE_MENTIONS") && word.start_with?("@") - return true if @exclusion_list.include?(word) - end - - def split_into_words(text) - text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ") - text.split(" ").map(&:strip) end end end end end