lib/birdwatcher/modules/statuses/word_cloud.rb in birdwatcher-0.3.1 vs lib/birdwatcher/modules/statuses/word_cloud.rb in birdwatcher-0.4.0
- old
+ new
@@ -93,12 +93,10 @@
:required => true
},
}
}
- DEFAULT_EXCLUDED_WORDS = %w(rt via oh)
-
def self.info
<<-INFO
The Word Cloud module can generate a classic weighted word cloud from words used
in statuses across all or specific users and between different times.
@@ -131,74 +129,43 @@
statuses = statuses.where("DATE(posted_at) >= DATE(?) AND DATE(posted_at) <= DATE(?)", since, before).all
if statuses.count.zero?
error("There are no statuses to process")
return false
end
- prepare_exclusion_list
- words = {}
- sorted_words = []
+ word_list = make_word_list(
+ :min_word_count => option_setting("MIN_WORD_COUNT"),
+ :min_word_length => option_setting("MIN_WORD_LENGTH"),
+ :exclude_words => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
+ :exclude_stopwords => option_setting("EXCLUDE_STOPWORDS"),
+ :exclude_common_words => option_setting("EXCLUDE_COMMON"),
+ :exclude_hashtags => option_setting("EXCLUDE_HASHTAGS"),
+ :exclude_mentions => option_setting("EXCLUDE_MENTIONS"),
+ :word_cap => option_setting("WORD_CAP"),
+ :stopwords_file => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
+ :common_words_file => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
+ )
task("Processing #{statuses.count.to_s.bold} statuses...") do
statuses.each do |status|
- split_into_words(status.text).each do |word|
- next if exclude_word?(word)
- words.key?(word) ? words[word] += 1 : words[word] = 1
- end
+ word_list.add_to_corpus(status.text)
if option_setting("INCLUDE_PAGE_TITLES")
status.urls_dataset
- .where("title IS NOT NULL")
- .where("final_url NOT LIKE 'https://twitter.com/%'")
- .map(&:title).each do |page_title|
- split_into_words(page_title).each do |word|
- next if exclude_word?(word)
- words.key?(word) ? words[word] += 1 : words[word] = 1
- end
+ .where("title IS NOT NULL")
+ .where("final_url NOT LIKE 'https://twitter.com/%'")
+ .map(&:title).each do |page_title|
+ word_list.add_to_corpus(page_title)
end
end
end
- if option_setting("MIN_WORD_COUNT")
- words.delete_if { |word, count| count < option_setting("MIN_WORD_COUNT").to_i }
- end
- sorted_words = words.sort_by { |word, count| count}.reverse
- if option_setting("WORD_CAP")
- sorted_words = sorted_words.take(option_setting("WORD_CAP").to_i)
- end
+ word_list.process
end
task("Generating word cloud, patience please...") do
- cloud = MagicCloud::Cloud.new(sorted_words,
+ cloud = MagicCloud::Cloud.new(word_list.word_list,
:rotate => :none,
:palette => option_setting("PALETTE").split(" ").map(&:strip)
).draw(option_setting("IMAGE_WIDTH").to_i, option_setting("IMAGE_HEIGHT").to_i).to_blob { self.format = "png" }
File.open(option_setting("DEST"), "wb") { |f| f.write(cloud) }
end
info("Word cloud written to #{option_setting('DEST').bold}")
- end
-
- private
-
- def prepare_exclusion_list
- @exclusion_list = DEFAULT_EXCLUDED_WORDS
- if option_setting("EXCLUDE_WORDS")
- @exclusion_list += option_setting("EXCLUDE_WORDS").split(" ").map { |w| w.strip.downcase }
- end
- if option_setting("EXCLUDE_STOPWORDS")
- @exclusion_list += read_data_file("english_stopwords.txt").split("\n").map { |w| w.strip.downcase }
- end
- if option_setting("EXCLUDE_COMMON")
- @exclusion_list += read_data_file("top100Kenglishwords.txt").split("\n").map(&:strip)
- end
- end
-
- def exclude_word?(word)
- return true if word.empty?
- return true if option_setting("MIN_WORD_LENGTH") && word.length < option_setting("MIN_WORD_LENGTH").to_i
- return true if option_setting("EXCLUDE_HASHTAGS") && word.start_with?("#")
- return true if option_setting("EXCLUDE_MENTIONS") && word.start_with?("@")
- return true if @exclusion_list.include?(word)
- end
-
- def split_into_words(text)
- text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
- text.split(" ").map(&:strip)
end
end
end
end
end