Sha256: aa2f755a10cd08d3ae11db5bebd3f9f877207e28d3e260de5ca2b1b14640c5c0
Contents?: true
Size: 1.88 KB
Versions: 1
Compression:
Stored size: 1.88 KB
Contents
require 'cgi'
require 'phrasie'
require 'redcarpet'
require 'redcarpet/render_strip'

module WCC
  # Analyses a block of (Markdown/HTML-flavoured) text: extracts frequently
  # occurring terms and reports length / word-count statistics before and
  # after normalization and stop-word removal.
  class TextAnalysis
    # Characters replaced by a single space during normalization.
    TEXT_BLACKLIST = /[…]/

    # Both readers return nil until a method that computes them has run.
    #
    # @return [String, nil] the normalized text (original case preserved)
    # @return [Array<String>, nil] downcased tokens with ignored words removed
    attr_reader :normalized, :stripped

    # Reads a newline-separated word list from "db/<file>.txt".
    #
    # NOTE(review): the path is relative to the process's working directory,
    # not to this file — confirm that is intended for gem consumers.
    #
    # @param file [String] base name of the list, without directory/extension
    # @return [Array<String>] one entry per line
    def self.extract_terms(file)
      File.read("db/#{file}.txt").split("\n")
    end

    # @param string [String] the raw text to analyse
    # @param stop_words [Array<String>] words to drop in addition to EXCLUSIONS
    def initialize(string, stop_words: STOPWORDS)
      @original = string
      @stop_words = stop_words
    end

    # Extracts phrases from the normalized text via Phrasie and removes
    # ignored tokens from the result.
    #
    # @param phrase_length [Integer] value the last element of each Phrasie
    #   result must equal (presumably the phrase's word count — TODO confirm)
    # @param min_occurrences [Integer] passed to Phrasie as `occur:`
    # @return [Array<String>] matching phrases minus stop words and exclusions
    def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
      @normalized ||= normalize(@original)

      terms = ::Phrasie::Extractor.new
                                  .phrases(@normalized, occur: min_occurrences)
                                  .select { |t| t.last == phrase_length }
                                  .map(&:first)

      remove_ignored_tokens(terms)
    end

    # Builds a human-readable report of character and word counts for the
    # original, normalized and stop-word-stripped text.
    #
    # @return [String] multi-line report
    def evaluate_length
      # Keep @normalized in its original case so its value does not depend on
      # whether this method or #terms_by_frequency ran first; downcase only
      # for stop-word matching.
      @normalized ||= normalize(@original)
      @stripped ||= remove_ignored_tokens(tokenize(@normalized.downcase))

      # Squiggly heredoc replaces ActiveSupport's String#strip_heredoc, which
      # this file never required.
      <<~OUTPUT
        Original
        \tLength: #{@original.length}
        \tWordcount: #{tokenize(@original).length}
        Normalized (removed markdown chars & whitespace)
        \tLength: #{@normalized.length}
        \tWordcount: #{tokenize(@normalized).length}
        Processed (removed above & stopwords)
        \tLength: #{@stripped.join(' ').length}
        \tWordcount: #{@stripped.length}
        \t Unique: #{@stripped.uniq.length}
        \t Unique Length: #{@stripped.uniq.join(' ').length}
      OUTPUT
    end

    private

    # Unescapes HTML entities, renders the Markdown to plain text, replaces
    # blacklisted characters with spaces and collapses whitespace runs.
    #
    # @param string [String]
    # @return [String]
    def normalize(string)
      ::Redcarpet::Markdown.new(::Redcarpet::Render::StripDown)
                           .render(CGI.unescapeHTML(string))
                           .gsub(TEXT_BLACKLIST, ' ')
                           .gsub(/\s+/, ' ')
    end

    # Splits on whitespace. A single-space pattern ignores leading
    # whitespace, whereas /\s+/ would yield an empty first token and inflate
    # the word count by one.
    #
    # @param string [String]
    # @return [Array<String>]
    def tokenize(string)
      string.split(' ')
    end

    # @param tokens [Array<String>]
    # @return [Array<String>] tokens minus the stop words and EXCLUSIONS
    def remove_ignored_tokens(tokens)
      tokens - (@stop_words + EXCLUSIONS)
    end

    # Defined after .extract_terms because they call it at class-load time.
    # Stopwords from http://www.ranks.nl/stopwords
    STOPWORDS = extract_terms("stop_words").freeze
    EXCLUSIONS = extract_terms("transcript_exclusions").freeze
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
wcc-text-analysis-0.0.1 | lib/wcc/text_analysis.rb |