Sha256: aa2f755a10cd08d3ae11db5bebd3f9f877207e28d3e260de5ca2b1b14640c5c0

Contents?: true

Size: 1.88 KB

Versions: 1

Compression:

Stored size: 1.88 KB

Contents

require 'cgi'

require 'phrasie'
require 'redcarpet'
require 'redcarpet/render_strip'

module WCC
  # Computes length/word-count statistics and frequently occurring terms for a
  # piece of Markdown text (HTML entities are unescaped before processing).
  #
  # Normalized text is memoized in two independent forms -- case-preserving and
  # downcased -- so results never depend on the order in which the public
  # methods are called.
  class TextAnalysis
    # Characters replaced by a single space during normalization.
    TEXT_BLACKLIST = /[…]/

    # @return [String, nil] case-preserving normalized text; nil until
    #   {#terms_by_frequency} or {#evaluate_length} has been called
    # @return [Array<String>, nil] downcased tokens with stop words and
    #   exclusions removed; nil until {#evaluate_length} has been called
    attr_reader :normalized, :stripped

    # Reads a newline-separated term list from "db/<file>.txt".
    #
    # NOTE(review): the path is relative to the current working directory, not
    # to this file -- confirm that is the intended lookup location.
    #
    # @param file [String] base name of the term file, without extension
    # @return [Array<String>] one entry per line of the file
    def self.extract_terms(file)
      File.read("db/#{file}.txt").split("\n")
    end

    # @param string [String] the raw text to analyse
    # @param stop_words [Array<String>] tokens to ignore in addition to EXCLUSIONS
    def initialize(string, stop_words: STOPWORDS)
      @original = string
      @stop_words = stop_words
    end

    # Extracts phrases from the case-preserving normalized text via Phrasie and
    # returns those of the requested length, minus stop words and exclusions.
    #
    # @param phrase_length [Integer] value the last element of each Phrasie
    #   result must equal (presumably the number of words in the phrase --
    #   confirm against the Phrasie documentation)
    # @param min_occurrences [Integer] passed to Phrasie as the +occur+ option
    # @return [Array<String>] the first element of every matching Phrasie result
    def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
      terms = ::Phrasie::Extractor.new
        .phrases(normalized_text, occur: min_occurrences)
        .select { |t| t.last == phrase_length }
        .map(&:first)
      remove_ignored_tokens terms
    end

    # Builds a plain-text report comparing length and word count of the
    # original, normalized (downcased) and stop-word-stripped text.
    #
    # NOTE(review): String#strip_heredoc is provided by ActiveSupport, which
    # this file does not require itself -- presumably loaded by the host
    # application; confirm.
    #
    # @return [String] the multi-line report
    def evaluate_length
      lowered = downcased_text
      @stripped ||= remove_ignored_tokens(tokenize(lowered))

      <<-OUTPUT.strip_heredoc
        Original
        \tLength: #{@original.length}
        \tWordcount: #{tokenize(@original).length}

        Normalized (removed markdown chars & whitespace)
        \tLength: #{lowered.length}
        \tWordcount: #{tokenize(lowered).length}

        Processed (removed above & stopwords)
        \tLength: #{@stripped.join(' ').length}
        \tWordcount: #{@stripped.length}
        \t   Unique: #{@stripped.uniq.length}
        \t   Unique Length: #{@stripped.uniq.join(' ').length}
      OUTPUT
    end

    private

    # Memoized, case-preserving normalized form of the original text.
    def normalized_text
      @normalized ||= normalize(@original)
    end

    # Memoized, downcased normalized text. Kept separate from @normalized so
    # that term extraction always sees the original casing.
    def downcased_text
      @downcased ||= normalized_text.downcase
    end

    # Unescapes HTML entities, renders the Markdown through Redcarpet's
    # StripDown renderer, replaces blacklisted characters with a space and
    # collapses every whitespace run into a single space.
    def normalize(string)
      ::Redcarpet::Markdown.new(::Redcarpet::Render::StripDown)
        .render(CGI.unescapeHTML(string))
        .gsub(TEXT_BLACKLIST, ' ')
        .gsub(/\s+/, ' ')
    end

    # Splits on whitespace runs. The argument-less form ignores leading
    # whitespace, so text starting with a space does not yield an empty first
    # token (which would inflate the word counts by one).
    def tokenize(string)
      string.split
    end

    # Removes stop words and transcript exclusions from an array of tokens.
    def remove_ignored_tokens(tokens)
      tokens - (@stop_words + EXCLUSIONS)
    end

    # Defined after extract_terms because they call it while the class body is
    # being evaluated.
    # Stopwords from http://www.ranks.nl/stopwords
    STOPWORDS  = extract_terms("stop_words").freeze
    EXCLUSIONS = extract_terms("transcript_exclusions").freeze
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wcc-text-analysis-0.0.1 lib/wcc/text_analysis.rb