Sha256: 22e9f40bf493581fa8de465fffd903c5a39cc0dea14af11741044b9d41d008d9

Contents?: true

Size: 1.97 KB

Versions: 2

Compression:

Stored size: 1.97 KB

Contents

# frozen_string_literal: true

require 'whatlanguage'
require 'ruby-progressbar'

module EnterRockstar
  module Corpus
    # Takes the downloaded lyrics texts and tokenizes them.
    #
    # Two structures are built while walking the corpus:
    # * +@stats+  - maps every run of (NGRAM_SIZE - 1) consecutive words to a
    #   counter hash of the words that were seen directly after that run.
    # * +@tokens+ - maps a word length to the list of distinct words of that
    #   length, in the order they were first encountered.
    class Tokenizer
      # Window size used for the follow-up statistics: the first
      # NGRAM_SIZE - 1 words form the key, the last word is the continuation.
      NGRAM_SIZE = 3

      # Words shorter than this are not collected into the token lists.
      MIN_TOKEN_LENGTH = 4

      # Any run of non-alphabetic characters separates two words.
      WORD_SEPARATOR = /[^[[:alpha:]]]+/

      # @param data_dir [String] directory searched recursively for *.txt files
      # @param name [String] base name used for the two output files
      def initialize(data_dir:, name:)
        @data_dir = data_dir
        @stats = {}
        @tokens = {}
        # Membership index for @tokens so that de-duplication is a hash lookup
        # instead of a linear Array#include? scan per word.
        @seen_tokens = {}
        @output_stats = "lyrics_data/#{name}_stats.json.gz"
        @output_tokens = "lyrics_data/#{name}_tokens.json.gz"
        @wl = WhatLanguage.new(:all)
      end

      # Reads every *.txt file below the data directory and feeds the ones
      # detected as English into the statistics and the token lists.
      # Prints a file count and advances a progress bar once per file.
      #
      # @return [nil]
      def tokenize
        text_files = Dir.glob("#{@data_dir}/**/*.txt")
        puts "Parsing #{text_files.count} files."
        progressbar = ProgressBar.create(title: 'Progress', total: text_files.count)

        text_files.each do |filename|
          # File.read (unlike IO.read) never interprets a leading "|" as a
          # command. Scrubbing replaces invalid byte sequences with a blank so
          # that one badly encoded file cannot abort the whole run.
          text = File.read(filename).scrub(' ')

          # Rockstar doesn't really work well with languages other than English
          if @wl.language(text) == :english
            tokenized = _to_tokens(text)
            _record_stats(tokenized)
            _record_tokens(tokenized)
          end

          progressbar.increment
        end
        puts
      end

      # Hands both collected structures, serialized as JSON, to
      # EnterRockstar::Utils.save_file together with their output paths.
      # NOTE(review): the paths end in .json.gz - presumably save_file takes
      # care of the compression; confirm against Utils.
      def save_all
        EnterRockstar::Utils.save_file(@output_tokens, @tokens.to_json)
        EnterRockstar::Utils.save_file(@output_stats, @stats.to_json)
      end

      private

      # Counts which word appears after each run of preceding words.
      def _record_stats(tokenized)
        tokenized.each_cons(NGRAM_SIZE) do |*head, continuation|
          followers = (@stats[head] ||= Hash.new(0))
          followers[continuation] += 1
        end
      end

      # Files every not yet seen word under its length.
      def _record_tokens(tokenized)
        tokenized.each do |token|
          next if token.length < MIN_TOKEN_LENGTH # shorter words are boring anyway
          next if @seen_tokens.key?(token)

          @seen_tokens[token] = true
          (@tokens[token.length] ||= []) << token
        end
      end

      # Lowercases the text and splits it into an array of word symbols.
      def _to_tokens(text)
        text.downcase.split(WORD_SEPARATOR).reject(&:empty?).map(&:to_sym)
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
enter-rockstar-0.2.1 lib/enter_rockstar/corpus/tokenizer.rb
enter-rockstar-0.2 lib/enter_rockstar/corpus/tokenizer.rb