lib/datasets/penn-treebank.rb in red-datasets-0.0.6 vs lib/datasets/penn-treebank.rb in red-datasets-0.0.7

- old
+ new

@@ -1,10 +1,10 @@
 require_relative "dataset"
 
 module Datasets
   class PennTreebank < Dataset
-    Record = Struct.new(:word, :id)
+    Record = Struct.new(:word)
 
     DESCRIPTION = <<~DESC
       `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
       corpus of English sentences with linguistic structure annotations. This
       function uses a variant distributed at
@@ -44,20 +44,13 @@
       parse_data(data_path, &block)
     end
     private
     def parse_data(data_path)
-      index = 0
-      vocabulary = {}
       File.open(data_path) do |f|
         f.each_line do |line|
           line.split.each do |word|
-            word = word.strip
-            unless vocabulary.key?(word)
-              vocabulary[word] = index
-              index += 1
-            end
-            yield(Record.new(word, vocabulary[word]))
+            yield(Record.new(word.strip))
           end
         end
       end
     end
   end