lib/datasets/penn-treebank.rb in red-datasets-0.0.6 vs lib/datasets/penn-treebank.rb in red-datasets-0.0.7
- old
+ new
@@ -1,10 +1,10 @@
require_relative "dataset"
module Datasets
class PennTreebank < Dataset
- Record = Struct.new(:word, :id)
+ Record = Struct.new(:word)
DESCRIPTION = <<~DESC
`Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
corpus of English sentences with linguistic structure annotations. This
function uses a variant distributed at
@@ -44,20 +44,13 @@
parse_data(data_path, &block)
end
private
# Opens the file at data_path, walks it line by line, and yields one Record
# to the caller's block for every whitespace-separated token on each line.
#
# Reading this diff: the lines marked "-" are the removed implementation,
# which kept a vocabulary hash and gave each distinct word an integer id in
# order of first appearance (starting at 0), yielding Record.new(word, id).
# The line marked "+" is the replacement, which yields only the word.
#
# The method uses a bare `yield`, so it expects to be called with a block.
def parse_data(data_path)
- index = 0
- vocabulary = {}
# Block form of File.open: the file handle `f` is scoped to the block.
File.open(data_path) do |f|
f.each_line do |line|
# split with no argument breaks the line on whitespace.
line.split.each do |word|
- word = word.strip
- unless vocabulary.key?(word)
- vocabulary[word] = index
- index += 1
- end
- yield(Record.new(word, vocabulary[word]))
+ yield(Record.new(word.strip))
end
end
end
end
end