lib/datasets/wikipedia.rb in red-datasets-0.0.8 vs lib/datasets/wikipedia.rb in red-datasets-0.0.9
- old
+ new
@@ -50,26 +50,18 @@
parser.parse
end
end
private
# NOTE(review): this span is a diff hunk, not runnable Ruby. Lines prefixed
# "- " are the red-datasets-0.0.8 body, lines prefixed "+ " are the 0.0.9
# replacement, and unprefixed lines are shared by both versions.
#
# open_data builds the dump file name from @language and type_in_path, joins
# it onto cache_dir_path, and calls download(data_path, data_url) with the
# dumps.wikimedia.org URL only when that cached path does not exist yet.
# The two versions differ in how the .bz2 file is then handed to the block:
#   0.0.8 - spawns `bzcat` with its stdout redirected into an IO.pipe, closes
#           the parent's copy of the write end, yields the read end, and in
#           `ensure` closes the read end and waits for the child process.
#   0.0.9 - takes the block explicitly (&block) and forwards it to
#           extract_bz2(data_path, &block). That helper is not visible here;
#           presumably it yields the decompressed stream in the same way -
#           TODO confirm against its definition.
- def open_data
+ def open_data(&block)
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
data_path = cache_dir_path + base_name
unless data_path.exist?
data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
download(data_path, data_url)
end
- input, output = IO.pipe
- pid = spawn("bzcat", data_path.to_s, {:out => output})
- begin
- output.close
- yield(input)
- ensure
- input.close
- Process.waitpid(pid)
- end
+ extract_bz2(data_path, &block)
end
def type_in_path
case @type
when :articles