lib/datasets/wikipedia.rb in red-datasets-0.1.5 vs lib/datasets/wikipedia.rb in red-datasets-0.1.6

- old
+ new

@@ -51,16 +51,25 @@ parser.parse end end private + def base_name + "#{@language}wiki-latest-#{type_in_path}.xml.bz2" + end + + def data_path + cache_dir_path + base_name + end + def open_data(&block) - base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2" - data_path = cache_dir_path + base_name data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}" - download(data_path, data_url) - - extract_bz2(data_path, &block) + bz2 = Enumerator.new do |yielder| + download(data_path, data_url) do |bz2_chunk| + yielder << bz2_chunk + end + end + extract_bz2(bz2, &block) end def type_in_path case @type when :articles