lib/datasets/wikipedia.rb in red-datasets-0.1.4 vs lib/datasets/wikipedia.rb in red-datasets-0.1.5

- old
+ new

@@ -1,8 +1,9 @@ require "rexml/streamlistener" require "rexml/parsers/baseparser" require "rexml/parsers/streamparser" +require "time" require_relative "dataset" module Datasets class Wikipedia < Dataset @@ -53,14 +54,12 @@ private def open_data(&block) base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2" data_path = cache_dir_path + base_name - unless data_path.exist? - data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}" - download(data_path, data_url) - end + data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}" + download(data_path, data_url) extract_bz2(data_path, &block) end def type_in_path @@ -151,10 +150,10 @@ def text(data) @text_stack.last << data end - def cdata(contnet) + def cdata(content) @text_stack.last << content end private def on_page(page)