lib/datasets/wikipedia.rb in red-datasets-0.1.4 vs lib/datasets/wikipedia.rb in red-datasets-0.1.5
- old
+ new
@@ -1,8 +1,9 @@
require "rexml/streamlistener"
require "rexml/parsers/baseparser"
require "rexml/parsers/streamparser"
+require "time"
require_relative "dataset"
module Datasets
class Wikipedia < Dataset
@@ -53,14 +54,12 @@
private
def open_data(&block)
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
data_path = cache_dir_path + base_name
- unless data_path.exist?
- data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
- download(data_path, data_url)
- end
+ data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
+ download(data_path, data_url)
extract_bz2(data_path, &block)
end
def type_in_path
@@ -151,10 +150,10 @@
# Appends +data+ to the last element of @text_stack and returns the
# result of that append.
# NOTE(review): presumably the REXML stream-listener callback for
# character data — confirm against the enclosing listener class.
def text(data)
  current_buffer = @text_stack.last
  current_buffer << data
end
- def cdata(contnet)
+ def cdata(content)
@text_stack.last << content
end
private
def on_page(page)