lib/enter_rockstar/scraper/wikia.rb in enter-rockstar-0.1 vs lib/enter_rockstar/scraper/wikia.rb in enter-rockstar-0.2
- old
+ new
@@ -1,31 +1,30 @@
# frozen_string_literal: true
require 'open-uri'
require 'nokogiri'
-require 'json'
module EnterRockstar
module Scraper
# lyrics scraper for lyrics.wikia.com
class Wikia
# Scheme and host that get prepended to the site-relative paths fetched by this class.
START_HOST = 'http://lyrics.wikia.com'
# Root directory under which the per-category / per-band lyric folders are created.
DATA_DIR = 'lyrics'
# Value handed to `sleep` ahead of page fetches, i.e. 0.1 seconds per request.
SLEEP_BETWEEN_REQUESTS = 0.1
# 0.1 exposed only the tree; 0.2 also exposes url, category_name and output as readers.
- attr_reader :tree
+ attr_reader :tree, :url, :category_name, :output
# Sets up a scraper for one lyrics category.
#
# category_name: label used in the output file name and stored in @category_name
# url:           site-relative path of the category page (START_HOST is prepended
#                when it is fetched)
# data_dir:      directory part of the output file path
def initialize(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data')
# starts out as an empty index
@tree = {}
# Only the file suffix differs between versions: .json in 0.1, .json.gz in 0.2.
# NOTE(review): the .gz suffix suggests the 0.2 file is gzip-compressed by the
# Utils helpers - that code is outside this excerpt, confirm.
- @output = "#{data_dir}/wikia_#{category_name}.json"
+ @output = "#{data_dir}/wikia_#{category_name}.json.gz"
@url = url
@category_name = category_name
end
# Fetches a category listing page, iterates over its member links and then
# follows the "next page" link by calling itself again.
# NOTE(review): the middle of this method is elided by the hunk boundary below,
# so what happens to each link (and where test_limit is used) is not visible here.
#
# url:        URL to fetch; when nil it defaults to START_HOST + @url
# test_limit: not referenced in the visible lines
def parse_category(url: nil, test_limit: false)
url ||= START_HOST + @url
# 0.1 called URI.open(url); 0.2 parses the string into a URI first and opens that.
- html = URI.open(url)
+ html = URI.parse(url).open
doc = Nokogiri::HTML(html)
# get all category member links and sort them by band and album
doc.css('li.category-page__member a').each do |category_link|
# skip links whose title contains 'Category:'
next if category_link.attr('title').include?('Category:')
@@ -47,21 +46,15 @@
# href of the first pagination "next" anchor, or nil when there is none
next_url = doc.css('a.category-page__pagination-next')&.first&.attr('href')
# NOTE(review): next_url is used as-is (no START_HOST prefix), so it is presumably
# absolute - confirm. test_limit is not forwarded to the recursive call.
parse_category(url: next_url) unless next_url.nil?
end
# Serializes @tree to JSON and stores it at @output.
# 0.1 wrote the file itself (File.new / write / close) and printed a confirmation;
# 0.2 hands the path and the JSON string to EnterRockstar::Utils.save_file.
# NOTE(review): save_file is defined outside this excerpt - whether it compresses
# or prints anything cannot be seen here.
# NOTE(review): to_json comes from the json library, and this diff removes
# `require 'json'` from the file - confirm it is still loaded somewhere.
def save_category
- puts
- out = File.new(@output, 'w')
- out.write @tree.to_json
- out.close
- puts "Saved JSON data to #{@output}"
+ EnterRockstar::Utils.save_file(@output, @tree.to_json)
end
# Replaces @tree with the data previously stored at @output.
# 0.1 read the file and parsed it twice, keeping the second copy in @new_tree;
# 0.2 drops @new_tree and reads through EnterRockstar::Utils.load_json.
# NOTE(review): the result of load_json is passed to JSON.parse, so it is expected
# to be a JSON string - the helper is outside this excerpt, confirm.
# NOTE(review): JSON is still referenced although this diff removes
# `require 'json'` from the file - confirm it is loaded elsewhere.
def load_saved_json
- file = File.read(@output)
- @tree = JSON.parse(file)
- @new_tree = JSON.parse(file)
+ @tree = JSON.parse(EnterRockstar::Utils.load_json(@output))
end
def print_indexed_tree
@tree.each_with_index do |(key, _val), index|
puts "#{index}: #{key}"
@@ -76,41 +69,42 @@
val.each do |k, v|
dirname = k == 'band_url' ? [DATA_DIR, @category_name, key].join('/') : [DATA_DIR, @category_name, key, k].join('/')
FileUtils.mkdir_p dirname
- parse_page(v, dirname, key)
+ parse_page(v, dirname)
end
end
-
- @tree = @new_tree
- save_category
end
# Fetches one band/album page and processes every song link found on it.
#
# url:     site-relative path of the page (START_HOST is prepended)
# dirname: directory the songs belong to; per-album sub-directories are created
#          beneath it when the page lists several albums
#
# 0.1 also took the band name, printed the url, and recorded each album heading
# in @new_tree[band]; 0.2 removes all three.
# 0.1 called parse_song for its side effect only; 0.2 takes parse_song's return
# value and passes it to save_song unless it is nil.
- def parse_page(url, dirname, band)
- puts url
+ def parse_page(url, dirname)
# pause before the request
sleep SLEEP_BETWEEN_REQUESTS
- html = URI.open(START_HOST + url)
+ html = URI.parse(START_HOST + url).open
doc = Nokogiri::HTML(html)
# no linked album headings on the page => take the song list straight from the content
if doc.css('h2 span.mw-headline a').count.zero?
# single album page listed on the category
doc.css('div.mw-content-text ol li a').each do |song|
- parse_song(song.attr('href'), dirname, song.text) if song&.attr('href')
# skip anchors that have no href
+ next unless song&.attr('href')
+
# NOTE(review): song.text is interpolated into the file path unescaped - confirm
# how titles containing '/' are meant to be handled.
+ lyrics = parse_song(song.attr('href'), dirname, song.text)
+ save_song("#{dirname}/#{song.text}.txt", lyrics) unless lyrics.nil?
end
puts
else
# one pass per linked album heading
doc.css('h2 span.mw-headline a').each do |album|
puts album.text
# some band pages have extra albums that are not listed in the category page for some reason
album_dirname = [dirname, album.text].join('/')
FileUtils.mkdir_p album_dirname
- @new_tree[band][album.text] = album.attr('href')
# get song pages
# from the link's grandparent (presumably the <h2> - confirm), select the links
# inside the <ol> that follows the next sibling <div>
album.parent.parent.css('+ div + ol > li a').each do |song|
- parse_song(song.attr('href'), album_dirname, song.text) if song&.attr('href')
# skip anchors that have no href
+ next unless song&.attr('href')
+
+ lyrics = parse_song(song.attr('href'), album_dirname, song.text)
+ save_song("#{album_dirname}/#{song.text}.txt", lyrics) unless lyrics.nil?
end
puts
end
end
end
@@ -124,24 +118,21 @@
FileUtils.mkdir_p without_last.join('/')
return if File.exist?(songfile)
print '.'
sleep SLEEP_BETWEEN_REQUESTS
- html = URI.open(START_HOST + url)
+ html = URI.parse(START_HOST + url).open
doc = Nokogiri::HTML(html)
lyrics = doc.css('div.lyricbox').first
return if lyrics.nil?
+ return if lyrics.css('a')&.first&.attr('href') == '/wiki/Category:Instrumental'
- if lyrics.css('a')&.first&.attr('href') == '/wiki/Category:Instrumental'
- # instrumental song, whatever
- else
- proper_text = lyrics.inner_html.gsub(%r{<div.*?(\/div>)}, '').split('<br>').join("\n")
+ lyrics.inner_html.split('<br>').join("\n").gsub(%r{<\/?[^>]*>}, '')
+ end
- out = File.new(songfile, 'w')
- out.write proper_text
- out.close
- end
# Added in 0.2: stores one song by passing the target path and the lyrics text
# to EnterRockstar::Utils.save_plain (defined outside this excerpt).
# It takes over the file writing that 0.1 did inline with File.new / write / close.
#
# songfile: path of the file to write
# contents: text to store
+ def save_song(songfile, contents)
+ EnterRockstar::Utils.save_plain(songfile, contents)
end
end
end
end