lib/enter_rockstar/scraper/wikia.rb in enter-rockstar-0.1 vs lib/enter_rockstar/scraper/wikia.rb in enter-rockstar-0.2

- old
+ new

@@ -1,31 +1,30 @@ # frozen_string_literal: true require 'open-uri' require 'nokogiri' -require 'json' module EnterRockstar module Scraper # lyrics scraper for lyrics.wikia.com class Wikia START_HOST = 'http://lyrics.wikia.com' DATA_DIR = 'lyrics' SLEEP_BETWEEN_REQUESTS = 0.1 - attr_reader :tree + attr_reader :tree, :url, :category_name, :output def initialize(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data') @tree = {} - @output = "#{data_dir}/wikia_#{category_name}.json" + @output = "#{data_dir}/wikia_#{category_name}.json.gz" @url = url @category_name = category_name end def parse_category(url: nil, test_limit: false) url ||= START_HOST + @url - html = URI.open(url) + html = URI.parse(url).open doc = Nokogiri::HTML(html) # get all category member links and sort them by band and album doc.css('li.category-page__member a').each do |category_link| next if category_link.attr('title').include?('Category:') @@ -47,21 +46,15 @@ next_url = doc.css('a.category-page__pagination-next')&.first&.attr('href') parse_category(url: next_url) unless next_url.nil? end def save_category - puts - out = File.new(@output, 'w') - out.write @tree.to_json - out.close - puts "Saved JSON data to #{@output}" + EnterRockstar::Utils.save_file(@output, @tree.to_json) end def load_saved_json - file = File.read(@output) - @tree = JSON.parse(file) - @new_tree = JSON.parse(file) + @tree = JSON.parse(EnterRockstar::Utils.load_json(@output)) end def print_indexed_tree @tree.each_with_index do |(key, _val), index| puts "#{index}: #{key}" @@ -76,41 +69,42 @@ val.each do |k, v| dirname = k == 'band_url' ? [DATA_DIR, @category_name, key].join('/') : [DATA_DIR, @category_name, key, k].join('/') FileUtils.mkdir_p dirname - parse_page(v, dirname, key) + parse_page(v, dirname) end end - - @tree = @new_tree - save_category end - def parse_page(url, dirname, band) - puts url + def parse_page(url, dirname) sleep SLEEP_BETWEEN_REQUESTS - html = URI.open(START_HOST + url) + html = URI.parse(START_HOST + url).open doc = Nokogiri::HTML(html) if doc.css('h2 span.mw-headline a').count.zero? # single album page listed on the category doc.css('div.mw-content-text ol li a').each do |song| - parse_song(song.attr('href'), dirname, song.text) if song&.attr('href') + next unless song&.attr('href') + + lyrics = parse_song(song.attr('href'), dirname, song.text) + save_song("#{dirname}/#{song.text}.txt", lyrics) unless lyrics.nil? end puts else doc.css('h2 span.mw-headline a').each do |album| puts album.text # some band pages have extra albums that are not listed in the category page for some reason album_dirname = [dirname, album.text].join('/') FileUtils.mkdir_p album_dirname - @new_tree[band][album.text] = album.attr('href') # get song pages album.parent.parent.css('+ div + ol > li a').each do |song| - parse_song(song.attr('href'), album_dirname, song.text) if song&.attr('href') + next unless song&.attr('href') + + lyrics = parse_song(song.attr('href'), album_dirname, song.text) + save_song("#{album_dirname}/#{song.text}.txt", lyrics) unless lyrics.nil? end puts end end end @@ -124,24 +118,21 @@ FileUtils.mkdir_p without_last.join('/') return if File.exist?(songfile) print '.' sleep SLEEP_BETWEEN_REQUESTS - html = URI.open(START_HOST + url) + html = URI.parse(START_HOST + url).open doc = Nokogiri::HTML(html) lyrics = doc.css('div.lyricbox').first return if lyrics.nil? + return if lyrics.css('a')&.first&.attr('href') == '/wiki/Category:Instrumental' - if lyrics.css('a')&.first&.attr('href') == '/wiki/Category:Instrumental' - # instrumental song, whatever - else - proper_text = lyrics.inner_html.gsub(%r{<div.*?(\/div>)}, '').split('<br>').join("\n") + lyrics.inner_html.split('<br>').join("\n").gsub(%r{<\/?[^>]*>}, '') + end - out = File.new(songfile, 'w') - out.write proper_text - out.close - end + def save_song(songfile, contents) + EnterRockstar::Utils.save_plain(songfile, contents) end end end end