# frozen_string_literal: true

require 'open-uri'
require 'nokogiri'
require 'json'
require 'fileutils'

module EnterRockstar
  module Scraper
    # lyrics scraper for lyrics.wikia.com
    class Wikia
      START_HOST = 'http://lyrics.wikia.com'
      DATA_DIR = 'lyrics'
      SLEEP_BETWEEN_REQUESTS = 0.1

      attr_reader :tree

      def initialize(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data')
        @tree = {}
        @output = "#{data_dir}/wikia_#{category_name}.json"
        @url = url
        @category_name = category_name
      end

      def parse_category(url: nil, test_limit: false)
        url ||= START_HOST + @url
        html = URI.open(url)
        doc = Nokogiri::HTML(html)

        # get all category member links and sort them by band and album
        doc.css('li.category-page__member a').each do |category_link|
          next if category_link.attr('title').include?('Category:')

          band, album = category_link.attr('title').split(':')
          @tree[band] ||= {}

          if album.nil?
            @tree[band]['band_url'] = category_link.attr('href')
          else
            @tree[band][album] = category_link.attr('href')
          end
        end

        return if test_limit # when testing, scrape only the first page

        print '.'

        # get the next page if one exists and parse that too
        next_url = doc.css('a.category-page__pagination-next').first&.attr('href')
        parse_category(url: next_url) unless next_url.nil?
      end

      def save_category
        puts
        File.write(@output, @tree.to_json)
        puts "Saved JSON data to #{@output}"
      end

      def load_saved_json
        file = File.read(@output)
        # parse twice so @new_tree is an independent copy that can collect
        # extra albums discovered while scraping band pages
        @tree = JSON.parse(file)
        @new_tree = JSON.parse(file)
      end

      def print_indexed_tree
        @tree.each_with_index do |(key, _val), index|
          puts "#{index}: #{key}"
        end
      end

      # expects load_saved_json to have been called first so @new_tree is populated
      def parse_all_pages(start_index: 0)
        @tree.each_with_index do |(key, val), index|
          next if index < start_index

          puts "#{index}: #{key}"

          val.each do |k, v|
            dirname = if k == 'band_url'
                        [DATA_DIR, @category_name, key].join('/')
                      else
                        [DATA_DIR, @category_name, key, k].join('/')
                      end
            FileUtils.mkdir_p dirname
            parse_page(v, dirname, key)
          end
        end

        @tree = @new_tree
        save_category
      end

      def parse_page(url, dirname, band)
        puts url
        sleep SLEEP_BETWEEN_REQUESTS
        html = URI.open(START_HOST + url)
        doc = Nokogiri::HTML(html)

        if doc.css('h2 span.mw-headline a').count.zero?
          # single album page listed on the category
          doc.css('div.mw-content-text ol li a').each do |song|
            parse_song(song.attr('href'), dirname, song.text) if song.attr('href')
          end
          puts
        else
          doc.css('h2 span.mw-headline a').each do |album|
            puts album.text

            # some band pages list extra albums that are not on the category page for some reason
            album_dirname = [dirname, album.text].join('/')
            FileUtils.mkdir_p album_dirname
            @new_tree[band][album.text] = album.attr('href')

            # get song pages
            album.parent.parent.css('+ div + ol > li a').each do |song|
              parse_song(song.attr('href'), album_dirname, song.text) if song.attr('href')
            end
            puts
          end
        end
      end

      def parse_song(url, dirname, songname)
        return if url.start_with? 'http' # skip external links

        songfile = "#{dirname}/#{songname}.txt"
        FileUtils.mkdir_p File.dirname(songfile)
        return if File.exist?(songfile)

        print '.'
        sleep SLEEP_BETWEEN_REQUESTS
        html = URI.open(START_HOST + url)
        doc = Nokogiri::HTML(html)

        lyrics = doc.css('div.lyricbox').first
        return if lyrics.nil?

        if lyrics.css('a').first&.attr('href') == '/wiki/Category:Instrumental'
          # instrumental song, nothing to save
        else
          proper_text = lyrics.inner_html.gsub(%r{