require "English"
require "rexml/document"

require_relative "dataset"

module Datasets
  class LIBSVMDatasetList < Dataset
    # One downloadable file listed for a dataset: the link text (+name+), the
    # +url+ built from the metadata URL plus the link's href, and an optional
    # +note+ taken from the text next to the link.
    #
    # Inside this class the constant shadows Ruby's core File class, which is
    # why file I/O elsewhere in the class is written as ::File.
    File = Struct.new(:name, :url, :note)
    # One dataset entry: its name, the free-text source and preprocessing
    # descriptions, the class/data/feature counts and the list of files.
    class Record < Struct.new(:name,
                              :source,
                              :preprocessing,
                              :n_classes,
                              :n_data,
                              :n_features,
                              :files)
      # Converts the record into a plain Hash, converting every entry of
      # +files+ with its own +to_h+ as well.
      #
      # @return [Hash] member names mapped to values. +:files+ is an Array
      #   of Hashes, or +nil+ when no file list has been assigned.
      def to_h
        hash = super
        files = hash[:files]
        # A freshly built record has no file list yet; leave nil untouched
        # instead of failing on nil.
        hash[:files] = files.map(&:to_h) if files
        hash
      end
    end

    # Fills in the dataset's static metadata (id, name and index URL).
    #
    # The description is stored as a lambda wrapping extract_description, so
    # the description text is only built when that lambda is called.
    def initialize
      super()
      meta = @metadata
      meta.id = "libsvm-dataset-list"
      meta.name = "LIBSVM dataset list"
      meta.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
      meta.description = lambda { extract_description }
    end

    # Yields one Record for every data row of the index page.
    #
    # The first <tr> found in the document is treated as the header row and
    # skipped. In every other row the first child element of the first child
    # element is taken as the dataset link: its text becomes the record name
    # and its href is handed to parse_detail to fill in the remaining fields.
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when called without a block
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input|
        # TODO: Improve performance
        rows = REXML::Document.new(input).get_elements("//tr")
        rows.each_with_index do |row, index|
          # Row 0 is the table header.
          next if index.zero?

          link = row.elements.first.elements.first
          href = link.attributes["href"]
          record = Record.new(link.text)
          record.files = []
          parse_detail(href, record)
          yield(record)
        end
      end
    end

    private
    # Yields an open read handle on the cached copy of the index page.
    #
    # The page is stored as "index.html" under cache_dir_path and is fetched
    # from the metadata URL only when that file does not exist yet.
    def open_data
      index_path = cache_dir_path + "index.html"
      download(index_path, @metadata.url) unless index_path.exist?
      ::File.open(index_path) { |input| yield(input) }
    end

    # Builds the description text from the index page.
    #
    # Looks at the direct children of <body>: everything up to and including
    # the first <h1> is ignored, then the text of each following element is
    # collected until the first <hr>. Elements whose text is empty are
    # dropped and the rest are joined with blank lines.
    #
    # @return [String]
    def extract_description
      open_data do |input|
        children = REXML::Document.new(input).get_elements("//body/*")
        # Skip the leading elements and the <h1> itself.
        after_heading = children.drop_while { |child| child.name != "h1" }.drop(1)
        content = after_heading.take_while { |child| child.name != "hr" }
        paragraphs = content.map { |child| extract_text(child) }
        paragraphs.reject(&:empty?).join("\n\n")
      end
    end

    # Returns the text found in +element+ and its descendants as one line:
    # all text nodes are concatenated, every run of spaces, tabs and
    # newlines becomes a single space, and the ends are stripped.
    #
    # @param element [REXML::Element]
    # @return [String]
    def extract_text(element)
      raw = REXML::XPath.match(element, ".//text()").join("")
      raw.tr("\t\n", " ").squeeze(" ").strip
    end

    # Yields an open read handle on the cached copy of a detail page.
    #
    # +detail+ is the page's relative path; it is appended both to
    # cache_dir_path (cache location) and to the metadata URL (download
    # source). The download only happens when the cached file is missing.
    def open_detail(detail)
      detail_path = cache_dir_path + detail
      download(detail_path, @metadata.url + detail) unless detail_path.exist?
      ::File.open(detail_path) { |input| yield(input) }
    end

    # Fills +record+ from the dataset's section of a detail page.
    #
    # +href+ is split on "#": the left part is the page path given to
    # open_detail, the right part is looked up as the value of a +name+
    # attribute anywhere in that page. The element following that anchor is
    # scanned child by child, and each child is recognised by the prefix of
    # its text ("Source: ", "Preprocessing: ", "# of classes: ",
    # "# of data: ", "# of features: ", "Files:"). Unrecognised items are
    # ignored. Counts may contain thousands separators, which are removed
    # before conversion.
    #
    # @param href [String] link of the form "page.html#anchor"
    # @param record [Record] record to update; +files+ must already be an
    #   Array because parsed files are appended to it
    def parse_detail(href, record)
      path, id = href.split("#")
      open_detail(path) do |detail|
        detail_document = REXML::Document.new(detail)
        anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
        # next_element skips text nodes, so whitespace between the anchor
        # and its list no longer yields a Text node (which has no
        # each_element) the way next_sibling did.
        ul = anchor.next_element
        ul.each_element do |li|
          case extract_text(li)
          when /\ASource: /
            record.source = Regexp.last_match.post_match
          when /\APreprocessing: /
            record.preprocessing = Regexp.last_match.post_match
          when /\A\# of classes: (\d+)/
            record.n_classes = Integer(Regexp.last_match(1), 10)
          when /\A\# of data: ([\d,]+)/
            record.n_data = Integer(Regexp.last_match(1).delete(","), 10)
          when /\A\# of features: ([\d,]+)/
            record.n_features = Integer(Regexp.last_match(1).delete(","), 10)
          when /\AFiles:/
            # The item's first child element holds one entry per file.
            li.elements.first.each_element do |file_li|
              file_a = file_li.elements.first
              file = File.new
              file.name = file_a.text
              file.url = @metadata.url + file_a.attributes["href"]
              # Element#text is the entry's first direct text node, e.g.
              # " (training)"; it is nil when the entry has only the link.
              file_note = file_li.text
              file.note = file_note.strip.gsub(/[()]/, "") if file_note
              record.files << file
            end
          end
        end
      end
    end
  end
end