require 'enumerator'
require 'logger'
require 'zlib'
require 'stringio'
require 'net/http'
require 'net/https'
require 'uri'

# Raised when the SEO check cannot proceed (for example, when no sitemap
# can be located for the site).
#
# Inherits from StandardError rather than Exception so that a plain
# `rescue` / `rescue StandardError` in calling code can handle it; code that
# rescues SEOException or Exception explicitly keeps working unchanged.
class SEOException < StandardError
end

# Crawls the pages listed in a site's sitemap and prints a report of common
# SEO problems: unreachable pages, missing or duplicated titles and meta
# descriptions, and URLs that contain numeric IDs, too many hyphenated
# keywords or deeply nested directories.
class SEOChecker
  # Captures the text of every <loc> element, without surrounding whitespace.
  LOC_PATTERN = %r{<loc>\s*(.*?)\s*</loc>}m
  # Captures the document head; the opening tag may carry attributes.
  HEAD_PATTERN = %r{<head\b[^>]*>(.*?)</head\s*>}mi
  # Captures the (possibly multi-line) page title.
  TITLE_PATTERN = %r{<title\b[^>]*>(.*?)</title\s*>}mi
  # Matches one complete <meta ...> tag, self-closing or not.
  META_TAG_PATTERN = /<meta\b[^>]*>/mi
  # Recognises name="description" inside a single <meta> tag.
  META_DESCRIPTION_NAME = /\sname\s*=\s*["']description["']/i
  # Captures the content attribute value ($2); the closing quote must be the
  # same character as the opening one, so apostrophes inside a double-quoted
  # value do not end it early.
  META_CONTENT = /\scontent\s*=\s*(["'])(.*?)\1/mi
  # Matches a purely numeric path segment, including a trailing one and one
  # followed by ".htm"/".html".
  ID_SEGMENT = %r{/\d+(?=/|\.html?|\z)}

  # @param url [String] base URL of the site, e.g. "http://example.com"
  # @param options [Hash]
  #   :batch_size    - number of locations to check between pauses
  #                    (defaults to all locations in one batch)
  #   :interval_time - seconds to sleep between batches (defaults to 0)
  #   :logger        - true to log to STDOUT, or an object responding to #debug
  def initialize(url, options={})
    @url = url
    @locations = []
    @titles = {}
    @descriptions = {}
    @id_urls = {}
    @excessive_keywords = []
    @nesting_subdirectories = []
    @no_titles = []
    @no_descriptions = []
    @unreachables = []
    @batch_size = options[:batch_size] ? options[:batch_size].to_i : nil
    @interval_time = options[:interval_time].to_i
    @logger = options[:logger] == true ? Logger.new(STDOUT) : options[:logger]
  end

  # Runs the whole check: discovers the sitemap (robots.txt first, then
  # /sitemap.xml, then /sitemap.xml.gz), checks every listed location and
  # prints the report. When no location can be found the SEOException
  # message is printed instead of being raised.
  def check
    check_robot
    check_sitemap("#{@url}/sitemap.xml") if @locations.empty?
    check_sitemap("#{@url}/sitemap.xml.gz") if @locations.empty?
    raise SEOException, "Error: There is no sitemap.xml or sitemap.xml.gz" if @locations.empty?

    check_location

    report
  rescue SEOException => e
    puts e.message
  end

  # Reads /robots.txt and loads every sitemap announced by a "Sitemap:" line.
  def check_robot
    uri = URI.parse(@url)
    uri.path = '/robots.txt'
    response = get_response(uri)
    return unless response.is_a?(Net::HTTPSuccess)

    # Match per line and capture only non-whitespace so that the "\r" of a
    # CRLF file (or the following line) never ends up inside the URL.
    sitemap_urls = response.body.to_s.scan(/^[ \t]*Sitemap:[ \t]*(\S+)/i).flatten
    sitemap_urls.each { |sitemap_url| check_sitemap(sitemap_url) }
  end

  # Downloads a sitemap and adds its locations to the list of pages to check.
  # A sitemap index is followed recursively into each child sitemap.
  #
  # @param url [String] absolute URL of a sitemap or sitemap index
  def check_sitemap(url)
    @logger.debug "checking #{url} file" if @logger
    response = get_response(URI.parse(url))
    return unless response.is_a?(Net::HTTPSuccess)

    body = sitemap_body(response)
    locs = body.scan(LOC_PATTERN).flatten
    if body.include?("<sitemap>")
      locs.each { |loc| check_sitemap(loc) }
    else
      # Accumulate instead of overwrite so that every child of a sitemap
      # index contributes; the union also drops duplicate locations.
      @locations |= locs
    end
  end

  # Fetches every collected location, in batches, and records the SEO
  # findings for each of them. Sleeps for the configured interval between
  # batches (not after the last one).
  def check_location
    return if @locations.empty?

    # each_slice rejects a size of 0, so fall back to a single batch.
    batch_size = @batch_size.to_i > 0 ? @batch_size : @locations.size
    batches = @locations.each_slice(batch_size).to_a
    batches.each_with_index do |batch_locations, index|
      batch_locations.each { |location| inspect_location(location) }
      sleep(@interval_time) if @interval_time > 0 && index < batches.size - 1
    end
  end

  private
    # Performs a GET request for +uri+ and returns the Net::HTTPResponse.
    def get_response(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true if uri.scheme == 'https'
      request = Net::HTTP::Get.new(uri.request_uri)
      request["User-Agent"] = "seo-checker"
      http.request(request)
    end

    # Returns the sitemap text of +response+, gunzipping it when needed.
    # Gzip is detected from the magic bytes rather than the URL suffix, so a
    # ".gz" URL whose body was already decoded, or a gzip body served from a
    # URL that does not end in "gz", is still handled.
    def sitemap_body(response)
      raw = response.body.to_s
      return raw unless raw.getbyte(0) == 0x1f && raw.getbyte(1) == 0x8b

      reader = Zlib::GzipReader.new(StringIO.new(raw))
      begin
        reader.read
      ensure
        reader.close
      end
    end

    # Returns the successful response for +location+, or nil when the page
    # answers with a non-2xx status or the request itself fails (invalid URL,
    # DNS failure, timeout, ...), so one bad page cannot abort the crawl.
    def fetch_page(location)
      response = get_response(URI.parse(location))
      response if response.is_a?(Net::HTTPSuccess)
    rescue StandardError => e
      @logger.debug "failed to fetch #{location}: #{e.message}" if @logger
      nil
    end

    # Runs all per-page checks for a single location.
    def inspect_location(location)
      @logger.debug "checking #{location}" if @logger
      response = fetch_page(location)
      unless response
        @unreachables << location
        return
      end

      if response.body.to_s =~ HEAD_PATTERN
        head = $1
        check_title(head, location)
        check_description(head, location)
      else
        # Without a <head> there can be neither a title nor a description.
        @no_titles << location
        @no_descriptions << location
      end
      check_url(location)
    end

    # Records the page title (whitespace-normalised), or flags the location
    # when the title is missing or blank.
    def check_title(header_string, location)
      title = header_string =~ TITLE_PATTERN ? $1.gsub(/\s+/, ' ').strip : ''
      if title != ''
        (@titles[title] ||= []) << location
      else
        @no_titles << location
      end
    end

    # Records the meta description, or flags the location when it is missing
    # or blank.
    def check_description(header_string, location)
      description = extract_description(header_string)
      if description && description != ''
        (@descriptions[description] ||= []) << location
      else
        @no_descriptions << location
      end
    end

    # Returns the content of the first <meta name="description"> tag that has
    # a content attribute (attributes in any order), or nil when there is none.
    def extract_description(header_string)
      header_string.scan(META_TAG_PATTERN).each do |tag|
        next unless tag =~ META_DESCRIPTION_NAME
        return $2.strip if tag =~ META_CONTENT
      end
      nil
    end

    # Flags URLs that use numeric IDs, more than five hyphen-separated
    # keywords in one segment, or more than eight "/"-separated parts.
    def check_url(location)
      items = location.split('/')
      if items.any? { |item| item =~ /^\d+$/ } || items.last =~ /^\d+\.htm(l)?/
        # Key by the URL with IDs replaced so that a whole family of URLs
        # (/posts/1, /posts/2, ...) is reported once.
        @id_urls[location.gsub(ID_SEGMENT, '/id')] = location
      end
      if items.any? { |item| item.split('-').size > 5 }
        @excessive_keywords << location
      end
      if items.size > 8
        @nesting_subdirectories << location
      end
    end

    # Prints every finding collected during the crawl.
    def report
      report_non_empty(@unreachables, "are unreachable.")
      report_non_empty(@no_titles, "have no title.")
      report_non_empty(@no_descriptions, "have no description.")
      report_same(@titles, 'title')
      report_same(@descriptions, 'description')
      report_non_empty(@id_urls.values, "use ID number in URL.")
      report_non_empty(@excessive_keywords, "use excessive keywords in URL.")
      report_non_empty(@nesting_subdirectories, "have deep nesting of subdirectories in URL.")
    end

    # Prints each value shared by more than one location (first five shown).
    #
    # variables - Hash mapping a value to the locations that use it
    # name      - label used in the message ("title", "description")
    def report_same(variables, name)
      variables.each do |variable, locations|
        if locations.size > 1
          print "#{locations.slice(0, 5).join(",\n")} #{'and ...' if locations.size > 5} have the same #{name} '#{variable}'.\n\n"
        end
      end
    end

    # Prints the first five entries of +variables+ followed by +suffix+;
    # prints nothing for an empty list.
    def report_non_empty(variables, suffix)
      unless variables.empty?
        print "#{variables.slice(0, 5).join(",\n")} #{'and ...' if variables.size > 5} #{suffix}\n\n"
      end
    end
end