require 'set'
require 'digest'

module Licensee
  module ContentHelper
    DIGEST = Digest::SHA1
    END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i
    HR_REGEX = /[=\-\*][=\-\*\s]{3,}/
    ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
    ALL_RIGHTS_RESERVED_REGEX = /\Aall rights reserved\.?$/i
    WHITESPACE_REGEX = /\s+/
    MARKDOWN_HEADING_REGEX = /\A\s*#+/
    VERSION_REGEX = /\Aversion.*$/i
    MARKUP_REGEX = /[#_*=~\[\]()`|>]+/
    DEVELOPED_BY_REGEX = /\Adeveloped by:.*?\n\n/im
    QUOTE_BEGIN_REGEX = /[`'"‘“]/
    QUOTE_END_REGEX = /['"’”]/

    # A set of each word in the license, without duplicates
    def wordset
      @wordset ||= if content_normalized
        content_normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
      end
    end

    # Number of characteres in the normalized content
    def length
      return 0 unless content_normalized
      content_normalized.length
    end

    # Number of characters that could be added/removed to still be
    # considered a potential match
    def max_delta
      @max_delta ||= (length * Licensee.inverse_confidence_threshold).to_i
    end

    # Given another license or project file, calculates the difference in length
    def length_delta(other)
      (length - other.length).abs
    end

    # Given another license or project file, calculates the similarity
    # as a percentage of words in common
    def similarity(other)
      overlap = (wordset & other.wordset).size
      total = wordset.size + other.wordset.size
      100.0 * (overlap * 2.0 / total)
    end

    # SHA1 of the normalized content
    def content_hash
      @content_hash ||= DIGEST.hexdigest content_normalized
    end

    # Content with the title and version removed
    # The first time should normally be the attribution line
    # Used to dry up `content_normalized` but we need the case sensitive
    # content with attribution first to detect attribuion in LicenseFile
    def content_without_title_and_version
      @content_without_title_and_version ||= begin
        string = content.strip
        string = strip_markdown_headings(string)
        string = strip_hrs(string)
        string = strip_title(string) while string =~ ContentHelper.title_regex
        strip_version(string).strip
      end
    end

    # Content without title, version, copyright, whitespace, or insturctions
    #
    # wrap - Optional width to wrap the content
    #
    # Returns a string
    def content_normalized(wrap: nil)
      return unless content
      @content_normalized ||= begin
        string = content_without_title_and_version.downcase
        while string =~ Matchers::Copyright::REGEX
          string = strip_copyright(string)
        end
        string = strip_all_rights_reserved(string)
        string = strip_developed_by(string)
        string, _partition, _instructions = string.partition(END_OF_TERMS_REGEX)
        string = normalize_lists(string)
        string = normalize_quotes(string)
        string = normalize_https(string)
        string = strip_markup(string)
        strip_whitespace(string)
      end

      if wrap.nil?
        @content_normalized
      else
        Licensee::ContentHelper.wrap(@content_normalized, wrap)
      end
    end

    # Wrap text to the given line length
    def self.wrap(text, line_width = 80)
      return if text.nil?
      text = text.clone
      text.gsub!(/([^\n])\n([^\n])/, '\1 \2')

      text = text.split("\n").collect do |line|
        if line.length > line_width
          line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
        else
          line
        end
      end * "\n"

      text.strip
    end

    def self.format_percent(float)
      "#{format('%.2f', float)}%"
    end

    def self.title_regex
      licenses = Licensee::License.all(hidden: true, psuedo: false)
      titles = licenses.map(&:title_regex)

      # Title regex must include the version to support matching within
      # families, but for sake of normalization, we can be less strict
      without_versions = licenses.map do |license|
        next if license.title == license.name_without_version
        Regexp.new Regexp.escape(license.name_without_version), 'i'
      end
      titles.concat(without_versions.compact)

      /\A\s*\(?(the )?#{Regexp.union titles}.*$/i
    end

    private

    def strip_title(string)
      strip(string, ContentHelper.title_regex)
    end

    def strip_version(string)
      strip(string, VERSION_REGEX)
    end

    def strip_copyright(string)
      strip(string, Matchers::Copyright::REGEX)
    end

    # Strip HRs from MPL
    def strip_hrs(string)
      strip(string, HR_REGEX)
    end

    # Strip leading #s from the document
    def strip_markdown_headings(string)
      strip(string, MARKDOWN_HEADING_REGEX)
    end

    def strip_whitespace(string)
      strip(string, WHITESPACE_REGEX)
    end

    def strip_all_rights_reserved(string)
      strip(string, ALL_RIGHTS_RESERVED_REGEX)
    end

    def strip_markup(string)
      strip(string, MARKUP_REGEX)
    end

    def strip_developed_by(string)
      strip(string, DEVELOPED_BY_REGEX)
    end

    def strip(string, regex)
      string.gsub(regex, ' ').squeeze(' ').strip
    end

    # Replace all enclosing quotes with double quotes
    # Single versus double quotes don't alter the meaning, and it's easier to
    # strip double quotes if we still want to allow possessives
    def normalize_quotes(string)
      string.gsub(/#{QUOTE_BEGIN_REGEX}+([\w -]*?\w)#{QUOTE_END_REGEX}+/,
                  '"\1"')
    end

    def normalize_https(string)
      string.gsub(/http:/, 'https:')
    end

    def normalize_lists(string)
      string.gsub(/^\s*(\d\.|\*)/, '-')
    end
  end
end