require "date"
require "fileutils"
require "json"
require "open-uri"
require "yaml"

require "faraday"

module GitHub
  # Entry point of the sync: pulls advisories through GraphQLAPIClient and
  # asks each one to write its rubysec yaml file(s).
  class GitHubAdvisorySync
    # Makes sure there is a rubysec advisory for every GitHub advisory by
    # writing one yaml file per GitHub Advisory that is not already present
    # in this repo.
    #
    # min_year is the earliest CVE year to sync. Newer advisories matter
    # most, so this lets the caller limit how far back the sync reaches.
    # Advisories without a CVE identifier are always included.
    #
    # Returns the list of filenames that were written.
    def self.sync(min_year: 2015)
      client = GraphQLAPIClient.new
      candidates = client.retrieve_all_rubygem_publishable_advisories

      # Keep only advisories whose CVE year is min_year or later
      candidates.select! do |advisory|
        cve = advisory.cve_id
        next true unless cve # no CVE at all: always kept

        # a CVE id that does not match yields a nil year, which to_i turns
        # into 0, so such advisories are dropped
        year = cve.match(/^CVE-(\d+)-\d+$/).to_a[1]
        year.to_i >= min_year
      end

      files_written = candidates.flat_map(&:write_files)

      puts "\nSync completed"
      summary =
        if files_written.empty?
          "Nothing to sync today! All CVEs starting from #{min_year} are already present"
        else
          "Wrote these files:\n#{files_written.to_yaml}"
        end
      puts summary

      files_written
    end
  end

  # Client for the GitHub GraphQL API, used to download security advisories.
  # Requests are authenticated with the token read from the GH_API_TOKEN
  # environment variable.
  class GraphQLAPIClient
    GITHUB_API_URL = "https://api.github.com/graphql"

    # Raised when the GH_API_TOKEN environment variable is missing or blank
    GitHubApiTokenMissingError = Class.new(StandardError)

    # An error class which gets raised when a GraphQL request fails
    GitHubGraphQLAPIError = Class.new(StandardError)

    # Return a lazily initialized connection to the GitHub API.
    #
    # The connection is memoized, so the adapter argument only has an effect
    # on the first call.
    #
    # Raises GitHubApiTokenMissingError when no API token is configured.
    def github_api(adapter = :net_http)
      @faraday_connection ||= begin
        puts "Initializing GitHub API connection to URL: #{GITHUB_API_URL}"
        Faraday.new do |conn_builder|
          conn_builder.adapter adapter
          conn_builder.headers = {
            "User-Agent" => "rubysec/ruby-advisory-db rubysec sync script",
            "Content-Type" => "application/json",
            "Authorization" => "token #{github_api_token}"
          }
        end
      end
    end

    # All interactions with the API go through this method to standardize
    # error checking and how queries and requests are formed.
    #
    # graphql_query_name: name of a constant in GraphQLQueries
    # graphql_variables:  variables sent along with the query
    #
    # Returns the parsed JSON response body.
    # Raises GitHubGraphQLAPIError when the HTTP status is not 200 or when
    # the response body carries an "errors" entry.
    def github_graphql_query(graphql_query_name, graphql_variables = {})
      graphql_query_str = GraphQLQueries.const_get graphql_query_name
      graphql_body = JSON.generate query: graphql_query_str,
                                   variables: graphql_variables
      puts "Executing GraphQL request: #{graphql_query_name}. Request variables:\n#{graphql_variables.to_yaml}\n"
      faraday_response = github_api.post do |req|
        req.url GITHUB_API_URL
        req.body = graphql_body
      end
      puts "Got response code: #{faraday_response.status}"
      unless faraday_response.status == 200
        raise(GitHubGraphQLAPIError, "GitHub GraphQL request to #{faraday_response.env.url} failed: #{faraday_response.body}")
      end

      body_obj = JSON.parse faraday_response.body
      # GraphQL reports query level failures inside a 200 response
      if body_obj["errors"]
        raise(GitHubGraphQLAPIError, body_obj["errors"].map { |e| e["message"] }.join(", "))
      end

      body_obj
    end

    # Download advisories page by page and wrap each node in a GitHubAdvisory.
    #
    # max_pages: upper bound on the number of pages requested
    # page_size: number of advisories requested per page
    #
    # When max_pages is reached while the API still reports a next page, a
    # warning is printed to stderr, because the returned list is incomplete.
    def retrieve_all_github_advisories(max_pages = 1000, page_size = 100)
      all_advisories = []
      variables = { "first" => page_size }
      more_pages_available = false
      max_pages.times do |page_num|
        puts "Getting page #{page_num + 1} of GitHub Advisories"
        page = github_graphql_query(:GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY, variables)
        advisories_page = page["data"]["securityAdvisories"]
        all_advisories.concat(advisories_page["nodes"])

        page_info = advisories_page["pageInfo"]
        more_pages_available = page_info["hasNextPage"] ? true : false
        break unless more_pages_available

        # continue after the last advisory of this page
        variables["after"] = page_info["endCursor"]
      end
      if more_pages_available
        warn "WARNING: stopped after #{max_pages} pages but GitHub reports more advisories; results are incomplete"
      end
      puts "Retrieved #{all_advisories.length} Advisories from GitHub API"

      all_advisories.map do |advisory_graphql_obj|
        GitHubAdvisory.new github_advisory_graphql_object: advisory_graphql_obj
      end
    end

    # All advisories that are not withdrawn and that have at least one
    # vulnerability for ruby.
    def retrieve_all_rubygem_publishable_advisories
      retrieve_all_github_advisories
        .reject(&:withdrawn?)
        .select(&:has_ruby_vulnerabilities?)
    end

    # The GraphQL query documents, looked up by constant name in
    # github_graphql_query.
    module GraphQLQueries
      GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY = <<-GRAPHQL.freeze
        query($first: Int, $after: String) {
          securityAdvisories(first: $first, after: $after) {
            pageInfo {
              endCursor
              hasNextPage
              hasPreviousPage
              startCursor
            }
            nodes {
              identifiers {
                type
                value
              }
              summary
              description
              severity
              references {
                url
              }
              publishedAt
              withdrawnAt
              vulnerabilities(ecosystem:RUBYGEMS, first: 10) {
                nodes {
                  package {
                    name
                    ecosystem
                  }
                  vulnerableVersionRange
                  firstPatchedVersion {
                    identifier
                  }
                }
              }
            }
          }
        }
      GRAPHQL
    end

    private

    # Read the API token from the environment.
    # A blank token is treated like a missing one, because it could only
    # produce an empty Authorization header value.
    def github_api_token
      token = ENV["GH_API_TOKEN"]
      if token.nil? || token.strip.empty?
        raise GitHubApiTokenMissingError, "Unable to make API requests.  Must define 'GH_API_TOKEN' environment variable."
      end

      token
    end
  end

  # Wraps one security advisory node from the GitHub GraphQL API (a Hash with
  # string keys) and writes rubysec advisory template files for the packages
  # it affects.
  class GitHubAdvisory
    attr_reader :github_advisory_graphql_object

    def initialize(github_advisory_graphql_object:)
      @github_advisory_graphql_object = github_advisory_graphql_object
    end

    # the advisory identifiers, each one a hash with a "type" and a "value"
    def identifier_list
      github_advisory_graphql_object["identifiers"]
    end

    # extract the CVE identifier from the GitHub Advisory identifier list
    # returns nil when the advisory has no CVE identifier
    def cve_id
      cve_id_obj = identifier_list.find { |id| id["type"] == "CVE" }
      return nil unless cve_id_obj

      cve_id_obj["value"]
    end

    # extract the GHSA identifier from the GitHub Advisory identifier list
    # NOTE(review): assumes every advisory carries a GHSA identifier;
    # this raises NoMethodError otherwise
    def ghsa_id
      id_obj = identifier_list.find { |id| id["type"] == "GHSA" }
      id_obj["value"]
    end

    # advisories should be identified by CVE ID if there is one
    # but for maintainer submitted advisories there may not be one,
    # so a GitHub Security Advisory ID (ghsa_id) is used instead
    def primary_id
      cve_id || ghsa_id
    end

    # return the publication day as a Date (it is rendered like 2019-03-21
    # in the generated yaml), or nil when there is no "publishedAt" value
    def published_day
      published_at = github_advisory_graphql_object["publishedAt"]
      return nil unless published_at

      Date.parse(published_at)
    end

    # the distinct names of the packages affected by this advisory
    def package_names
      vulnerabilities.map { |v| v["package"]["name"] }.uniq
    end

    # the rubysec advisory file for each affected package
    # these are the same paths write_files writes to
    def rubysec_filenames
      package_names.map { |package_name| rubysec_filename(package_name) }
    end

    def withdrawn?
      !github_advisory_graphql_object["withdrawnAt"].nil?
    end

    # the url of the first reference, or nil when there are no references
    def external_reference
      github_advisory_graphql_object.dig("references", 0, "url")
    end

    def vulnerabilities
      github_advisory_graphql_object["vulnerabilities"]["nodes"]
    end

    def has_ruby_vulnerabilities?
      vulnerabilities.any? do |vuln|
        vuln["package"]["ecosystem"] == "RUBYGEMS"
      end
    end

    def some_rubysec_files_do_not_exist?
      rubysec_filenames.any? { |filename| !File.exist?(filename) }
    end

    # Write an advisory template file for every affected package that does
    # not have one yet. Paths are relative to the current working directory.
    #
    # Returns the list of filenames that were written.
    def write_files
      return [] unless some_rubysec_files_do_not_exist?

      files_written = []
      vulnerabilities.each do |vulnerability|
        filename_to_write = rubysec_filename(vulnerability["package"]["name"])
        # also skips a package that is listed in more than one vulnerability
        next if File.exist?(filename_to_write)

        # mkdir_p also creates the "gems" directory itself when it is missing
        FileUtils.mkdir_p(File.dirname(filename_to_write))
        File.open(filename_to_write, "w") do |file|
          # create an automatically generated advisory yaml file
          file.write advisory_data(vulnerability).to_yaml

          # The data we just wrote is incomplete,
          # and therefore should not be committed as is
          # We can not directly translate from GitHub to rubysec advisory format
          #
          # The patched_versions field is not exactly available.
          # - GitHub has a first_patched_version field,
          #   but rubysec advisory needs a ruby version spec
          #
          # The unaffected_versions field is similarly not directly available
          # This optional field must be inferred from the vulnerableVersionRange
          #
          # To help write those fields, we put all the github data below.
          #
          # The second block of yaml in a .yaml file is ignored (after the second "---" line)
          # This effectively makes this data a large comment
          # Still it should be removed before the data goes into rubysec
          file.write "\n\n# GitHub advisory data below - **Remove this data before committing**\n"
          file.write "# Use this data to write patched_versions (and potentially unaffected_versions) above\n"
          file.write github_advisory_graphql_object.to_yaml
        end
        puts "Wrote: #{filename_to_write}"
        files_written << filename_to_write
      end

      files_written
    end

    private

    # path of the rubysec advisory file of this advisory for one package
    def rubysec_filename(package_name)
      File.join("gems", package_name, "#{primary_id}.yml")
    end

    # the rubysec advisory fields that can be filled in from GitHub data
    def advisory_data(vulnerability)
      data = {
        "gem" => vulnerability["package"]["name"],
        "ghsa" => ghsa_id[5..], # without the leading "GHSA-"
        "url" => external_reference,
        "date" => published_day,
        "title" => github_advisory_graphql_object["summary"],
        "description" => github_advisory_graphql_object["description"],
        "cvss_v3" => "<FILL IN IF AVAILABLE>",
        "patched_versions" => [ "<FILL IN SEE BELOW>" ],
        "unaffected_versions" => [ "<OPTIONAL: FILL IN SEE BELOW>" ]
      }
      data["cve"] = cve_id[4..20] if cve_id # without the leading "CVE-"
      data
    end
  end
end