# frozen_string_literal: true

require 'json'
require 'find'
require 'git'
require 'net/http'
require 'geo_combine/logger'

module GeoCombine
  # Harvests Geoblacklight documents from OpenGeoMetadata for indexing.
  #
  # Repositories are cloned/pulled with git into +ogm_path+, then walked to
  # yield every JSON record whose schema version matches +schema_version+.
  class Harvester
    attr_reader :ogm_path, :schema_version

    # Non-metadata repositories that shouldn't be harvested
    #
    # @return [Array<String>] repository names to exclude
    def self.denylist
      %w[
        GeoCombine
        aardvark
        metadata-issues
        ogm_utils-python
        opengeometadata.github.io
        opengeometadata-rails
        gbl-1_to_aardvark
      ]
    end

    # GitHub API endpoint for OpenGeoMetadata repositories
    #
    # NOTE(review): GitHub's REST API documents a maximum per_page of 100, so
    # this presumably returns at most 100 repositories in a single response;
    # confirm whether pagination is needed as the organization grows.
    #
    # @return [URI]
    def self.ogm_api_uri
      URI('https://api.github.com/orgs/opengeometadata/repos?per_page=1000')
    end

    # @param ogm_path [String] directory that holds the cloned repositories
    #   (defaults to the OGM_PATH environment variable)
    # @param schema_version [String] schema version of records to yield
    #   (defaults to the SCHEMA_VERSION environment variable)
    # @param logger [#debug, #info, #warn] destination for log messages
    def initialize(
      ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
      schema_version: ENV.fetch('SCHEMA_VERSION', '1.0'),
      logger: GeoCombine::Logger.logger
    )
      @ogm_path = ogm_path
      @schema_version = schema_version
      @logger = logger
    end

    # Enumerable of docs to index, for passing to an indexer
    #
    # Files that cannot be parsed as JSON are logged and skipped rather than
    # aborting the whole traversal.
    #
    # @yieldparam record [Hash] a parsed document matching the schema version
    # @yieldparam path [String] path of the file the record was read from
    # @return [Enumerator] when no block is given
    def docs_to_index
      return to_enum(:docs_to_index) unless block_given?

      @logger.info "loading documents from #{ogm_path}"
      Find.find(@ogm_path) do |path|
        # skip non-json and layers.json files
        unless candidate_document?(path)
          @logger.debug "skipping #{path}; not a geoblacklight JSON document"
          next
        end

        records_in(path).each do |record|
          # skip indexing if this record has a different schema version than what we want
          record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
          # 'id' is the last fallback so records lacking the other two fields still log an identifier
          record_id = record['layer_slug_s'] || record['dc_identifier_s'] || record['id']
          if record_schema != @schema_version
            @logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
            next
          end

          @logger.debug "found record #{record_id} at #{path}"
          yield record, path
        end
      end
    end

    # Update a repository via git
    # If the repository doesn't exist, clone it.
    #
    # @param repo [String] repository name
    # @return [String] the repository name
    def pull(repo)
      repo_path = File.join(@ogm_path, repo)
      clone(repo) unless File.directory? repo_path

      Git.open(repo_path).pull
      @logger.info "updated #{repo}"
      repo
    end

    # Update all repositories
    # Return the names of repositories updated
    #
    # @return [Array<String>]
    def pull_all
      updated = repositories.map { |repo| pull(repo) }.compact
      @logger.info "updated #{updated.size} repositories"
      updated
    end

    # Clone a repository via git
    # If the repository already exists, skip it.
    #
    # @param repo [String] repository name
    # @return [String, nil] the repository name, or nil when the clone was skipped
    def clone(repo)
      repo_path = File.join(@ogm_path, repo)
      repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"

      # Skip if exists. Checked before contacting GitHub so that no API
      # request is made for a repository that won't be cloned.
      if File.directory? repo_path
        @logger.warn "skipping clone to #{repo_path}; directory exists"
        return nil
      end

      # Warn if archived or empty. `&.` keeps a response without a 'size'
      # key (e.g. an API error payload) from raising NoMethodError here.
      repo_info = repository_info(repo)
      @logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
      @logger.warn "repository is empty: #{repo_url}" if repo_info['size']&.zero?

      Git.clone(repo_url, nil, path: ogm_path, depth: 1)
      @logger.info "cloned #{repo_url} to #{repo_path}"
      repo
    end

    # Clone all repositories via git
    # Return the names of repositories cloned.
    #
    # @return [Array<String>]
    def clone_all
      cloned = repositories.map { |repo| clone(repo) }.compact
      @logger.info "cloned #{cloned.size} repositories"
      cloned
    end

    private

    # Whether the path names a .json file other than layers.json.
    # Directories are excluded so one that happens to end in ".json" is not read as a file.
    def candidate_document?(path)
      name = File.basename(path)
      name.end_with?('.json') && name != 'layers.json' && !File.directory?(path)
    end

    # Parse the file at path into a flat list of Hash records.
    # A file may hold a single record or an array of records; JSON values that
    # are not objects are dropped. Returns [] (after logging a warning) when
    # the file is not valid JSON.
    def records_in(path)
      [JSON.parse(File.read(path))].flatten.grep(Hash)
    rescue JSON::ParserError => e
      @logger.warn "skipping #{path}; invalid JSON: #{e.message}"
      []
    end

    # List of repository names to harvest
    # Empty, archived and denylisted repositories are left out; the result is memoized.
    def repositories
      @repositories ||= JSON.parse(Net::HTTP.get(self.class.ogm_api_uri))
                            .filter { |repo| repo['size'].positive? }
                            .reject { |repo| repo['archived'] }
                            .map { |repo| repo['name'] }
                            .reject { |name| self.class.denylist.include? name }
    end

    # Fetch the GitHub API description of a single repository as a Hash
    def repository_info(repo_name)
      JSON.parse(Net::HTTP.get(URI("https://api.github.com/repos/opengeometadata/#{repo_name}")))
    end
  end
end