# frozen_string_literal: true

module Commonmeta
  module Readers
    module SchemaOrgReader
      SO_TO_DC_RELATION_TYPES = {
        'citation' => 'References',
        'isBasedOn' => 'IsSupplementedBy',
        'sameAs' => 'IsIdenticalTo',
        'isPartOf' => 'IsPartOf',
        'hasPart' => 'HasPart',
        'isPredecessor' => 'IsPreviousVersionOf',
        'isSuccessor' => 'IsNewVersionOf'
      }.freeze

      SO_TO_DC_REVERSE_RELATION_TYPES = {
        'citation' => 'IsReferencedBy',
        'isBasedOn' => 'IsSupplementTo',
        'sameAs' => 'IsIdenticalTo',
        'isPartOf' => 'HasPart',
        'hasPart' => 'IsPartOf',
        'isPredecessor' => 'IsNewVersionOf',
        'isSuccessor' => 'IsPreviousVersionOf'
      }.freeze

      def get_schema_org(id: nil, **_options)
        return { 'string' => nil, 'state' => 'not_found' } unless id.present?

        url = normalize_id(id)

        # follow redirects
        response = HTTP.follow.get(url)
        return { 'string' => nil, 'state' => 'not_found' } unless response.status.success?

        doc = Nokogiri::HTML(response.body.to_s)

        # workaround for xhtml documents: parse the text content of the JSON-LD script tag
        nodeset = doc.at("script[type='application/ld+json']")
        hsh = JSON.parse(nodeset ? nodeset.text : '{}')

        # workaround for doi as canonical_url but not included with schema.org
        link = doc.css("link[rel='canonical']")
        hsh['@id'] = link[0]['href'] if link.present?

        # workaround if license not included with schema.org
        license = doc.at("meta[name='dc.rights']")
        hsh['license'] = license['content'] if license.present?

        # workaround for html language attribute if no language is set via schema.org
        lang = doc.at("meta[name='dc.language']") || doc.at("meta[name='citation_language']")
        lang = lang['content'] if lang.present?
        lang = doc.at('html')['lang'] if lang.blank?
        hsh['inLanguage'] = lang if hsh['inLanguage'].blank?

        # workaround if issn not included with schema.org
        name = doc.at("meta[property='og:site_name']")
        issn = doc.at("meta[name='citation_issn']")
        hsh['isPartOf'] = { 'name' => name ? name['content'] : nil,
                            'issn' => issn ? issn['content'] : nil }.compact

        # workaround if not all authors are included with schema.org (e.g. in Ghost metadata)
        authors = doc.css("meta[name='citation_author']").map do |author|
          { '@type' => 'Person', 'name' => author['content'] }
        end
        hsh['author'] = hsh['creator'] if hsh['author'].blank? && hsh['creator'].present?
        hsh['author'] = authors if authors.length > Array.wrap(hsh['author']).length

        # workaround if publisher not included with schema.org (e.g. Zenodo)
        if hsh['publisher'].blank?
          publisher = doc.at("meta[property='og:site_name']")
          publisher = publisher['content'] if publisher.present?
          hsh['publisher'] = { 'name' => publisher }
        end

        { 'string' => hsh.to_json }
      end
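
      # A minimal usage sketch (hypothetical): this module is expected to be mixed into a
      # class that also provides helpers such as normalize_id. The URL is illustrative only.
      #
      #   result = get_schema_org(id: 'https://example.org/posts/my-article')
      #   result['string'] # serialized schema.org metadata (JSON string), or nil
      #   result['state']  # 'not_found' when the id is missing or the page cannot be fetched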

      def read_schema_org(string: nil, **options)
        if string.present?
          errors = jsonlint(string)
          return { 'errors' => errors } if errors.present?
        end

        read_options = ActiveSupport::HashWithIndifferentAccess.new(
          options.except(:doi, :id, :url, :sandbox, :validate, :ra)
        )

        meta = string.present? ? JSON.parse(string) : {}

        alternate_identifiers = Array.wrap(meta.fetch('identifier', nil)).map do |r|
          r = normalize_id(r) if r.is_a?(String)
          if r.is_a?(String) && URI(r).host != 'doi.org'
            { 'alternateIdentifierType' => 'URL', 'alternateIdentifier' => r }
          elsif r.is_a?(Hash) && r['propertyID'] != 'doi'
            { 'alternateIdentifierType' => get_identifier_type(r['propertyID']),
              'alternateIdentifier' => r['value'] }
          end
        end.compact.uniq

        id = options[:doi]
        id = meta.fetch('@id', nil) if id.blank? && URI(meta.fetch('@id', '')).host == 'doi.org'
        id = meta.fetch('identifier', nil) if id.blank?
        id = normalize_id(id)

        schema_org = meta.fetch('@type', nil) && meta.fetch('@type').camelcase
        type = Commonmeta::Utils::SO_TO_CM_TRANSLATIONS[schema_org]
        additional_type = meta.fetch('additionalType', nil)

        authors = meta.fetch('author', nil) || meta.fetch('creator', nil)
        # Authors should be an object; if it's just a plain string don't try to parse it.
        creators = get_authors(from_schema_org(Array.wrap(authors))) unless authors.is_a?(String)
        contributors = get_authors(from_schema_org(Array.wrap(meta.fetch('editor', nil))))
        publisher = parse_attributes(meta.fetch('publisher', nil), content: 'name', first: true)

        ct = schema_org == 'Dataset' ? 'includedInDataCatalog' : 'Periodical'
        container = if meta.fetch(ct, nil).present?
                      url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'url', first: true)

                      { 'type' => schema_org == 'Dataset' ? 'DataRepository' : 'Periodical',
                        'title' => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'name', first: true),
                        'identifier' => url,
                        'identifierType' => url.present? ? 'URL' : nil,
                        'volume' => meta.fetch('volumeNumber', nil),
                        'issue' => meta.fetch('issueNumber', nil),
                        'firstPage' => meta.fetch('pageStart', nil),
                        'lastPage' => meta.fetch('pageEnd', nil) }.compact
                    elsif %w[BlogPosting Article].include?(schema_org)
                      issn = meta.dig('isPartOf', 'issn')
                      url = meta.dig('publisher', 'url')

                      { 'type' => 'Periodical',
                        'title' => meta.dig('isPartOf', 'name'),
                        'identifier' => issn.presence || url.presence,
                        'identifierType' => issn.present? ? 'ISSN' : 'URL' }.compact
                    else
                      {}
                    end

        # treat these relationships as references
        references = (Array.wrap(meta.fetch('citation', nil)) +
                      Array.wrap(meta.fetch('isBasedOn', nil))).map do |r|
          schema_org_reference(r)
        end

        rights_uri = parse_attributes(meta.dig('license'), content: 'id') || meta.dig('license')
        license = hsh_to_spdx('rightsURI' => rights_uri)

        funding_references = Array.wrap(meta.fetch('funder', nil)).compact.map do |fr|
          if fr['@id'].present?
            { 'funderName' => fr['name'],
              'funderIdentifier' => fr['@id'],
              'funderIdentifierType' => fr['@id'].to_s.start_with?('https://doi.org/10.13039') ? 'Crossref Funder ID' : 'Other' }.compact
          else
            { 'funderName' => fr['name'] }.compact
          end
        end

        # strip milliseconds from iso8601, as edtf library doesn't handle them
        date = {}
        if Date.edtf(strip_milliseconds(meta.fetch('datePublished', nil))).present?
          date['published'] = strip_milliseconds(meta.fetch('datePublished'))
        end
        if Date.edtf(strip_milliseconds(meta.fetch('dateCreated', nil))).present?
          date['created'] = strip_milliseconds(meta.fetch('dateCreated'))
        end
        if Date.edtf(strip_milliseconds(meta.fetch('dateModified', nil))).present?
          date['updated'] = strip_milliseconds(meta.fetch('dateModified'))
        end

        language = case meta.fetch('inLanguage', nil)
                   when String
                     meta.fetch('inLanguage')
                   when Array
                     meta.fetch('inLanguage').first
                   when Object
                     meta.dig('inLanguage', 'alternateName') || meta.dig('inLanguage', 'name')
                   end

        state = meta.present? || read_options.present? ? 'findable' : 'not_found'

        geo_locations = Array.wrap(meta.fetch('spatialCoverage', nil)).map do |gl|
          if gl.dig('geo', 'box')
            s, w, n, e = gl.dig('geo', 'box').split(' ', 4)
            geo_location_box = { 'westBoundLongitude' => w,
                                 'eastBoundLongitude' => e,
                                 'southBoundLatitude' => s,
                                 'northBoundLatitude' => n }.compact.presence
          else
            geo_location_box = nil
          end
          geo_location_point = { 'pointLongitude' => gl.dig('geo', 'longitude'),
                                 'pointLatitude' => gl.dig('geo', 'latitude') }.compact.presence

          { 'geoLocationPlace' => gl.dig('geo', 'address'),
            'geoLocationPoint' => geo_location_point,
            'geoLocationBox' => geo_location_box }.compact
        end

        # handle keywords as array and as comma-separated string
        subjects = meta.fetch('keywords', nil)
        subjects = subjects.to_s.downcase.split(', ') if subjects.is_a?(String)
        subjects = Array.wrap(subjects).reduce([]) do |sum, subject|
          sum += name_to_fos(subject)
          sum
        end

        schema_version = meta.fetch('schemaVersion', nil).to_s.presence || 'http://datacite.org/schema/kernel-4'

        { 'id' => id,
          'type' => type,
          'additional_type' => additional_type,
          'alternate_identifiers' => alternate_identifiers.presence,
          'url' => normalize_id(meta.fetch('url', nil)),
          'content_url' => Array.wrap(meta.fetch('contentUrl', nil)),
          'sizes' => Array.wrap(meta.fetch('contentSize', nil)),
          'formats' => Array.wrap(meta.fetch('encodingFormat', nil) || meta.fetch('fileFormat', nil)),
          'titles' => if meta.fetch('name', nil).present?
                        [{ 'title' => meta.fetch('name', nil) }]
                      else
                        [{ 'title' => meta.fetch('headline', nil) }]
                      end,
          'creators' => creators,
          'contributors' => contributors,
          'publisher' => { 'name' => publisher },
          'provider' => parse_attributes(meta.fetch('provider', nil), content: 'name', first: true),
          'container' => container,
          'references' => references,
          'date' => date,
          'descriptions' => if meta.fetch('description', nil).present?
                              [{ 'description' => sanitize(meta.fetch('description')),
                                 'descriptionType' => 'Abstract' }]
                            end,
          'license' => license.presence,
          'version' => meta.fetch('version', nil).to_s.presence,
          'subjects' => subjects,
          'language' => language,
          'state' => state,
          'schema_version' => schema_version,
          'funding_references' => funding_references,
          'geo_locations' => geo_locations }.compact.merge(read_options)
      end
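
      # A minimal sketch of reading a schema.org JSON-LD string (hypothetical input;
      # real documents carry many more properties, and the returned 'type' value is
      # whatever Commonmeta::Utils::SO_TO_CM_TRANSLATIONS maps 'BlogPosting' to):
      #
      #   string = {
      #     '@type' => 'BlogPosting',
      #     '@id' => 'https://doi.org/10.1234/example',
      #     'name' => 'Example post',
      #     'datePublished' => '2023-01-01'
      #   }.to_json
      #   meta = read_schema_org(string: string)
      #   meta['titles']            # => [{ 'title' => 'Example post' }]
      #   meta['date']['published'] # => '2023-01-01'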

      # use separate fields for doi and url. Auto-generate key from doi or url
      def schema_org_reference(reference)
        id = normalize_id(reference.fetch('@id', nil))
        doi = doi_from_url(id)
        url = doi ? nil : normalize_url(id)

        { 'key' => id,
          'doi' => doi,
          'url' => url }.compact
      end

      def schema_org_is_identical_to(meta)
        schema_org_related_identifier(meta, relation_type: 'sameAs')
      end

      def schema_org_is_part_of(meta)
        schema_org_related_identifier(meta, relation_type: 'isPartOf')
      end

      def schema_org_has_part(meta)
        schema_org_related_identifier(meta, relation_type: 'hasPart')
      end

      def schema_org_is_previous_version_of(meta)
        schema_org_related_identifier(meta, relation_type: 'PredecessorOf')
      end

      def schema_org_is_new_version_of(meta)
        schema_org_related_identifier(meta, relation_type: 'SuccessorOf')
      end

      def schema_org_references(meta)
        schema_org_related_identifier(meta, relation_type: 'citation')
      end

      def schema_org_is_referenced_by(meta)
        schema_org_reverse_related_identifier(meta, relation_type: 'citation')
      end

      def schema_org_is_supplement_to(meta)
        schema_org_reverse_related_identifier(meta, relation_type: 'isBasedOn')
      end

      def schema_org_is_supplemented_by(meta)
        schema_org_related_identifier(meta, relation_type: 'isBasedOn')
      end
    end
  end
end