# frozen_string_literal: true

module Bolognese
  module Readers
    # Reads DataCite XML metadata, either fetched from the DOI API
    # (`get_datacite`) or supplied as a string (`read_datacite`), and converts
    # it into the hash structure assembled at the end of `read_datacite`.
    #
    # The module relies on helpers that are defined elsewhere (for example
    # `doi_api_url`, `normalize_doi`, `sanitize`, `get_authors`,
    # `parse_attributes`, `get_series_information`) and on ActiveSupport core
    # extensions (`present?`, `blank?`, `presence`, `Array.wrap`).
    module DataciteReader
      # Fetches the record for a DOI from the API and returns its XML together
      # with a few registration attributes.
      #
      # @param id [String, nil] identifier handed to `doi_api_url`
      # @param options [Hash] passed through unchanged to `doi_api_url`
      # @return [Hash] `{ "string" => nil, "state" => "not_found" }` when `id`
      #   is blank or the response carries no `data.attributes`; otherwise a
      #   hash with the keys "string", "url", "state", "date_registered",
      #   "date_updated", "provider_id", "client_id" and "content_url"
      def get_datacite(id: nil, **options)
        return { "string" => nil, "state" => "not_found" } unless id.present?

        api_url = doi_api_url(id, options)
        response = Maremma.get(api_url)
        attributes = response.body.dig("data", "attributes")
        return { "string" => nil, "state" => "not_found" } unless attributes.present?

        # the "xml" attribute is Base64-encoded
        string = attributes.fetch('xml', nil)
        string = Base64.decode64(string) if string.present?

        if string.present?
          doc = Nokogiri::XML(string, nil, 'UTF-8', &:noblanks)

          # remove leading and trailing whitespace in text nodes
          doc.xpath("//text()").each do |node|
            if node.content =~ /\S/
              node.content = node.content.strip
            else
              node.remove
            end
          end
          string = doc.to_xml(:indent => 2)
        end

        # side-loaded resources of the response, used for client and media lookups
        included = Array.wrap(response.body.fetch("included", nil))

        client = included.find { |m| m["type"] == "clients" }
        client_id = client.to_h.fetch("id", nil)
        provider_id = Array.wrap(client.to_h.fetch("relationships", nil))
                           .find { |m| m["provider"].present? }
                           .to_h
                           .dig("provider", "data", "id")

        # prefer the explicit contentUrl attribute, otherwise collect media urls
        content_url = attributes.fetch("contentUrl", nil) ||
                      included.select { |m| m["type"] == "media" }
                              .map { |m| m.dig("attributes", "url") }
                              .compact

        { "string" => string,
          "url" => attributes.fetch("url", nil),
          "state" => attributes.fetch("state", nil),
          "date_registered" => attributes.fetch("registered", nil),
          "date_updated" => attributes.fetch("updated", nil),
          "provider_id" => provider_id,
          "client_id" => client_id,
          "content_url" => content_url }
      end

      # Parses a DataCite XML string into a metadata hash.
      #
      # @param string [String, nil] DataCite XML
      # @param options [Hash] recognised keys are :doi, :id, :url, :sandbox,
      #   :validate and :ra; every other key is treated as an override and is
      #   merged over the resulting hash
      # @return [Hash] the metadata hash, or `{ "errors" => errors }` when
      #   `options[:validate]` is set and `datacite_errors` reports errors
      def read_datacite(string: nil, **options)
        read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url, :sandbox, :validate, :ra))

        doc = Nokogiri::XML(string, nil, 'UTF-8', &:noblanks)
        if read_options.present?
          schema_version = "http://datacite.org/schema/kernel-4"
        else
          ns = doc.collect_namespaces.find { |_k, v| v.start_with?("http://datacite.org/schema/kernel") }
          schema_version = Array.wrap(ns).last || "http://datacite.org/schema/kernel-4"
        end
        doc.remove_namespaces!
        string = doc.to_xml(:indent => 2)

        meta = Maremma.from_xml(string).to_h.fetch("resource", {})

        # validate only when option is set, as this step is expensive and
        # not needed if XML comes from DataCite MDS
        if options[:validate]
          errors = datacite_errors(xml: string, schema_version: schema_version)
          return { "errors" => errors } if errors.present?
        end

        # an explicit :doi option wins over the identifier found in the XML
        if options[:doi]
          id = normalize_doi(options[:doi], sandbox: options[:sandbox])
        else
          id = normalize_doi(meta.dig("identifier", "__content__") || options[:id], sandbox: options[:sandbox])
        end

        identifiers = Array.wrap(meta.dig("alternateIdentifiers", "alternateIdentifier")).map do |r|
          if r["__content__"].present?
            { "identifierType" => get_identifier_type(r["alternateIdentifierType"]), "identifier" => r["__content__"] }
          end
        end.compact

        resource_type_general = meta.dig("resourceType", "resourceTypeGeneral")
        resource_type = meta.dig("resourceType", "__content__")
        # lookup keys for the translation tables
        cr_key = resource_type.to_s.underscore.camelcase
        dc_key = resource_type_general.to_s.dasherize
        schema_org = Bolognese::Utils::CR_TO_SO_TRANSLATIONS[cr_key] || Bolognese::Utils::DC_TO_SO_TRANSLATIONS[dc_key] || "CreativeWork"
        types = {
          "resourceTypeGeneral" => resource_type_general,
          "resourceType" => resource_type,
          "schemaOrg" => schema_org,
          "citeproc" => Bolognese::Utils::CR_TO_CP_TRANSLATIONS[cr_key] || Bolognese::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || "article",
          "bibtex" => Bolognese::Utils::CR_TO_BIB_TRANSLATIONS[cr_key] || Bolognese::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || "misc",
          "ris" => Bolognese::Utils::CR_TO_RIS_TRANSLATIONS[cr_key] || Bolognese::Utils::DC_TO_RIS_TRANSLATIONS[dc_key] || "GEN"
        }.compact

        titles = get_titles(meta)

        # publisher is either a plain string or a hash carrying attributes
        publisher = Array.wrap(meta.dig("publisher")).map do |r|
          if r.blank?
            nil
          elsif r.is_a?(String)
            { "name" => r.strip }
          elsif r.is_a?(Hash)
            {
              "name" => r["__content__"].present? ? r["__content__"].strip : nil,
              "publisherIdentifier" => r["publisherIdentifierScheme"] == "ROR" ? normalize_ror(r["publisherIdentifier"]) : r["publisherIdentifier"],
              "publisherIdentifierScheme" => r["publisherIdentifierScheme"],
              "schemeUri" => r["schemeURI"],
              "lang" => r["lang"],
            }.compact
          end
        end.compact.first

        descriptions = Array.wrap(meta.dig("descriptions", "description")).map do |r|
          if r.blank?
            nil
          elsif r.is_a?(String)
            { "description" => sanitize(r, new_line: true), "descriptionType" => "Abstract" }
          elsif r.is_a?(Hash)
            { "description" => sanitize(r["__content__"], new_line: true), "descriptionType" => r["descriptionType"], "lang" => r["lang"] }.compact
          end
        end.compact

        rights_list = Array.wrap(meta.dig("rightsList", "rights")).map do |r|
          if r.blank?
            nil
          elsif r.is_a?(String)
            name_to_spdx(r)
          elsif r.is_a?(Hash)
            hsh_to_spdx(r)
          end
        end.compact

        subjects = Array.wrap(meta.dig("subjects", "subject")).reduce([]) do |sum, subject|
          if subject.is_a?(String)
            sum += name_to_fos(subject)
          elsif subject.is_a?(Hash)
            sum += hsh_to_fos(subject)
          end

          sum
        end.uniq

        # keep only dates that parse as EDTF or are a known "unknown" value
        dates = Array.wrap(meta.dig("dates", "date")).map do |r|
          if r.is_a?(Hash) && (date = sanitize(r["__content__"]).presence)
            if Date.edtf(date).present? || Bolognese::Utils::UNKNOWN_INFORMATION.key?(date)
              { "date" => date,
                "dateType" => parse_attributes(r, content: "dateType"),
                "dateInformation" => parse_attributes(r, content: "dateInformation") }.compact
            end
          end
        end.compact
        # fall back to publicationYear when no "Issued" date was given
        dates << { "date" => meta.fetch("publicationYear", nil), "dateType" => "Issued" } if meta.fetch("publicationYear", nil).present? && get_date(dates, "Issued").blank?

        sizes = Array.wrap(meta.dig("sizes", "size")).map do |k|
          if k.blank?
            nil
          elsif k.is_a?(String)
            sanitize(k).presence
          elsif k.is_a?(Hash)
            sanitize(k["__content__"]).presence
          end
        end.compact

        formats = Array.wrap(meta.dig("formats", "format")).map do |k|
          if k.blank?
            nil
          elsif k.is_a?(String)
            sanitize(k).presence
          elsif k.is_a?(Hash)
            sanitize(k["__content__"]).presence
          end
        end.compact.map { |s| s.to_s.squish.presence }.compact

        funding_references = Array.wrap(meta.dig("fundingReferences", "fundingReference")).compact.map do |fr|
          scheme_uri = parse_attributes(fr["funderIdentifier"], content: "schemeURI")
          funder_identifier = parse_attributes(fr["funderIdentifier"])
          funder_identifier_type = parse_attributes(fr["funderIdentifier"], content: "funderIdentifierType")

          if funder_identifier_type == "Crossref Funder ID"
            funder_identifier = validate_funder_doi(funder_identifier)
          elsif funder_identifier_type == "ROR"
            funder_identifier = normalize_ror(funder_identifier)
            scheme_uri = "https://ror.org"
          else
            # keep the raw identifier when it cannot be normalized
            funder_identifier = normalize_id(funder_identifier) || funder_identifier
          end

          {
            "funderName" => fr["funderName"],
            "funderIdentifier" => funder_identifier,
            "funderIdentifierType" => funder_identifier_type,
            "schemeUri" => scheme_uri,
            "awardNumber" => parse_attributes(fr["awardNumber"]),
            "awardUri" => parse_attributes(fr["awardNumber"], content: "awardURI"),
            "awardTitle" => fr["awardTitle"]
          }.compact
        end

        # nil entries are skipped, as is already done for funding references
        related_identifiers = Array.wrap(meta.dig("relatedIdentifiers", "relatedIdentifier")).compact.map do |ri|
          if ri["relatedIdentifierType"] == "DOI"
            rid = validate_doi(ri["__content__"].to_s.downcase)
          else
            rid = ri["__content__"]
          end

          {
            "relatedIdentifier" => rid,
            "relatedIdentifierType" => ri["relatedIdentifierType"],
            "relationType" => ri["relationType"],
            "resourceTypeGeneral" => ri["resourceTypeGeneral"],
            "relatedMetadataScheme" => ri["relatedMetadataScheme"],
            "schemeUri" => ri["schemeURI"],
            "schemeType" => ri["schemeType"]
          }.compact
        end

        # nil entries are skipped, as is already done for funding references
        related_items = Array.wrap(meta.dig("relatedItems", "relatedItem")).compact.map do |ri|
          rii = ri["relatedItemIdentifier"]
          related_item_identifier = nil
          if rii
            if rii["relatedItemIdentifierType"] == "DOI"
              rid = validate_doi(rii["__content__"].to_s.downcase)
            else
              rid = rii["__content__"]
            end

            related_item_identifier = {
              "relatedItemIdentifier" => rid,
              "relatedItemIdentifierType" => rii["relatedItemIdentifierType"],
              "relatedMetadataScheme" => rii["relatedMetadataScheme"],
              "schemeURI" => rii["schemeURI"],
              "schemeType" => rii["schemeType"]
            }.compact
          end

          # "number" is either a bare string or a hash with a numberType attribute
          number = ri["number"]
          number_type = nil
          unless number.is_a?(String)
            number = ri.dig("number", "__content__")
            number_type = ri.dig("number", "numberType")
          end

          {
            "relationType" => ri["relationType"],
            "relatedItemType" => ri["relatedItemType"],
            "relatedItemIdentifier" => related_item_identifier,
            "creators" => get_authors(Array.wrap(ri.dig("creators", "creator"))),
            "titles" => get_titles(ri),
            "publicationYear" => ri["publicationYear"],
            "volume" => parse_attributes(ri["volume"]),
            "issue" => parse_attributes(ri["issue"]),
            "number" => number,
            "numberType" => number_type,
            "firstPage" => parse_attributes(ri["firstPage"]),
            "lastPage" => parse_attributes(ri["lastPage"]),
            "publisher" => parse_attributes(ri["publisher"]),
            "edition" => parse_attributes(ri["edition"]),
            "contributors" => get_authors(Array.wrap(ri.dig("contributors", "contributor"))),
          }.compact
        end

        geo_locations = Array.wrap(meta.dig("geoLocations", "geoLocation")).map do |gl|
          if !gl.is_a?(Hash) || gl["geoLocationPoint"].is_a?(String) || gl["geoLocationBox"].is_a?(String) || gl["geoLocationPolygon"].is_a?(String)
            nil
          else
            # Handle scenario where multiple geoLocationPolygons are allowed within a single geoLocation
            # we want to return an array if it's already an array (i.e. multiple geoLocationPolygons)
            # vs if it's singular just return the object
            # This is for backwards compatibility to allow both scenarios.
            if gl.dig("geoLocationPolygon").kind_of?(Array)
              geo_location_polygon = gl.dig("geoLocationPolygon").map do |glp|
                Array.wrap(glp.dig("polygonPoint")).map { |glpp| { "polygonPoint" => glpp } }.compact.presence
              end.compact.presence
            else
              geo_location_polygon = Array.wrap(gl.dig("geoLocationPolygon", "polygonPoint")).map { |glp| { "polygonPoint" => glp } }.compact.presence
            end

            {
              "geoLocationPoint" => {
                "pointLatitude" => gl.dig("geoLocationPoint", "pointLatitude"),
                "pointLongitude" => gl.dig("geoLocationPoint", "pointLongitude")
              }.compact.presence,
              "geoLocationBox" => {
                "westBoundLongitude" => gl.dig("geoLocationBox", "westBoundLongitude"),
                "eastBoundLongitude" => gl.dig("geoLocationBox", "eastBoundLongitude"),
                "southBoundLatitude" => gl.dig("geoLocationBox", "southBoundLatitude"),
                "northBoundLatitude" => gl.dig("geoLocationBox", "northBoundLatitude")
              }.compact.presence,
              "geoLocationPolygon" => geo_location_polygon,
              "geoLocationPlace" => parse_attributes(gl["geoLocationPlace"], first: true).to_s.strip.presence
            }.compact
          end
        end.compact

        state = id.present? || read_options.present? ? "findable" : "not_found"

        { "id" => id,
          "types" => types,
          "doi" => doi_from_url(id),
          "identifiers" => identifiers,
          "url" => options.fetch(:url, nil).to_s.strip.presence,
          "titles" => titles,
          "creators" => get_authors(Array.wrap(meta.dig("creators", "creator"))),
          "contributors" => get_authors(Array.wrap(meta.dig("contributors", "contributor"))),
          "container" => set_container(meta),
          "publisher" => publisher,
          "agency" => "datacite",
          "funding_references" => funding_references,
          "dates" => dates,
          "publication_year" => parse_attributes(meta.fetch("publicationYear", nil), first: true).to_s.strip.presence,
          "descriptions" => descriptions,
          "rights_list" => Array.wrap(rights_list),
          "version_info" => meta.fetch("version", nil).to_s.presence,
          "subjects" => subjects,
          "language" => parse_attributes(meta.fetch("language", nil), first: true).to_s.strip.presence,
          "geo_locations" => geo_locations,
          "related_identifiers" => related_identifiers,
          "related_items" => related_items,
          "formats" => formats,
          "sizes" => sizes,
          "schema_version" => schema_version,
          "state" => state
        }.merge(read_options)
      end

      # Builds the "container" hash from the first description of type
      # "SeriesInformation" and the first related identifier with relation type
      # "IsPartOf".
      #
      # Entries that are not hashes (nil for empty elements, plain strings for
      # elements without attributes) are ignored when searching, so they no
      # longer raise.
      #
      # @param meta [Hash] parsed "resource" hash
      # @return [Hash] container hash with nil values removed, or `{}` when
      #   neither a series title nor an "IsPartOf" identifier is present
      def set_container(meta)
        series_information = Array.wrap(meta.dig("descriptions", "description"))
                                  .find { |r| r.is_a?(Hash) && r["descriptionType"] == "SeriesInformation" }
                                  .to_h
                                  .fetch("__content__", nil)
        si = get_series_information(series_information)

        is_part_of = Array.wrap(meta.dig("relatedIdentifiers", "relatedIdentifier"))
                          .find { |ri| ri.is_a?(Hash) && ri["relationType"] == "IsPartOf" }
                          .to_h

        return {} unless si["title"].present? || is_part_of.present?

        {
          "type" => meta.dig("resourceType", "resourceTypeGeneral") == "Dataset" ? "DataRepository" : "Series",
          "identifier" => is_part_of["__content__"],
          "identifierType" => is_part_of["relatedIdentifierType"],
          "title" => si["title"],
          "volume" => si["volume"],
          "issue" => si["issue"],
          "firstPage" => si["firstPage"],
          "lastPage" => si["lastPage"]
        }.compact
      end

      # Extracts the titles found under `titles.title`.
      #
      # @param meta [Hash] hash that may contain a "titles" => "title" entry
      # @return [Array<Hash>] one hash per non-blank title; string titles yield
      #   only "title", other entries also carry "titleType" and "lang" when set
      def get_titles(meta)
        Array.wrap(meta.dig("titles", "title")).map do |r|
          if r.blank?
            nil
          elsif r.is_a?(String)
            { "title" => sanitize(r) }
          else
            { "title" => sanitize(r["__content__"]), "titleType" => r["titleType"], "lang" => r["lang"] }.compact
          end
        end.compact
      end
    end
  end
end