# frozen_string_literal: true module Commonmeta module Utils NORMALIZED_LICENSES = { "https://creativecommons.org/licenses/by/1.0" => "https://creativecommons.org/licenses/by/1.0/legalcode", "https://creativecommons.org/licenses/by/2.0" => "https://creativecommons.org/licenses/by/2.0/legalcode", "https://creativecommons.org/licenses/by/2.5" => "https://creativecommons.org/licenses/by/2.5/legalcode", "https://creativecommons.org/licenses/by/3.0" => "https://creativecommons.org/licenses/by/3.0/legalcode", "https://creativecommons.org/licenses/by/3.0/us" => "https://creativecommons.org/licenses/by/3.0/legalcode", "https://creativecommons.org/licenses/by/4.0" => "https://creativecommons.org/licenses/by/4.0/legalcode", "https://creativecommons.org/licenses/by-nc/1.0" => "https://creativecommons.org/licenses/by-nc/1.0/legalcode", "https://creativecommons.org/licenses/by-nc/2.0" => "https://creativecommons.org/licenses/by-nc/2.0/legalcode", "https://creativecommons.org/licenses/by-nc/2.5" => "https://creativecommons.org/licenses/by-nc/2.5/legalcode", "https://creativecommons.org/licenses/by-nc/3.0" => "https://creativecommons.org/licenses/by-nc/3.0/legalcode", "https://creativecommons.org/licenses/by-nc/4.0" => "https://creativecommons.org/licenses/by-nc/4.0/legalcode", "https://creativecommons.org/licenses/by-nd-nc/1.0" => "https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode", "https://creativecommons.org/licenses/by-nd-nc/2.0" => "https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode", "https://creativecommons.org/licenses/by-nd-nc/2.5" => "https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode", "https://creativecommons.org/licenses/by-nd-nc/3.0" => "https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode", "https://creativecommons.org/licenses/by-nd-nc/4.0" => "https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode", "https://creativecommons.org/licenses/by-nc-sa/1.0" => "https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode", "https://creativecommons.org/licenses/by-nc-sa/2.0" => "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode", "https://creativecommons.org/licenses/by-nc-sa/2.5" => "https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode", "https://creativecommons.org/licenses/by-nc-sa/3.0" => "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode", "https://creativecommons.org/licenses/by-nc-sa/4.0" => "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode", "https://creativecommons.org/licenses/by-nd/1.0" => "https://creativecommons.org/licenses/by-nd/1.0/legalcode", "https://creativecommons.org/licenses/by-nd/2.0" => "https://creativecommons.org/licenses/by-nd/2.0/legalcode", "https://creativecommons.org/licenses/by-nd/2.5" => "https://creativecommons.org/licenses/by-nd/2.5/legalcode", "https://creativecommons.org/licenses/by-nd/3.0" => "https://creativecommons.org/licenses/by-nd/3.0/legalcode", "https://creativecommons.org/licenses/by-nd/4.0" => "https://creativecommons.org/licenses/by-nd/2.0/legalcode", "https://creativecommons.org/licenses/by-sa/1.0" => "https://creativecommons.org/licenses/by-sa/1.0/legalcode", "https://creativecommons.org/licenses/by-sa/2.0" => "https://creativecommons.org/licenses/by-sa/2.0/legalcode", "https://creativecommons.org/licenses/by-sa/2.5" => "https://creativecommons.org/licenses/by-sa/2.5/legalcode", "https://creativecommons.org/licenses/by-sa/3.0" => "https://creativecommons.org/licenses/by-sa/3.0/legalcode", "https://creativecommons.org/licenses/by-sa/4.0" => "https://creativecommons.org/licenses/by-sa/4.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/1.0" => "https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/2.0" => "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/2.5" => "https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode", "https://creativecommons.org/licenses/by-nc-nd/3.0" => "https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/4.0" => "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode", "https://creativecommons.org/licenses/publicdomain" => "https://creativecommons.org/licenses/publicdomain/", "https://creativecommons.org/publicdomain/zero/1.0" => "https://creativecommons.org/publicdomain/zero/1.0/legalcode", } # source: https://www.bibtex.com/e/entry-types/ BIB_TO_CM_TRANSLATIONS = { "article" => "JournalArticle", "book" => "Book", "booklet" => "Book", "inbook" => "BookChapter", "inproceedings" => "ProceedingsArticle", "manual" => "Report", "mastersthesis" => "Dissertation", "misc" => "Other", "phdthesis" => "Dissertation", "proceedings" => "Proceedings", "techreport" => "Report", "unpublished" => "Manuscript", } CM_TO_BIB_TRANSLATIONS = { "Article" => "article", "Book" => "book", "BookChapter" => "inbook", "Dissertation" => "phdthesis", "JournalArticle" => "article", "Manuscript" => "unpublished", "Other" => "misc", "Proceedings" => "proceedings", "ProceedingsArticle" => "inproceedings", "Report" => "techreport", } # source: https://docs.citationstyles.org/en/stable/specification.html?highlight=book#appendix-iii-types CSL_TO_CM_TRANSLATIONS = { "article" => "Article", "article-journal" => "JournalArticle", "article-magazine" => "Article", "article-newspaper" => "Article", "bill" => "LegalDocument", "book" => "Book", "broadcast" => "Audiovisual", "chapter" => "BookChapter", "classic" => "Book", "collection" => "Collection", "dataset" => "Dataset", "document" => "Document", "entry" => "Entry", "entry-dictionary" => "Entry", "entry-encyclopedia" => "Entry", "event" => "Event", "figure" => "Figure", "graphic" => "Image", "hearing" => "LegalDocument", "interview" => "Document", "legal_case" => "LegalDocument", "legislation" => "LegalDocument", "manuscript" => "Manuscript", "map" => "Map", "motion_picture" => "Audiovisual", "musical_score" => "Document", "pamphlet" => "Document", "paper-conference" => "ProceedingsArticle", "patent" => "Patent", "performance" => "Performance", "periodical" => "Journal", "personal_communication" => "PersonalCommunication", "post" => "Post", "post-weblog" => "Article", "regulation" => "LegalDocument", "report" => "Report", "review" => "Review", "review-book" => "Review", "software" => "Software", "song" => "Audiovisual", "speech" => "Speech", "standard" => "Standard", "thesis" => "Dissertation", "treaty" => "LegalDocument", "webpage" => "WebPage", } CM_TO_CSL_TRANSLATIONS = { "Article" => "article", "JournalArticle" => "article-journal", "Book" => "book", "BookChapter" => "chapter", "Collection" => "collection", "Dataset" => "dataset", "Document" => "document", "Entry" => "entry", "Event" => "event", "Figure" => "figure", "Image" => "graphic", "LegalDocument" => "legal_case", "Manuscript" => "manuscript", "Map" => "map", "Audiovisual" => "motion_picture", "Patent" => "patent", "Performance" => "performance", "Journal" => "periodical", "PersonalCommunication" => "personal_communication", "Post" => "post", "Report" => "report", "Review" => "review", "Software" => "software", "Speech" => "speech", "Standard" => "standard", "Dissertation" => "thesis", "WebPage" => "webpage", } # source: http://api.crossref.org/types CR_TO_CM_TRANSLATIONS = { "BookChapter" => "BookChapter", "BookPart" => "BookPart", "BookSection" => "BookSection", "BookSeries" => "BookSeries", "BookSet" => "BookSet", "BookTrack" => "BookTrack", "Book" => "Book", "Component" => "Component", "Database" => "Database", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "EditedBook" => "EditedBook", "Grant" => "Grant", "JournalArticle" => "JournalArticle", "JournalIssue" => "JournalIssue", "JournalVolume" => "JournalVolume", "Journal" => "Journal", "Monograph" => "Book", "Other" => "Other", "PeerReview" => "PeerReview", "PostedContent" => "Article", "ProceedingsArticle" => "ProceedingsArticle", "ProceedingsSeries" => "ProceedingsSeries", "Proceedings" => "Proceedings", "ReferenceBook" => "ReferenceBook", "ReferenceEntry" => "Entry", "ReportComponent" => "ReportComponent", "ReportSeries" => "ReportSeries", "Report" => "Report", "Standard" => "Standard", } CM_TO_CR_TRANSLATIONS = { "Article" => "PostedContent", "BookChapter" => "BookChapter", "BookSeries" => "BookSeries", "Book" => "Book", "Component" => "Component", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "Grant" => "Grant", "JournalArticle" => "JournalArticle", "JournalIssue" => "JournalIssue", "JournalVolume" => "JournalVolume", "Journal" => "Journal", "ProceedingsArticle" => "ProceedingsArticle", "ProceedingsSeries" => "ProceedingsSeries", "Proceedings" => "Proceedings", "ReportComponent" => "ReportComponent", "ReportSeries" => "ReportSeries", "Report" => "Report", "PeerReview" => "PeerReview", "Other" => "Other", } # source: https://github.com/datacite/schema/blob/master/source/meta/kernel-4/include/datacite-resourceType-v4.xsd DC_TO_CM_TRANSLATIONS = { "Audiovisual" => "Audiovisual", "BlogPosting" => "Article", "Book" => "Book", "BookChapter" => "BookChapter", "Collection" => "Collection", "ComputationalNotebook" => "ComputationalNotebook", "ConferencePaper" => "ProceedingsArticle", "ConferenceProceeding" => "Proceedings", "DataPaper" => "JournalArticle", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "Event" => "Event", "Image" => "Image", "InteractiveResource" => "InteractiveResource", "Journal" => "Journal", "JournalArticle" => "JournalArticle", "Model" => "Model", "OutputManagementPlan" => "OutputManagementPlan", "PeerReview" => "PeerReview", "PhysicalObject" => "PhysicalObject", "Poster" => "Speech", "Preprint" => "Article", "Report" => "Report", "Service" => "Service", "Software" => "Software", "Sound" => "Sound", "Standard" => "Standard", "Text" => "Document", "Thesis" => "Dissertation", "Workflow" => "Workflow", "Other" => "Other", } CM_TO_DC_TRANSLATIONS = { "Article" => "Preprint", "Audiovisual" => "Audiovisual", "Book" => "Book", "BookChapter" => "BookChapter", "Collection" => "Collection", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "Document" => "Text", "Entry" => "Text", "Event" => "Event", "Figure" => "Image", "Image" => "Image", "JournalArticle" => "JournalArticle", "LegalDocument" => "Text", "Manuscript" => "Text", "Map" => "Image", "Patent" => "Text", "Performance" => "Audiovisual", "PersonalCommunication" => "Text", "Post" => "Text", "ProceedingsArticle" => "ConferencePaper", "Proceedings" => "ConferenceProceeding", "Report" => "Report", "PeerReview" => "PeerReview", "Software" => "Software", "Sound" => "Sound", "Standard" => "Standard", "WebPage" => "Text", } RIS_TO_CM_TRANSLATIONS = { "ABST" => "Text", "ADVS" => "Text", "AGGR" => "Text", "ANCIENT" => "Text", "ART" => "Text", "BILL" => "Text", "BLOG" => "Text", "BOOK" => "Book", "CASE" => "Text", "CHAP" => "BookChapter", "CHART" => "Text", "CLSWK" => "Text", "CTLG" => "Collection", "COMP" => "Software", "DATA" => "Dataset", "DBASE" => "Database", "DICT" => "Dictionary", "EBOOK" => "Book", "ECHAP" => "BookChapter", "EDBOOK" => "Book", "EJOUR" => "JournalArticle", "ELEC" => "Text", "ENCYC" => "Encyclopedia", "EQUA" => "Equation", "FIGURE" => "Image", "GEN" => "CreativeWork", "GOVDOC" => "GovernmentDocument", "GRANT" => "Grant", "HEAR" => "Hearing", "ICOMM" => "Text", "INPR" => "Text", "JFULL" => "JournalArticle", "JOUR" => "JournalArticle", "LEGAL" => "LegalRuleOrRegulation", "MANSCPT" => "Text", "MAP" => "Map", "MGZN" => "MagazineArticle", "MPCT" => "Audiovisual", "MULTI" => "Audiovisual", "MUSIC" => "MusicScore", "NEWS" => "NewspaperArticle", "PAMP" => "Pamphlet", "PAT" => "Patent", "PCOMM" => "PersonalCommunication", "RPRT" => "Report", "SER" => "SerialPublication", "SLIDE" => "Slide", "SOUND" => "SoundRecording", "STAND" => "Standard", "THES" => "Dissertation", "UNBILL" => "UnenactedBill", "UNPB" => "UnpublishedWork", "VIDEO" => "Audiovisual", "WEB" => "WebPage", } CM_TO_RIS_TRANSLATIONS = { "Article" => "JOUR", "Audiovisual" => "VIDEO", "Book" => "BOOK", "BookChapter" => "CHAP", "Collection" => "CTLG", "Dataset" => "DATA", "Dissertation" => "THES", "Document" => "GEN", "Entry" => "DICT", "Event" => "GEN", "Figure" => "FIGURE", "Image" => "FIGURE", "JournalArticle" => "JOUR", "LegalDocument" => "GEN", "Manuscript" => "GEN", "Map" => "MAP", "Patent" => "PAT", "Performance" => "GEN", "PersonalCommunication" => "PCOMM", "Post" => "GEN", "ProceedingsArticle" => "CPAPER", "Proceedings" => "CONF", "Report" => "RPRT", "Review" => "GEN", "Software" => "COMP", "Sound" => "SOUND", "Standard" => "STAND", "WebPage" => "WEB", } SO_TO_CM_TRANSLATIONS = { "Article" => "Article", "BlogPosting" => "Article", "Book" => "Book", "BookChapter" => "BookChapter", "CreativeWork" => "Other", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "NewsArticle" => "Article", "Legislation" => "LegalDocument", "ScholarlyArticle" => "JournalArticle", "SoftwareSourceCode" => "Software", } CM_TO_SO_TRANSLATIONS = { "Article" => "Article", "Audiovisual" => "CreativeWork", "Book" => "Book", "BookChapter" => "BookChapter", "Collection" => "CreativeWork", "Dataset" => "Dataset", "Dissertation" => "Dissertation", "Document" => "CreativeWork", "Entry" => "CreativeWork", "Event" => "CreativeWork", "Figure" => "CreativeWork", "Image" => "CreativeWork", "JournalArticle" => "ScholarlyArticle", "LegalDocument" => "Legislation", "Software" => "SoftwareSourceCode", } CM_TO_JATS_TRANSLATIONS = { "Proceedings" => "working-paper", "ReferenceBook" => "book", "JournalIssue" => "journal", "ProceedingsArticle" => "working-paper", "Other" => nil, "Dissertation" => nil, "Dataset" => "data", "Document" => "journal", "EditedBook" => "book", "JournalArticle" => "journal", "Journal" => "journal", "Report" => "report", "BookSeries" => "book", "ReportSeries" => "report", "BookTrack" => "book", "Standard" => "standard", "BookSection" => "chapter", "BookPart" => "chapter", "Book" => "book", "BookChapter" => "chapter", "StandardSeries" => "standard", "Monograph" => "book", "Component" => nil, "ReferenceEntry" => nil, "JournalVolume" => "journal", "BookSet" => "book", "Article" => "journal", "Software" => "software", } UNKNOWN_INFORMATION = { ":unac" => "temporarily inaccessible", ":unal" => "unallowed, suppressed intentionally", ":unap" => "not applicable, makes no sense", ":unas" => "value unassigned (e.g., Untitled)", ":unav" => "value unavailable, possibly unknown", ":unkn" => "known to be unknown (e.g., Anonymous, Inconnue)", ":none" => "never had a value, never will", ":null" => "explicitly and meaningfully empty", ":tba" => "to be assigned or announced later", ":etal" => "too numerous to list (et alia)", } def find_from_format(id: nil, string: nil, ext: nil, filename: nil) if id.present? find_from_format_by_id(id) elsif string.present? && ext.present? find_from_format_by_ext(string, ext: ext) elsif string.present? find_from_format_by_string(string) elsif filename.present? find_from_format_by_filename(filename) else "datacite" end end def find_from_format_by_id(id) id = normalize_id(id) if %r{\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(id) ra = get_doi_ra(id) %w[DataCite Crossref mEDRA KISTI JaLC OP].include?(ra) ? ra.downcase : nil elsif %r{\A(?:(http|https):/(/)?orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z}.match?(id) "orcid" elsif %r{\A(http|https):/(/)?github\.com/(.+)/package.json\z}.match?(id) "npm" elsif %r{\A(http|https):/(/)?github\.com/(.+)/codemeta.json\z}.match?(id) "codemeta" elsif %r{\A(http|https):/(/)?github\.com/(.+)/CITATION.cff\z}.match?(id) "cff" elsif %r{\A(http|https):/(/)?github\.com/(.+)\z}.match?(id) "cff" elsif %r{\A(http|https):/(/)?rogue-scholar\.org/api/posts/(.+)\z}.match?(id) "json_feed_item" else "schema_org" end end def find_from_format_by_filename(filename) if filename == "package.json" "npm" elsif filename == "CITATION.cff" "cff" end end def find_from_format_by_ext(string, options = {}) case options[:ext] when ".bib" "bibtex" when ".ris" "ris" when ".xml", ".json" find_from_format_by_string(string) end end def find_from_format_by_string(string) begin # try to parse as JSON hsh = MultiJson.load(string).to_h if hsh.dig("@context") && URI.parse(hsh.dig("@context")).host == "schema.org" return "schema_org" elsif hsh.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel") return "datacite" elsif hsh.dig("source") == "Crossref" return "crossref" elsif hsh.dig("issued", "date-parts").present? return "csl" elsif URI.parse(hsh.dig("@context")).to_s == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" return "codemeta" end rescue MultiJson::ParseError end begin # try to parse as XML hsh = Hash.from_xml(string) return "crossref_xml" if hsh.to_h.dig("crossref_result").present? rescue Nokogiri::XML::SyntaxError end begin # try to parse as YAML hsh = YAML.load(string, permitted_classes: [Date]) return "cff" if hsh.is_a?(Hash) && hsh.fetch("cff-version", nil).present? rescue Psych::SyntaxError end if string.start_with?("TY - ") "ris" elsif BibTeX.parse(string).first "bibtex" end end def orcid_from_url(url) Array(%r{\A:(http|https)://orcid\.org/(.+)}.match(url)).last end def orcid_as_url(orcid) "https://orcid.org/#{orcid}" if orcid.present? end def validate_orcid(orcid) orcid = Array(%r{\A(?:(?:http|https)://(?:(?:www|sandbox)?\.)?orcid\.org/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X]+)\z}.match(orcid)).last orcid.gsub(/[[:space:]]/, "-") if orcid.present? end def validate_orcid_scheme(orcid_scheme) Array(%r{\A(http|https)://(www\.)?(orcid\.org)}.match(orcid_scheme)).last end def validate_url(str) if %r{\A(?:(http|https)://(dx\.)?doi.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(str) "DOI" elsif %r{\A(http|https)://}.match?(str) "URL" elsif /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\z/.match?(str) "ISSN" end end def validate_email(str) email_regex = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i parsed_emails = str.scan(email_regex) parsed_emails.first end def parse_attributes(element, options = {}) content = options[:content] || "__content__" if element.is_a?(String) && options[:content].nil? CGI.unescapeHTML(element) elsif element.is_a?(Hash) element.fetch(CGI.unescapeHTML(content), nil) elsif element.is_a?(Array) a = element.map { |e| e.is_a?(Hash) ? e.fetch(CGI.unescapeHTML(content), nil) : e }.uniq a = options[:first] ? a.first : a.unwrap end end def normalize_id(id, options = {}) return nil unless id.present? # check for valid DOI doi = normalize_doi(id, options) return doi if doi.present? # check for valid HTTP uri uri = Addressable::URI.parse(id) return nil unless uri && uri.host && %w[http https].include?(uri.scheme) # clean up URL PostRank::URI.clean(id) rescue Addressable::URI::InvalidURIError nil end def normalize_url(id, options = {}) return nil unless id.present? # handle info URIs return id if id.to_s.start_with?("info") # check for valid HTTP uri uri = Addressable::URI.parse(id) return nil unless uri && uri.host && %w[http https ftp].include?(uri.scheme) # optionally turn into https URL uri.scheme = "https" if options[:https] # clean up URL uri.path = PostRank::URI.clean(uri.path) # optionally remove query and fragment if options[:remove_query_string] uri.query = nil uri.fragment = nil end uri.to_s rescue Addressable::URI::InvalidURIError nil end def normalize_cc_url(id) id = normalize_url(id, https: true) NORMALIZED_LICENSES.fetch(id, id) end def normalize_orcid(orcid) orcid = validate_orcid(orcid) return nil unless orcid.present? # turn ORCID ID into URL "https://orcid.org/" + Addressable::URI.encode(orcid) end # pick electronic issn if there are multiple # format issn as xxxx-xxxx def normalize_issn(input, options = {}) content = options[:content] || "__content__" issn = if input.blank? nil elsif input.is_a?(String) && options[:content].nil? input elsif input.is_a?(Hash) input.fetch(content, nil) elsif input.is_a?(Array) a = input.find { |a| a["media_type"] == "electronic" } || input.first a.fetch(content, nil) end case issn.to_s.length when 9 issn when 8 issn[0..3] + "-" + issn[4..7] end end # find Creative Commons or OSI license in licenses array, normalize url and name def normalize_licenses(licenses) standard_licenses = Array.wrap(licenses).map do |l| URI.parse(l["url"]) end.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] } return licenses unless standard_licenses.present? # use HTTPS uri.scheme = "https" # use host name without subdomain uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last # normalize URLs if uri.host == "creativecommons.org" uri.path = uri.path.split("/")[0..-2].join("/") if uri.path.split("/").last == "legalcode" uri.path << "/" unless uri.path.end_with?("/") else uri.path = uri.path.gsub(/(-license|\.php|\.html)/, "") uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase } uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize } uri.path = uri.path.sub(/([^0-9-]+)(-)?([1-9])?(\.)?([0-9])?$/) do m = Regexp.last_match text = m[1] if m[3].present? version = [m[3], m[5].presence || "0"].join(".") [text, version].join("-") else text end end end uri.to_s rescue URI::InvalidURIError nil end def to_datacite(element, options = {}) a = Array.wrap(element).map do |e| e.each_with_object({}) do |(k, v), h| h[k.dasherize] = v end end options[:first] ? a.unwrap : a.presence end def from_datacite(element) mapping = { "nameType" => "type", "creatorName" => "name" } map_hash_keys(element: element, mapping: mapping) end def to_schema_org(element) mapping = { "type" => "@type", "id" => "@id", "title" => "name" } map_hash_keys(element: element, mapping: mapping) end def to_schema_org_container(element, options = {}) return nil unless element.is_a?(Hash) || (element.nil? && options[:container_title].present?) issn = element["identifier"] if element["identifierType"] == "ISSN" id = issn.blank? ? element["identifier"] : nil name = options[:container_title] || element["title"] type = id || name ? options[:type] || element["type"] : nil { "@id" => id, "@type" => type, "name" => name, "issn" => issn }.compact end def to_schema_org_identifiers(element, _options = {}) Array.wrap(element).map do |ai| { "@type" => "PropertyValue", "propertyID" => ai["alternateIdentifierType"], "value" => ai["alternateIdentifier"], } end.unwrap end def to_schema_org_relation(related_identifiers: nil, relation_type: nil) return nil unless related_identifiers.present? && relation_type.present? relation_type = if relation_type == "References" %w[References Cites Documents] else [relation_type] end Array.wrap(related_identifiers).select do |ri| relation_type.include?(ri["relationType"]) end.map do |r| if r["relatedIdentifierType"] == "ISSN" && r["relationType"] == "IsPartOf" { "@type" => "Periodical", "issn" => r["relatedIdentifier"] }.compact else { "@id" => normalize_id(r["relatedIdentifier"]), "@type" => DC_TO_SO_TRANSLATIONS[r["resourceTypeGeneral"]] || "CreativeWork", }.compact end end.unwrap end def to_schema_org_funder(funding_references) return nil unless funding_references.present? Array.wrap(funding_references).map do |fr| { "@id" => fr["funderIdentifier"], "@type" => "Organization", "name" => fr["funderName"], }.compact end.unwrap end def to_schema_org_citation(reference) return nil unless reference.present? { "@type" => "CreativeWork", "@id" => reference["doi"] ? normalize_id(reference["doi"]) : nil, "name" => reference["title"], "datePublished" => reference["publicationYear"], }.compact end def to_schema_org_spatial_coverage(geo_location) return nil unless geo_location.present? Array.wrap(geo_location).each_with_object([]) do |gl, sum| if gl.fetch("geoLocationPoint", nil) sum << { "@type" => "Place", "geo" => { "@type" => "GeoCoordinates", "address" => gl["geoLocationPlace"], "latitude" => gl.dig("geoLocationPoint", "pointLatitude"), "longitude" => gl.dig("geoLocationPoint", "pointLongitude"), }, }.compact end if gl.fetch("geoLocationBox", nil) sum << { "@type" => "Place", "geo" => { "@type" => "GeoShape", "address" => gl["geoLocationPlace"], "box" => [gl.dig("geoLocationBox", "southBoundLatitude"), gl.dig("geoLocationBox", "westBoundLongitude"), gl.dig("geoLocationBox", "northBoundLatitude"), gl.dig("geoLocationBox", "eastBoundLongitude")].compact.join(" ").presence, }.compact, }.compact end if gl.fetch("geoLocationPolygon", nil) sum << { "@type" => "Place", "geo" => { "@type" => "GeoShape", "address" => gl["geoLocationPlace"], "polygon" => Array.wrap(gl.dig("geoLocationPolygon")).map do |glp| Array.wrap(glp).map do |glpp| [glpp.dig("polygonPoint", "pointLongitude"), glpp.dig("polygonPoint", "pointLatitude")].compact end.compact end.compact.presence, }, } end next unless gl.fetch("geoLocationPlace", nil) && !gl.fetch("geoLocationPoint", nil) && !gl.fetch("geoLocationBox", nil) && !gl.fetch( "geoLocationPolygon", nil ) sum << { "@type" => "Place", "geo" => { "@type" => "GeoCoordinates", "address" => gl["geoLocationPlace"], }, }.compact end.unwrap end def from_schema_org(element) mapping = { "@type" => "type", "@id" => "id" } map_hash_keys(element: element, mapping: mapping) end def map_hash_keys(element: nil, mapping: nil) Array.wrap(element).map do |a| a.map { |k, v| [mapping.fetch(k, k), v] }.reduce({}) do |hsh, (k, v)| if k == "affiliation" && v.is_a?(Array) hsh[k] = v.map do |affiliation| if affiliation.is_a?(Hash) affiliation.merge("@type" => "Organization") else affiliation end end hsh elsif k == "type" && v.is_a?(String) hsh[k] = v.capitalize hsh elsif v.is_a?(Hash) hsh[k] = to_schema_org(v) hsh else hsh[k] = v hsh end end end.unwrap end def to_identifier(identifier) { "@type" => "PropertyValue", "propertyID" => identifier["relatedIdentifierType"], "value" => identifier["relatedIdentifier"], } end def from_json_feed(element) mapping = { "url" => "id" } map_hash_keys(element: element, mapping: mapping) end def from_csl(element) Array.wrap(element).map do |a| if a["literal"].present? a["type"] = "Organization" a["name"] = a["literal"] elsif a["name"].present? a["type"] = "Organization" elsif a["given"].present? || a["family"].present? a["type"] = "Person" end a["givenName"] = a["given"] a["familyName"] = a["family"] a.except("given", "family", "literal").compact end.unwrap end def to_csl(element) Array.wrap(element).map do |a| a["family"] = a["familyName"] a["given"] = a["givenName"] a["literal"] = a["name"] unless a["familyName"].present? a.except("nameType", "type", "@type", "id", "@id", "name", "familyName", "givenName", "affiliation", "contributorType").compact end.presence end def to_ris(element) Array.wrap(element).map do |a| if a["familyName"].present? [a["familyName"], a["givenName"]].join(", ") else a["name"] end end.unwrap end def sanitize(text, options = {}) options[:tags] ||= Set.new(%w[strong em b i code pre sub sup br]) content = options[:content] || "__content__" custom_scrubber = Commonmeta::WhitelistScrubber.new(options) if text.is_a?(String) # remove excessive internal whitespace with squish Loofah.scrub_fragment(text, custom_scrubber).to_s.squish elsif text.is_a?(Hash) sanitize(text.fetch(content, nil)) elsif text.is_a?(Array) a = text.map { |e| e.is_a?(Hash) ? sanitize(e.fetch(content, nil)) : sanitize(e) }.uniq a = options[:first] ? a.first : a.unwrap end end def github_from_url(url) return {} unless %r{\Ahttps://github\.com/(.+)(?:/)?(.+)?(?:/tree/)?(.*)\z}.match?(url) words = URI.parse(url).path[1..-1].split("/") path = words.length > 3 ? words[4...words.length].join("/") : nil { owner: words[0], repo: words[1], release: words[3], path: path }.compact end def github_repo_from_url(url) github_from_url(url).fetch(:repo, nil) end def github_release_from_url(url) github_from_url(url).fetch(:release, nil) end def github_owner_from_url(url) github_from_url(url).fetch(:owner, nil) end def github_as_owner_url(url) github_hash = github_from_url(url) "https://github.com/#{github_hash[:owner]}" if github_hash[:owner].present? end def github_as_repo_url(url) github_hash = github_from_url(url) return unless github_hash[:repo].present? "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}" end def github_as_release_url(url) github_hash = github_from_url(url) return unless github_hash[:release].present? "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}/tree/#{github_hash[:release]}" end def github_as_codemeta_url(url) github_hash = github_from_url(url) if github_hash[:path].to_s.end_with?("codemeta.json") "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}" elsif github_hash[:owner].present? "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/master/codemeta.json" end end def github_as_cff_url(url) github_hash = github_from_url(url) if github_hash[:path].to_s.end_with?("CITATION.cff") "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}" elsif github_hash[:owner].present? "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/main/CITATION.cff" end end def get_date_parts(iso8601_time) return { "date-parts" => [[]] } if iso8601_time.nil? year = iso8601_time[0..3].to_i month = iso8601_time[5..6].to_i day = iso8601_time[8..9].to_i { "date-parts" => [[year, month, day].reject { |part| part == 0 }] } rescue TypeError nil end def get_date_from_date_parts(date_as_parts) date_parts = date_as_parts.fetch("date-parts", []).first return nil if date_parts == [nil] year = date_parts[0] month = date_parts[1] day = date_parts[2] get_date_from_parts(year, month, day) rescue NoMethodError # if date_parts is nil nil end def get_date_from_parts(year, month = nil, day = nil) [year.to_s.rjust(4, "0"), month.to_s.rjust(2, "0"), day.to_s.rjust(2, "0")].reject do |part| part == "00" end.join("-") end def get_date_parts_from_parts(year, month = nil, day = nil) { "date-parts" => [[year.to_i, month.to_i, day.to_i].reject { |part| part == 0 }] } end def get_iso8601_date(iso8601_time) return nil if iso8601_time.nil? || iso8601_time.length < 4 case iso8601_time.length when 4 iso8601_time[0..3] when 7 iso8601_time[0..6] else iso8601_time[0..9] end end def get_year_month(iso8601_time) return [] if iso8601_time.nil? year = iso8601_time[0..3] month = iso8601_time[5..6] [year.to_i, month.to_i].reject { |part| part == 0 } end def get_year_month_day(iso8601_time) return [] if iso8601_time.nil? year = iso8601_time[0..3] month = iso8601_time[5..6] day = iso8601_time[8..9] [year.to_i, month.to_i, day.to_i].reject { |part| part == 0 } end # parsing of incomplete iso8601 timestamps such as 2015-04 is broken # in standard library, so we use the edtf gem # return nil if invalid iso8601 timestamp def get_datetime_from_iso8601(iso8601_time) Date.edtf(iso8601_time).to_time.utc rescue StandardError nil end # strip milliseconds if there is a time, as it interferes with edtc parsing # keep dates unchanged def strip_milliseconds(iso8601_time) return iso8601_time.split(" ").first if iso8601_time.to_s.include? " " return iso8601_time.split(".").first + "Z" if iso8601_time.to_s.include? "." return iso8601_time.split("+").first + "Z" if iso8601_time.to_s.include? "+" iso8601_time end # iso8601 datetime without hyphens and colons, used by Crossref # return nil if invalid def get_datetime_from_time(time) DateTime.strptime(time.to_s, "%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%SZ") rescue ArgumentError nil end def get_date(dates, date_type) dd = Array.wrap(dates).find { |d| d["dateType"] == date_type } || {} dd.fetch("date", nil) end def get_link(links, link_type) ll = Array.wrap(links).find { |d| d["rel"] == link_type } || {} ll.fetch("href", nil) end def rogue_scholar_api_url(id, _options = {}) "https://rogue-scholar.org/api/posts/#{id}" end # convert commonmeta dates to DataCite format def get_dates_from_date(date) return nil if date.nil? mapping = { "published" => "issued" } date = map_hash_keys(element: date, mapping: mapping) date.map do |k, v| { "date" => v, "dateType" => k.capitalize } end end def get_contributor(contributor, contributor_type) contributor.select { |c| c["contributorType"] == contributor_type } end def get_identifier(identifiers, identifier_type) id = Array.wrap(identifiers).find { |i| i["identifierType"] == identifier_type } || {} id.fetch("identifier", nil) end def get_identifier_type(identifier_type) return nil unless identifier_type.present? identifierTypes = { "ark" => "ARK", "arxiv" => "arXiv", "bibcode" => "bibcode", "doi" => "DOI", "ean13" => "EAN13", "eissn" => "EISSN", "handle" => "Handle", "igsn" => "IGSN", "isbn" => "ISBN", "issn" => "ISSN", "istc" => "ISTC", "lissn" => "LISSN", "lsid" => "LSID", "pmid" => "PMID", "purl" => "PURL", "upc" => "UPC", "url" => "URL", "urn" => "URN", "md5" => "md5", "minid" => "minid", "dataguid" => "dataguid", } identifierTypes[identifier_type.downcase] || identifier_type end def get_series_information(str) return {} unless str.present? str = str.split(",").map(&:strip) title = str.first volume_issue = str.length > 2 ? str[1].rpartition(/\(([^)]+)\)/) : nil volume = volume_issue.present? ? volume_issue[0].presence || volume_issue[2].presence : nil issue = volume_issue.present? ? volume_issue[1][1...-1].presence : nil pages = str.length > 1 ? str.last : nil first_page = pages.present? ? pages.split("-").map(&:strip)[0] : nil last_page = pages.present? ? pages.split("-").map(&:strip)[1] : nil { "title" => title, "volume" => volume, "issue" => issue, "firstPage" => first_page, "lastPage" => last_page, }.compact end def jsonlint(json) return ["No JSON provided"] unless json.present? error_array = [] linter = JsonLint::Linter.new linter.send(:check_data, json, error_array) error_array end def name_to_spdx(name) spdx = JSON.load(File.read(File.expand_path("../../resources/spdx/licenses.json", __dir__))).fetch("licenses") license = spdx.find do |l| l["name"] == name || l["licenseId"] == name || l["seeAlso"].first == normalize_cc_url(name) end if license { "id" => license["licenseId"], "url" => license["seeAlso"].first }.compact else { "rights" => name } end end def hsh_to_spdx(hsh) spdx = JSON.load(File.read(File.expand_path("../../resources/spdx/licenses.json", __dir__))).fetch("licenses") hsh["rightsUri"] = hsh.delete("rightsURI") if hsh["rightsUri"].blank? license = spdx.find do |l| l["licenseId"].casecmp?(hsh["rightsIdentifier"]) || l["seeAlso"].first == normalize_cc_url(hsh["rightsUri"]) || l["name"] == hsh["rights"] || l["seeAlso"].first == normalize_cc_url(hsh["rights"]) end if license { "id" => license["licenseId"], "url" => license["seeAlso"].first }.compact else { "id" => hsh["rightsIdentifier"].present? ? hsh["rightsIdentifier"].downcase : nil, "url" => hsh["rightsURI"] || hsh["rightsUri"], }.compact end end def spdx_to_hsh(hsh) return nil unless hsh.present? && hsh.is_a?(Hash) spdx = JSON.load(File.read(File.expand_path("../../resources/spdx/licenses.json", __dir__))).fetch("licenses") license = spdx.find { |l| l["licenseId"].casecmp?(hsh["id"]) } if license [{ "rightsIdentifier" => license["licenseId"].downcase, "rightsUri" => license["seeAlso"].first, "rights" => license["name"], "rightsIdentifierScheme" => "SPDX", "schemeUri" => "https://spdx.org/licenses/", }.compact] else [{ "rightsIdentifier" => hsh["id"], "rightsURI" => hsh["url"] }.compact] end end def name_to_fos(name) # first find subject in Fields of Science (OECD) fos = JSON.load(File.read(File.expand_path("../../resources/oecd/fos-mappings.json", __dir__))).fetch("fosFields") subject = fos.find { |l| l["fosLabel"] == name || "FOS: " + l["fosLabel"] == name } if subject return [{ "subject" => sanitize(name).downcase, }, { "subject" => "FOS: " + subject["fosLabel"], "subjectScheme" => "Fields of Science and Technology (FOS)", "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf", }] end # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification) # and map to Fields of Science. Add an extra entry for the latter fores = JSON.load(File.read(File.expand_path("../../resources/oecd/for-mappings.json", __dir__))) for_fields = fores.fetch("forFields") for_disciplines = fores.fetch("forDisciplines") subject = for_fields.find { |l| l["forLabel"] == name } || for_disciplines.find { |l| l["forLabel"] == name } if subject [{ "subject" => sanitize(name).downcase, }, { "subject" => "FOS: " + subject["fosLabel"], "subjectScheme" => "Fields of Science and Technology (FOS)", "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf", }] else [{ "subject" => sanitize(name).downcase }] end end def hsh_to_fos(hsh) # first find subject in Fields of Science (OECD) fos = JSON.load(File.read(File.expand_path("../../resources/oecd/fos-mappings.json", __dir__))).fetch("fosFields") subject = fos.find do |l| l["fosLabel"] == hsh["__content__"] || "FOS: " + l["fosLabel"] == hsh["__content__"] || l["fosLabel"] == hsh["subject"] end if subject return [{ "subject" => sanitize(hsh["__content__"] || hsh["subject"]), "subjectScheme" => hsh["subjectScheme"], "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"], "valueUri" => hsh["valueURI"] || hsh["valueUri"], "classificationCode" => hsh["classificationCode"], "lang" => hsh["lang"], }.compact, { "subject" => "FOS: " + subject["fosLabel"], "subjectScheme" => "Fields of Science and Technology (FOS)", "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf", }.compact] end # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification) # and map to Fields of Science. Add an extra entry for the latter fores = JSON.load(File.read(File.expand_path("../../resources/oecd/for-mappings.json", __dir__))) for_fields = fores.fetch("forFields") for_disciplines = fores.fetch("forDisciplines") # try to extract forId if hsh["subjectScheme"] == "FOR" for_id = hsh["__content__"].to_s.split(" ").first || hsh["subject"].to_s.split(" ").first for_id = for_id.rjust(6, "0") subject = for_fields.find { |l| l["forId"] == for_id } || for_disciplines.find { |l| l["forId"] == for_id[0..3] } else subject = for_fields.find do |l| l["forLabel"] == hsh["__content__"] || l["forLabel"] == hsh["subject"] end || for_disciplines.find do |l| l["forLabel"] == hsh["__content__"] || l["forLabel"] == hsh["subject"] end end if subject [{ "subject" => sanitize(hsh["__content__"] || hsh["subject"]), "subjectScheme" => hsh["subjectScheme"], "classificationCode" => hsh["classificationCode"], "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"], "valueUri" => hsh["valueURI"] || hsh["valueUri"], "lang" => hsh["lang"], }.compact, { "subject" => "FOS: " + subject["fosLabel"], "subjectScheme" => "Fields of Science and Technology (FOS)", "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf", }] else [{ "subject" => sanitize(hsh["__content__"] || hsh["subject"]), "subjectScheme" => hsh["subjectScheme"], "classificationCode" => hsh["classificationCode"], "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"], "valueUri" => hsh["valueURI"] || hsh["valueUri"], "lang" => hsh["lang"], }.compact] end end def encode_doi(prefix, options = {}) # DOI suffix is a generated from a random number, encoded in base32 # suffix has 8 digits plus two checksum digits. With base32 there are # 32 possible digits, so 8 digits gives 32^8 possible combinations if options[:uuid] str = Base32::URL.encode_uuid(options[:uuid], split: 7, checksum: true) else random_int = SecureRandom.random_number(32 ** 7..(32 ** 8) - 1) suffix = Base32::URL.encode(random_int, checksum: true) str = "#{suffix[0, 5]}-#{suffix[5, 10]}" end "https://doi.org/#{prefix}/#{str}" end def decode_doi(doi, options = {}) suffix = doi.split("/", 5).last if options[:uuid] Base32::URL.decode_uuid(suffix) else Base32::URL.decode(suffix) end end def encode_container_id # suffix has 5 digits plus two checksum digits. With base32 there are # 32 possible digits, so 5 digits gives 32^5 possible combinations random_int = SecureRandom.random_number(32 ** 4..(32 ** 5) - 1) Base32::URL.encode(random_int, checksum: true) end def decode_container_id(id) Base32::URL.decode(id) end def json_feed_not_indexed_url(date_indexed) "https://rogue-scholar.org/api/posts/not_indexed/#{date_indexed}" end def json_feed_unregistered_url "https://rogue-scholar.org/api/posts/unregistered" end def json_feed_by_blog_url(blog_id) "https://rogue-scholar.org/api/blogs/#{blog_id}" end end end