# frozen_string_literal: true

module Commonmeta
  module Utils
    NORMALIZED_LICENSES = {
      'https://creativecommons.org/licenses/by/1.0' => 'https://creativecommons.org/licenses/by/1.0/legalcode',
      'https://creativecommons.org/licenses/by/2.0' => 'https://creativecommons.org/licenses/by/2.0/legalcode',
      'https://creativecommons.org/licenses/by/2.5' => 'https://creativecommons.org/licenses/by/2.5/legalcode',
      'https://creativecommons.org/licenses/by/3.0' => 'https://creativecommons.org/licenses/by/3.0/legalcode',
      'https://creativecommons.org/licenses/by/3.0/us' => 'https://creativecommons.org/licenses/by/3.0/legalcode',
      'https://creativecommons.org/licenses/by/4.0' => 'https://creativecommons.org/licenses/by/4.0/legalcode',
      'https://creativecommons.org/licenses/by-nc/1.0' => 'https://creativecommons.org/licenses/by-nc/1.0/legalcode',
      'https://creativecommons.org/licenses/by-nc/2.0' => 'https://creativecommons.org/licenses/by-nc/2.0/legalcode',
      'https://creativecommons.org/licenses/by-nc/2.5' => 'https://creativecommons.org/licenses/by-nc/2.5/legalcode',
      'https://creativecommons.org/licenses/by-nc/3.0' => 'https://creativecommons.org/licenses/by-nc/3.0/legalcode',
      'https://creativecommons.org/licenses/by-nc/4.0' => 'https://creativecommons.org/licenses/by-nc/4.0/legalcode',
      'https://creativecommons.org/licenses/by-nd-nc/1.0' => 'https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode',
      'https://creativecommons.org/licenses/by-nd-nc/2.0' => 'https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode',
      'https://creativecommons.org/licenses/by-nd-nc/2.5' => 'https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode',
      'https://creativecommons.org/licenses/by-nd-nc/3.0' => 'https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode',
      'https://creativecommons.org/licenses/by-nd-nc/4.0' => 'https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-sa/1.0' => 'https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-sa/2.0' => 'https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-sa/2.5' => 'https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode',
      'https://creativecommons.org/licenses/by-nc-sa/3.0' => 'https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-sa/4.0' => 'https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode',
      'https://creativecommons.org/licenses/by-nd/1.0' => 'https://creativecommons.org/licenses/by-nd/1.0/legalcode',
      'https://creativecommons.org/licenses/by-nd/2.0' => 'https://creativecommons.org/licenses/by-nd/2.0/legalcode',
      'https://creativecommons.org/licenses/by-nd/2.5' => 'https://creativecommons.org/licenses/by-nd/2.5/legalcode',
      'https://creativecommons.org/licenses/by-nd/3.0' => 'https://creativecommons.org/licenses/by-nd/3.0/legalcode',
      'https://creativecommons.org/licenses/by-nd/4.0' => 'https://creativecommons.org/licenses/by-nd/4.0/legalcode',
      'https://creativecommons.org/licenses/by-sa/1.0' => 'https://creativecommons.org/licenses/by-sa/1.0/legalcode',
      'https://creativecommons.org/licenses/by-sa/2.0' => 'https://creativecommons.org/licenses/by-sa/2.0/legalcode',
      'https://creativecommons.org/licenses/by-sa/2.5' => 'https://creativecommons.org/licenses/by-sa/2.5/legalcode',
      'https://creativecommons.org/licenses/by-sa/3.0' => 'https://creativecommons.org/licenses/by-sa/3.0/legalcode',
      'https://creativecommons.org/licenses/by-sa/4.0' => 'https://creativecommons.org/licenses/by-sa/4.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-nd/1.0' => 'https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-nd/2.0' => 'https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-nd/2.5' => 'https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode',
      'https://creativecommons.org/licenses/by-nc-nd/3.0' => 'https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode',
      'https://creativecommons.org/licenses/by-nc-nd/4.0' => 'https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode',
      'https://creativecommons.org/licenses/publicdomain' => 'https://creativecommons.org/licenses/publicdomain/',
      'https://creativecommons.org/publicdomain/zero/1.0' => 'https://creativecommons.org/publicdomain/zero/1.0/legalcode'
    }
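    # Illustrative lookup (values taken from the table above), as used by #normalize_cc_url
    # further down: license URLs are normalized to the https legalcode form.
    #
    #   NORMALIZED_LICENSES['https://creativecommons.org/licenses/by/4.0']
    #   # => "https://creativecommons.org/licenses/by/4.0/legalcode"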
    # source: https://www.bibtex.com/e/entry-types/
    BIB_TO_CM_TRANSLATIONS = {
      'article' => 'JournalArticle', 'book' => 'Book', 'booklet' => 'Book', 'inbook' => 'BookChapter',
      'inproceedings' => 'ProceedingsArticle', 'manual' => 'Report', 'mastersthesis' => 'Dissertation',
      'misc' => 'Other', 'phdthesis' => 'Dissertation', 'proceedings' => 'Proceedings',
      'techreport' => 'Report', 'unpublished' => 'Manuscript'
    }

    CM_TO_BIB_TRANSLATIONS = {
      'Article' => 'article', 'Book' => 'book', 'BookChapter' => 'inbook', 'Dissertation' => 'phdthesis',
      'JournalArticle' => 'article', 'Manuscript' => 'unpublished', 'Other' => 'misc',
      'Proceedings' => 'proceedings', 'ProceedingsArticle' => 'inproceedings', 'Report' => 'techreport'
    }

    # source: https://docs.citationstyles.org/en/stable/specification.html?highlight=book#appendix-iii-types
    CSL_TO_CM_TRANSLATIONS = {
      'article' => 'Article', 'article-journal' => 'JournalArticle', 'article-magazine' => 'Article',
      'article-newspaper' => 'Article', 'bill' => 'LegalDocument', 'book' => 'Book',
      'broadcast' => 'Audiovisual', 'chapter' => 'BookChapter', 'classic' => 'Book',
      'collection' => 'Collection', 'dataset' => 'Dataset', 'document' => 'Document',
      'entry' => 'Entry', 'entry-dictionary' => 'Entry', 'entry-encyclopedia' => 'Entry',
      'event' => 'Event', 'figure' => 'Figure', 'graphic' => 'Image', 'hearing' => 'LegalDocument',
      'interview' => 'Document', 'legal_case' => 'LegalDocument', 'legislation' => 'LegalDocument',
      'manuscript' => 'Manuscript', 'map' => 'Map', 'motion_picture' => 'Audiovisual',
      'musical_score' => 'Document', 'pamphlet' => 'Document', 'paper-conference' => 'ProceedingsArticle',
      'patent' => 'Patent', 'performance' => 'Performance', 'periodical' => 'Journal',
      'personal_communication' => 'PersonalCommunication', 'post' => 'Post', 'post-weblog' => 'Article',
      'regulation' => 'LegalDocument', 'report' => 'Report', 'review' => 'Review',
      'review-book' => 'Review', 'software' => 'Software', 'song' => 'Audiovisual',
      'speech' => 'Speech', 'standard' => 'Standard', 'thesis' => 'Dissertation',
      'treaty' => 'LegalDocument', 'webpage' => 'WebPage'
    }

    CM_TO_CSL_TRANSLATIONS = {
      'Article' => 'article', 'JournalArticle' => 'article-journal', 'Book' => 'book',
      'BookChapter' => 'chapter', 'Collection' => 'collection', 'Dataset' => 'dataset',
      'Document' => 'document', 'Entry' => 'entry', 'Event' => 'event', 'Figure' => 'figure',
      'Image' => 'graphic', 'LegalDocument' => 'legal_case', 'Manuscript' => 'manuscript',
      'Map' => 'map', 'Audiovisual' => 'motion_picture', 'Patent' => 'patent',
      'Performance' => 'performance', 'Journal' => 'periodical',
      'PersonalCommunication' => 'personal_communication', 'Post' => 'post', 'Report' => 'report',
      'Review' => 'review', 'Software' => 'software', 'Speech' => 'speech', 'Standard' => 'standard',
      'Dissertation' => 'thesis', 'WebPage' => 'webpage'
    }

    # source: http://api.crossref.org/types
    CR_TO_CM_TRANSLATIONS = {
      'BookChapter' => 'BookChapter', 'BookPart' => 'BookPart', 'BookSection' => 'BookSection',
      'BookSeries' => 'BookSeries', 'BookSet' => 'BookSet', 'BookTrack' => 'BookTrack',
      'Book' => 'Book', 'Component' => 'Component', 'Database' => 'Database', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'EditedBook' => 'EditedBook', 'Grant' => 'Grant',
      'JournalArticle' => 'JournalArticle', 'JournalIssue' => 'JournalIssue',
      'JournalVolume' => 'JournalVolume', 'Journal' => 'Journal', 'Monograph' => 'Book',
      'Other' => 'Other', 'PeerReview' => 'PeerReview', 'PostedContent' => 'Article',
      'ProceedingsArticle' => 'ProceedingsArticle', 'ProceedingsSeries' => 'ProceedingsSeries',
      'Proceedings' => 'Proceedings', 'ReferenceBook' => 'ReferenceBook', 'ReferenceEntry' => 'Entry',
      'ReportComponent' => 'ReportComponent', 'ReportSeries' => 'ReportSeries', 'Report' => 'Report',
      'Standard' => 'Standard'
    }

    CM_TO_CR_TRANSLATIONS = {
      'Article' => 'PostedContent', 'BookChapter' => 'BookChapter', 'BookSeries' => 'BookSeries',
      'Book' => 'Book', 'Component' => 'Component', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'Grant' => 'Grant', 'JournalArticle' => 'JournalArticle',
      'JournalIssue' => 'JournalIssue', 'JournalVolume' => 'JournalVolume', 'Journal' => 'Journal',
      'ProceedingsArticle' => 'ProceedingsArticle', 'ProceedingsSeries' => 'ProceedingsSeries',
      'Proceedings' => 'Proceedings', 'ReportComponent' => 'ReportComponent',
      'ReportSeries' => 'ReportSeries', 'Report' => 'Report', 'PeerReview' => 'PeerReview',
      'Other' => 'Other'
    }

    # source: https://github.com/datacite/schema/blob/master/source/meta/kernel-4/include/datacite-resourceType-v4.xsd
    DC_TO_CM_TRANSLATIONS = {
      'Audiovisual' => 'Audiovisual', 'BlogPosting' => 'Article', 'Book' => 'Book',
      'BookChapter' => 'BookChapter', 'Collection' => 'Collection',
      'ComputationalNotebook' => 'ComputationalNotebook', 'ConferencePaper' => 'ProceedingsArticle',
      'ConferenceProceeding' => 'Proceedings', 'DataPaper' => 'JournalArticle', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'Event' => 'Event', 'Image' => 'Image',
      'InteractiveResource' => 'InteractiveResource', 'Journal' => 'Journal',
      'JournalArticle' => 'JournalArticle', 'Model' => 'Model',
      'OutputManagementPlan' => 'OutputManagementPlan', 'PeerReview' => 'PeerReview',
      'PhysicalObject' => 'PhysicalObject', 'Poster' => 'Speech', 'Preprint' => 'Article',
      'Report' => 'Report', 'Service' => 'Service', 'Software' => 'Software', 'Sound' => 'Sound',
      'Standard' => 'Standard', 'Text' => 'Document', 'Thesis' => 'Dissertation',
      'Workflow' => 'Workflow', 'Other' => 'Other'
    }

    CM_TO_DC_TRANSLATIONS = {
      'Article' => 'Preprint', 'Audiovisual' => 'Audiovisual', 'Book' => 'Book',
      'BookChapter' => 'BookChapter', 'Collection' => 'Collection', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'Document' => 'Text', 'Entry' => 'Text', 'Event' => 'Event',
      'Figure' => 'Image', 'Image' => 'Image', 'JournalArticle' => 'JournalArticle',
      'LegalDocument' => 'Text', 'Manuscript' => 'Text', 'Map' => 'Image', 'Patent' => 'Text',
      'Performance' => 'Audiovisual', 'PersonalCommunication' => 'Text', 'Post' => 'Text',
      'ProceedingsArticle' => 'ConferencePaper', 'Proceedings' => 'ConferenceProceeding',
      'Report' => 'Report', 'PeerReview' => 'PeerReview', 'Software' => 'Software',
      'Sound' => 'Sound', 'Standard' => 'Standard', 'WebPage' => 'Text'
    }

    RIS_TO_CM_TRANSLATIONS = {
      'ABST' => 'Text', 'ADVS' => 'Text', 'AGGR' => 'Text', 'ANCIENT' => 'Text', 'ART' => 'Text',
      'BILL' => 'Text', 'BLOG' => 'Text', 'BOOK' => 'Book', 'CASE' => 'Text', 'CHAP' => 'BookChapter',
      'CHART' => 'Text', 'CLSWK' => 'Text', 'CTLG' => 'Collection', 'COMP' => 'Software',
      'DATA' => 'Dataset', 'DBASE' => 'Database', 'DICT' => 'Dictionary', 'EBOOK' => 'Book',
      'ECHAP' => 'BookChapter', 'EDBOOK' => 'Book', 'EJOUR' => 'JournalArticle', 'ELEC' => 'Text',
      'ENCYC' => 'Encyclopedia', 'EQUA' => 'Equation', 'FIGURE' => 'Image', 'GEN' => 'CreativeWork',
      'GOVDOC' => 'GovernmentDocument', 'GRANT' => 'Grant', 'HEAR' => 'Hearing', 'ICOMM' => 'Text',
      'INPR' => 'Text', 'JFULL' => 'JournalArticle', 'JOUR' => 'JournalArticle',
      'LEGAL' => 'LegalRuleOrRegulation', 'MANSCPT' => 'Text', 'MAP' => 'Map',
      'MGZN' => 'MagazineArticle', 'MPCT' => 'Audiovisual', 'MULTI' => 'Audiovisual',
      'MUSIC' => 'MusicScore', 'NEWS' => 'NewspaperArticle', 'PAMP' => 'Pamphlet', 'PAT' => 'Patent',
      'PCOMM' => 'PersonalCommunication', 'RPRT' => 'Report', 'SER' => 'SerialPublication',
      'SLIDE' => 'Slide', 'SOUND' => 'SoundRecording', 'STAND' => 'Standard', 'THES' => 'Dissertation',
      'UNBILL' => 'UnenactedBill', 'UNPB' => 'UnpublishedWork', 'VIDEO' => 'Audiovisual',
      'WEB' => 'WebPage'
    }

    CM_TO_RIS_TRANSLATIONS = {
      'Article' => 'JOUR', 'Audiovisual' => 'VIDEO', 'Book' => 'BOOK', 'BookChapter' => 'CHAP',
      'Collection' => 'CTLG', 'Dataset' => 'DATA', 'Dissertation' => 'THES', 'Document' => 'GEN',
      'Entry' => 'DICT', 'Event' => 'GEN', 'Figure' => 'FIGURE', 'Image' => 'FIGURE',
      'JournalArticle' => 'JOUR', 'LegalDocument' => 'GEN', 'Manuscript' => 'GEN', 'Map' => 'MAP',
      'Patent' => 'PAT', 'Performance' => 'GEN', 'PersonalCommunication' => 'PCOMM', 'Post' => 'GEN',
      'ProceedingsArticle' => 'CPAPER', 'Proceedings' => 'CONF', 'Report' => 'RPRT',
      'Review' => 'GEN', 'Software' => 'COMP', 'Sound' => 'SOUND', 'Standard' => 'STAND',
      'WebPage' => 'WEB'
    }

    SO_TO_CM_TRANSLATIONS = {
      'Article' => 'Article', 'BlogPosting' => 'Article', 'Book' => 'Book',
      'BookChapter' => 'BookChapter', 'CreativeWork' => 'Other', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'NewsArticle' => 'Article', 'Legislation' => 'LegalDocument',
      'ScholarlyArticle' => 'JournalArticle', 'SoftwareSourceCode' => 'Software'
    }

    CM_TO_SO_TRANSLATIONS = {
      'Article' => 'Article', 'Audiovisual' => 'CreativeWork', 'Book' => 'Book',
      'BookChapter' => 'BookChapter', 'Collection' => 'CreativeWork', 'Dataset' => 'Dataset',
      'Dissertation' => 'Dissertation', 'Document' => 'CreativeWork', 'Entry' => 'CreativeWork',
      'Event' => 'CreativeWork', 'Figure' => 'CreativeWork', 'Image' => 'CreativeWork',
      'JournalArticle' => 'ScholarlyArticle', 'LegalDocument' => 'Legislation',
      'Software' => 'SoftwareSourceCode'
    }

    CM_TO_JATS_TRANSLATIONS = {
      'Proceedings' => 'working-paper', 'ReferenceBook' => 'book', 'JournalIssue' => 'journal',
      'ProceedingsArticle' => 'working-paper', 'Other' => nil, 'Dissertation' => nil,
      'Dataset' => 'data', 'Document' => 'journal', 'EditedBook' => 'book',
      'JournalArticle' => 'journal', 'Journal' => 'journal', 'Report' => 'report',
      'BookSeries' => 'book', 'ReportSeries' => 'report', 'BookTrack' => 'book',
      'Standard' => 'standard', 'BookSection' => 'chapter', 'BookPart' => 'chapter',
      'Book' => 'book', 'BookChapter' => 'chapter', 'StandardSeries' => 'standard',
      'Monograph' => 'book', 'Component' => nil, 'ReferenceEntry' => nil,
      'JournalVolume' => 'journal', 'BookSet' => 'book', 'Article' => 'journal',
      'Software' => 'software'
    }
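    # Illustrative use of the translation tables above (values taken from the tables):
    # commonmeta types are mapped to and from the vocabularies of other metadata formats.
    #
    #   CSL_TO_CM_TRANSLATIONS['article-journal']  # => "JournalArticle"
    #   CM_TO_BIB_TRANSLATIONS['JournalArticle']   # => "article"
    #   CM_TO_JATS_TRANSLATIONS['JournalArticle']  # => "journal"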
    UNKNOWN_INFORMATION = {
      ':unac' => 'temporarily inaccessible',
      ':unal' => 'unallowed, suppressed intentionally',
      ':unap' => 'not applicable, makes no sense',
      ':unas' => 'value unassigned (e.g., Untitled)',
      ':unav' => 'value unavailable, possibly unknown',
      ':unkn' => 'known to be unknown (e.g., Anonymous, Inconnue)',
      ':none' => 'never had a value, never will',
      ':null' => 'explicitly and meaningfully empty',
      ':tba' => 'to be assigned or announced later',
      ':etal' => 'too numerous to list (et alia)'
    }

    def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
      if id.present?
        find_from_format_by_id(id)
      elsif string.present? && ext.present?
        find_from_format_by_ext(string, ext: ext)
      elsif string.present?
        find_from_format_by_string(string)
      elsif filename.present?
        find_from_format_by_filename(filename)
      else
        'datacite'
      end
    end

    def find_from_format_by_id(id)
      id = normalize_id(id)

      if %r{\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(id)
        ra = get_doi_ra(id)
        %w[DataCite Crossref mEDRA KISTI JaLC OP].include?(ra) ? ra.downcase : nil
      elsif %r{\A(?:(http|https):/(/)?orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z}.match?(id)
        'orcid'
      elsif %r{\A(http|https):/(/)?github\.com/(.+)/package.json\z}.match?(id)
        'npm'
      elsif %r{\A(http|https):/(/)?github\.com/(.+)/codemeta.json\z}.match?(id)
        'codemeta'
      elsif %r{\A(http|https):/(/)?github\.com/(.+)/CITATION.cff\z}.match?(id)
        'cff'
      elsif %r{\A(http|https):/(/)?github\.com/(.+)\z}.match?(id)
        'cff'
      else
        'schema_org'
      end
    end

    def find_from_format_by_filename(filename)
      if filename == 'package.json'
        'npm'
      elsif filename == 'CITATION.cff'
        'cff'
      end
    end

    def find_from_format_by_ext(string, options = {})
      case options[:ext]
      when '.bib'
        'bibtex'
      when '.ris'
        'ris'
      when '.xml', '.json'
        find_from_format_by_string(string)
      end
    end

    def find_from_format_by_string(string)
      begin
        # try to parse as JSON
        hsh = MultiJson.load(string).to_h
        if hsh.dig('@context') && URI.parse(hsh.dig('@context')).host == 'schema.org'
          return 'schema_org'
        elsif hsh.dig('schemaVersion').to_s.start_with?('http://datacite.org/schema/kernel')
          return 'datacite'
        elsif hsh.dig('source') == 'Crossref'
          return 'crossref'
        elsif hsh.dig('issued', 'date-parts').present?
          return 'csl'
        elsif URI.parse(hsh.dig('@context')).to_s == 'https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld'
          return 'codemeta'
        end
      rescue MultiJson::ParseError
      end
      begin
        # try to parse as XML
        hsh = Hash.from_xml(string)
        return 'crossref_xml' if hsh.to_h.dig('crossref_result').present?
      rescue Nokogiri::XML::SyntaxError
      end
      begin
        # try to parse as YAML
        hsh = YAML.load(string, permitted_classes: [Date])
        return 'cff' if hsh.is_a?(Hash) && hsh.fetch('cff-version', nil).present?
      rescue Psych::SyntaxError
      end

      if string.start_with?('TY - ')
        'ris'
      elsif BibTeX.parse(string).first
        'bibtex'
      end
    end

    def orcid_from_url(url)
      Array(%r{\A(?:http|https)://orcid\.org/(.+)}.match(url)).last
    end

    def orcid_as_url(orcid)
      "https://orcid.org/#{orcid}" if orcid.present?
    end

    def validate_orcid(orcid)
      orcid = Array(%r{\A(?:(?:http|https)://(?:(?:www|sandbox)?\.)?orcid\.org/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X]+)\z}.match(orcid)).last
      orcid.gsub(/[[:space:]]/, '-') if orcid.present?
    end
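    # Illustrative example: a full ORCID URL is reduced to the bare identifier, which
    # #normalize_orcid below turns back into a canonical https URL.
    #
    #   validate_orcid('https://orcid.org/0000-0003-1419-2405')
    #   # => "0000-0003-1419-2405"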
    def validate_orcid_scheme(orcid_scheme)
      Array(%r{\A(http|https)://(www\.)?(orcid\.org)}.match(orcid_scheme)).last
    end

    def validate_url(str)
      if %r{\A(?:(http|https)://(dx\.)?doi.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(str)
        'DOI'
      elsif %r{\A(http|https)://}.match?(str)
        'URL'
      elsif /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\z/.match?(str)
        'ISSN'
      end
    end

    def parse_attributes(element, options = {})
      content = options[:content] || '__content__'

      if element.is_a?(String) && options[:content].nil?
        CGI.unescapeHTML(element)
      elsif element.is_a?(Hash)
        element.fetch(CGI.unescapeHTML(content), nil)
      elsif element.is_a?(Array)
        a = element.map { |e| e.is_a?(Hash) ? e.fetch(CGI.unescapeHTML(content), nil) : e }.uniq
        a = options[:first] ? a.first : a.unwrap
      end
    end

    def normalize_id(id, options = {})
      return nil unless id.present?

      # check for valid DOI
      doi = normalize_doi(id, options)
      return doi if doi.present?

      # check for valid HTTP uri
      uri = Addressable::URI.parse(id)
      return nil unless uri && uri.host && %w[http https].include?(uri.scheme)

      # clean up URL
      PostRank::URI.clean(id)
    rescue Addressable::URI::InvalidURIError
      nil
    end

    def normalize_url(id, options = {})
      return nil unless id.present?

      # handle info URIs
      return id if id.to_s.start_with?('info')

      # check for valid HTTP uri
      uri = Addressable::URI.parse(id)
      return nil unless uri && uri.host && %w[http https ftp].include?(uri.scheme)

      # optionally turn into https URL
      uri.scheme = 'https' if options[:https]

      # clean up URL
      uri.path = PostRank::URI.clean(uri.path)
      uri.to_s
    rescue Addressable::URI::InvalidURIError
      nil
    end

    def normalize_cc_url(id)
      id = normalize_url(id, https: true)
      NORMALIZED_LICENSES.fetch(id, id)
    end

    def normalize_orcid(orcid)
      orcid = validate_orcid(orcid)
      return nil unless orcid.present?

      # turn ORCID ID into URL
      'https://orcid.org/' + Addressable::URI.encode(orcid)
    end

    # pick electronic issn if there are multiple
    # format issn as xxxx-xxxx
    def normalize_issn(input, options = {})
      content = options[:content] || '__content__'
      issn = if input.blank?
               nil
             elsif input.is_a?(String) && options[:content].nil?
               input
             elsif input.is_a?(Hash)
               input.fetch(content, nil)
             elsif input.is_a?(Array)
               a = input.find { |a| a['media_type'] == 'electronic' } || input.first
               a.fetch(content, nil)
             end
      case issn.to_s.length
      when 9
        issn
      when 8
        issn[0..3] + '-' + issn[4..7]
      end
    end
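    # Illustrative examples (hypothetical input hashes): the electronic ISSN wins when
    # several are given, and an 8-character ISSN is reformatted as xxxx-xxxx.
    #
    #   normalize_issn('23639993')
    #   # => "2363-9993"
    #   normalize_issn([{ 'media_type' => 'print', '__content__' => '1234-5678' },
    #                   { 'media_type' => 'electronic', '__content__' => '2363-9993' }])
    #   # => "2363-9993"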
    # find Creative Commons or OSI license in licenses array, normalize url and name
    def normalize_licenses(licenses)
      standard_licenses = Array.wrap(licenses).map do |l|
        URI.parse(l['url'])
      end.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
      return licenses unless standard_licenses.present?

      # normalize each license URL that uses a standard host
      standard_licenses.map do |uri|
        # use HTTPS
        uri.scheme = 'https'

        # use host name without subdomain
        uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last

        # normalize URLs
        if uri.host == 'creativecommons.org'
          uri.path = uri.path.split('/')[0..-2].join('/') if uri.path.split('/').last == 'legalcode'
          uri.path << '/' unless uri.path.end_with?('/')
        else
          uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
          uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
          uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
          uri.path = uri.path.sub(/([^0-9-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
            m = Regexp.last_match
            text = m[1]
            if m[3].present?
              version = [m[3], m[5].presence || '0'].join('.')
              [text, version].join('-')
            else
              text
            end
          end
        end
        uri.to_s
      end
    rescue URI::InvalidURIError
      nil
    end

    def to_datacite(element, options = {})
      a = Array.wrap(element).map do |e|
        e.each_with_object({}) do |(k, v), h|
          h[k.dasherize] = v
        end
      end
      options[:first] ? a.unwrap : a.presence
    end

    def from_datacite(element)
      mapping = { 'nameType' => 'type', 'creatorName' => 'name' }

      map_hash_keys(element: element, mapping: mapping)
    end

    def to_schema_org(element)
      mapping = { 'type' => '@type', 'id' => '@id', 'title' => 'name' }

      map_hash_keys(element: element, mapping: mapping)
    end

    def to_schema_org_container(element, options = {})
      return nil unless element.is_a?(Hash) || (element.nil? && options[:container_title].present?)

      issn = element['identifier'] if element['identifierType'] == 'ISSN'
      id = issn.blank? ? element['identifier'] : nil
      name = options[:container_title] || element['title']
      type = id || name ? options[:type] || element['type'] : nil

      {
        '@id' => id,
        '@type' => type,
        'name' => name,
        'issn' => issn
      }.compact
    end

    def to_schema_org_identifiers(element, _options = {})
      Array.wrap(element).map do |ai|
        {
          '@type' => 'PropertyValue',
          'propertyID' => ai['alternateIdentifierType'],
          'value' => ai['alternateIdentifier']
        }
      end.unwrap
    end

    def to_schema_org_relation(related_identifiers: nil, relation_type: nil)
      return nil unless related_identifiers.present? && relation_type.present?

      relation_type = if relation_type == 'References'
                        %w[References Cites Documents]
                      else
                        [relation_type]
                      end

      Array.wrap(related_identifiers).select do |ri|
        relation_type.include?(ri['relationType'])
      end.map do |r|
        if r['relatedIdentifierType'] == 'ISSN' && r['relationType'] == 'IsPartOf'
          { '@type' => 'Periodical', 'issn' => r['relatedIdentifier'] }.compact
        else
          { '@id' => normalize_id(r['relatedIdentifier']),
            '@type' => DC_TO_SO_TRANSLATIONS[r['resourceTypeGeneral']] || 'CreativeWork' }.compact
        end
      end.unwrap
    end

    def to_schema_org_funder(funding_references)
      return nil unless funding_references.present?

      Array.wrap(funding_references).map do |fr|
        {
          '@id' => fr['funderIdentifier'],
          '@type' => 'Organization',
          'name' => fr['funderName']
        }.compact
      end.unwrap
    end

    def to_schema_org_citation(reference)
      return nil unless reference.present?

      {
        '@type' => 'CreativeWork',
        '@id' => reference['doi'] ? normalize_id(reference['doi']) : nil,
        'name' => reference['title'],
        'datePublished' => reference['publicationYear']
      }.compact
    end
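    # Illustrative example (hypothetical funding reference): funding references are mapped
    # to schema.org Organization nodes, and a single result is unwrapped from the array.
    #
    #   to_schema_org_funder([{ 'funderName' => 'European Commission',
    #                           'funderIdentifier' => 'https://doi.org/10.13039/501100000780' }])
    #   # => { "@id" => "https://doi.org/10.13039/501100000780",
    #   #      "@type" => "Organization", "name" => "European Commission" }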
    def to_schema_org_spatial_coverage(geo_location)
      return nil unless geo_location.present?

      Array.wrap(geo_location).each_with_object([]) do |gl, sum|
        if gl.fetch('geoLocationPoint', nil)
          sum << { '@type' => 'Place',
                   'geo' => {
                     '@type' => 'GeoCoordinates',
                     'address' => gl['geoLocationPlace'],
                     'latitude' => gl.dig('geoLocationPoint', 'pointLatitude'),
                     'longitude' => gl.dig('geoLocationPoint', 'pointLongitude')
                   } }.compact
        end
        if gl.fetch('geoLocationBox', nil)
          sum << { '@type' => 'Place',
                   'geo' => {
                     '@type' => 'GeoShape',
                     'address' => gl['geoLocationPlace'],
                     'box' => [gl.dig('geoLocationBox', 'southBoundLatitude'),
                               gl.dig('geoLocationBox', 'westBoundLongitude'),
                               gl.dig('geoLocationBox', 'northBoundLatitude'),
                               gl.dig('geoLocationBox', 'eastBoundLongitude')].compact.join(' ').presence
                   }.compact }.compact
        end
        if gl.fetch('geoLocationPolygon', nil)
          sum << { '@type' => 'Place',
                   'geo' => {
                     '@type' => 'GeoShape',
                     'address' => gl['geoLocationPlace'],
                     'polygon' => Array.wrap(gl.dig('geoLocationPolygon')).map do |glp|
                       Array.wrap(glp).map do |glpp|
                         [glpp.dig('polygonPoint', 'pointLongitude'),
                          glpp.dig('polygonPoint', 'pointLatitude')].compact
                       end.compact
                     end.compact.presence
                   } }
        end
        next unless gl.fetch('geoLocationPlace', nil) && !gl.fetch('geoLocationPoint', nil) &&
                    !gl.fetch('geoLocationBox', nil) && !gl.fetch('geoLocationPolygon', nil)

        sum << { '@type' => 'Place',
                 'geo' => {
                   '@type' => 'GeoCoordinates',
                   'address' => gl['geoLocationPlace']
                 } }.compact
      end.unwrap
    end

    def from_schema_org(element)
      mapping = { '@type' => 'type', '@id' => 'id' }

      map_hash_keys(element: element, mapping: mapping)
    end

    def map_hash_keys(element: nil, mapping: nil)
      Array.wrap(element).map do |a|
        a.map { |k, v| [mapping.fetch(k, k), v] }.reduce({}) do |hsh, (k, v)|
          if k == 'affiliation' && v.is_a?(Array)
            hsh[k] = v.map do |affiliation|
              if affiliation.is_a?(Hash)
                affiliation.merge('@type' => 'Organization')
              else
                affiliation
              end
            end
            hsh
          elsif k == 'type' && v.is_a?(String)
            hsh[k] = v.capitalize
            hsh
          elsif v.is_a?(Hash)
            hsh[k] = to_schema_org(v)
            hsh
          else
            hsh[k] = v
            hsh
          end
        end
      end.unwrap
    end

    def to_identifier(identifier)
      {
        '@type' => 'PropertyValue',
        'propertyID' => identifier['relatedIdentifierType'],
        'value' => identifier['relatedIdentifier']
      }
    end

    def from_csl(element)
      Array.wrap(element).map do |a|
        if a['literal'].present?
          a['type'] = 'Organization'
          a['name'] = a['literal']
        elsif a['name'].present?
          a['type'] = 'Organization'
        elsif a['given'].present? || a['family'].present?
          a['type'] = 'Person'
        end
        a['givenName'] = a['given']
        a['familyName'] = a['family']
        a.except('given', 'family', 'literal').compact
      end.unwrap
    end

    def to_csl(element)
      Array.wrap(element).map do |a|
        a['family'] = a['familyName']
        a['given'] = a['givenName']
        a['literal'] = a['name'] unless a['familyName'].present?
        a.except('nameType', 'type', '@type', 'id', '@id', 'name', 'familyName', 'givenName',
                 'affiliation', 'contributorType').compact
      end.presence
    end

    def to_ris(element)
      Array.wrap(element).map do |a|
        if a['familyName'].present?
          [a['familyName'], a['givenName']].join(', ')
        else
          a['name']
        end
      end.unwrap
    end
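    # Illustrative example (hypothetical contributor hashes): personal names become
    # "family, given" strings for RIS, while organizational names are passed through.
    #
    #   to_ris([{ 'familyName' => 'Fenner', 'givenName' => 'Martin' }, { 'name' => 'DataCite' }])
    #   # => ["Fenner, Martin", "DataCite"]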
    def sanitize(text, options = {})
      options[:tags] ||= Set.new(%w[strong em b i code pre sub sup br])
      content = options[:content] || '__content__'
      custom_scrubber = Commonmeta::WhitelistScrubber.new(options)

      if text.is_a?(String)
        # remove excessive internal whitespace with squish
        Loofah.scrub_fragment(text, custom_scrubber).to_s.squish
      elsif text.is_a?(Hash)
        sanitize(text.fetch(content, nil))
      elsif text.is_a?(Array)
        a = text.map { |e| e.is_a?(Hash) ? sanitize(e.fetch(content, nil)) : sanitize(e) }.uniq
        a = options[:first] ? a.first : a.unwrap
      end
    end

    def github_from_url(url)
      return {} unless %r{\Ahttps://github\.com/(.+)(?:/)?(.+)?(?:/tree/)?(.*)\z}.match?(url)

      words = URI.parse(url).path[1..-1].split('/')
      path = words.length > 3 ? words[4...words.length].join('/') : nil

      { owner: words[0],
        repo: words[1],
        release: words[3],
        path: path }.compact
    end

    def github_repo_from_url(url)
      github_from_url(url).fetch(:repo, nil)
    end

    def github_release_from_url(url)
      github_from_url(url).fetch(:release, nil)
    end

    def github_owner_from_url(url)
      github_from_url(url).fetch(:owner, nil)
    end

    def github_as_owner_url(url)
      github_hash = github_from_url(url)
      "https://github.com/#{github_hash[:owner]}" if github_hash[:owner].present?
    end

    def github_as_repo_url(url)
      github_hash = github_from_url(url)
      return unless github_hash[:repo].present?

      "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}"
    end

    def github_as_release_url(url)
      github_hash = github_from_url(url)
      return unless github_hash[:release].present?

      "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}/tree/#{github_hash[:release]}"
    end

    def github_as_codemeta_url(url)
      github_hash = github_from_url(url)

      if github_hash[:path].to_s.end_with?('codemeta.json')
        "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}"
      elsif github_hash[:owner].present?
        "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/master/codemeta.json"
      end
    end

    def github_as_cff_url(url)
      github_hash = github_from_url(url)

      if github_hash[:path].to_s.end_with?('CITATION.cff')
        "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}"
      elsif github_hash[:owner].present?
        "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/main/CITATION.cff"
      end
    end

    def get_date_parts(iso8601_time)
      return { 'date-parts' => [[]] } if iso8601_time.nil?

      year = iso8601_time[0..3].to_i
      month = iso8601_time[5..6].to_i
      day = iso8601_time[8..9].to_i

      { 'date-parts' => [[year, month, day].reject { |part| part == 0 }] }
    rescue TypeError
      nil
    end

    def get_date_from_date_parts(date_as_parts)
      date_parts = date_as_parts.fetch('date-parts', []).first
      return nil if date_parts == [nil]

      year = date_parts[0]
      month = date_parts[1]
      day = date_parts[2]
      get_date_from_parts(year, month, day)
    rescue NoMethodError # if date_parts is nil
      nil
    end

    def get_date_from_parts(year, month = nil, day = nil)
      [year.to_s.rjust(4, '0'), month.to_s.rjust(2, '0'), day.to_s.rjust(2, '0')].reject do |part|
        part == '00'
      end.join('-')
    end

    def get_date_parts_from_parts(year, month = nil, day = nil)
      { 'date-parts' => [[year.to_i, month.to_i, day.to_i].reject { |part| part == 0 }] }
    end

    def get_iso8601_date(iso8601_time)
      return nil if iso8601_time.nil? || iso8601_time.length < 4

      case iso8601_time.length
      when 4
        iso8601_time[0..3]
      when 7
        iso8601_time[0..6]
      else
        iso8601_time[0..9]
      end
    end

    def get_year_month(iso8601_time)
      return [] if iso8601_time.nil?

      year = iso8601_time[0..3]
      month = iso8601_time[5..6]

      [year.to_i, month.to_i].reject { |part| part == 0 }
    end
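    # Illustrative examples: GitHub URLs are decomposed into their parts, and ISO 8601
    # timestamps are split into CSL-style date parts.
    #
    #   github_from_url('https://github.com/datacite/maremma')
    #   # => { owner: "datacite", repo: "maremma" }
    #   get_date_parts('2016-12-20T18:30:00Z')
    #   # => { "date-parts" => [[2016, 12, 20]] }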
    def get_year_month_day(iso8601_time)
      return [] if iso8601_time.nil?

      year = iso8601_time[0..3]
      month = iso8601_time[5..6]
      day = iso8601_time[8..9]

      [year.to_i, month.to_i, day.to_i].reject { |part| part == 0 }
    end

    # parsing of incomplete iso8601 timestamps such as 2015-04 is broken
    # in standard library
    # return nil if invalid iso8601 timestamp
    def get_datetime_from_iso8601(iso8601_time)
      ISO8601::DateTime.new(iso8601_time).to_time.utc
    rescue StandardError
      nil
    end

    # strip milliseconds if there is a time, as it interferes with edtf parsing
    # keep dates unchanged
    def strip_milliseconds(iso8601_time)
      return iso8601_time.split(' ').first if iso8601_time.to_s.include? ' '
      return iso8601_time.split('.').first + 'Z' if iso8601_time.to_s.include? '.'

      iso8601_time
    end

    # iso8601 datetime without hyphens and colons, used by Crossref
    # return nil if invalid
    def get_datetime_from_time(time)
      DateTime.strptime(time.to_s, '%Y%m%d%H%M%S').strftime('%Y-%m-%dT%H:%M:%SZ')
    rescue ArgumentError
      nil
    end

    def get_date(dates, date_type)
      dd = Array.wrap(dates).find { |d| d['dateType'] == date_type } || {}
      dd.fetch('date', nil)
    end

    # convert commonmeta dates to DataCite format
    def get_dates_from_date(date)
      return nil if date.nil?

      mapping = { 'published' => 'issued' }
      date = map_hash_keys(element: date, mapping: mapping)
      date.map do |k, v|
        { 'date' => v,
          'dateType' => k.capitalize }
      end
    end

    def get_contributor(contributor, contributor_type)
      contributor.select { |c| c['contributorType'] == contributor_type }
    end

    def get_identifier(identifiers, identifier_type)
      id = Array.wrap(identifiers).find { |i| i['identifierType'] == identifier_type } || {}
      id.fetch('identifier', nil)
    end

    def get_identifier_type(identifier_type)
      return nil unless identifier_type.present?

      identifierTypes = {
        'ark' => 'ARK', 'arxiv' => 'arXiv', 'bibcode' => 'bibcode', 'doi' => 'DOI',
        'ean13' => 'EAN13', 'eissn' => 'EISSN', 'handle' => 'Handle', 'igsn' => 'IGSN',
        'isbn' => 'ISBN', 'issn' => 'ISSN', 'istc' => 'ISTC', 'lissn' => 'LISSN',
        'lsid' => 'LSID', 'pmid' => 'PMID', 'purl' => 'PURL', 'upc' => 'UPC',
        'url' => 'URL', 'urn' => 'URN', 'md5' => 'md5', 'minid' => 'minid',
        'dataguid' => 'dataguid'
      }

      identifierTypes[identifier_type.downcase] || identifier_type
    end

    def get_series_information(str)
      return {} unless str.present?

      str = str.split(',').map(&:strip)

      title = str.first
      volume_issue = str.length > 2 ? str[1].rpartition(/\(([^)]+)\)/) : nil
      volume = volume_issue.present? ? volume_issue[0].presence || volume_issue[2].presence : nil
      issue = volume_issue.present? ? volume_issue[1][1...-1].presence : nil
      pages = str.length > 1 ? str.last : nil
      first_page = pages.present? ? pages.split('-').map(&:strip)[0] : nil
      last_page = pages.present? ? pages.split('-').map(&:strip)[1] : nil

      {
        'title' => title,
        'volume' => volume,
        'issue' => issue,
        'firstPage' => first_page,
        'lastPage' => last_page
      }.compact
    end
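    # Illustrative example (hypothetical input string): series information of the form
    # "title, volume(issue), pages" is split into its components.
    #
    #   get_series_information('Journal of Insignificant Results, 11(3), 45-67')
    #   # => { "title" => "Journal of Insignificant Results", "volume" => "11",
    #   #      "issue" => "3", "firstPage" => "45", "lastPage" => "67" }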
    def jsonlint(json)
      return ['No JSON provided'] unless json.present?

      error_array = []
      linter = JsonLint::Linter.new
      linter.send(:check_data, json, error_array)
      error_array
    end

    def name_to_spdx(name)
      spdx = JSON.load(File.read(File.expand_path('../../resources/spdx/licenses.json',
                                                  __dir__))).fetch('licenses')
      license = spdx.find do |l|
        l['name'] == name || l['licenseId'] == name || l['seeAlso'].first == normalize_cc_url(name)
      end

      if license
        { 'id' => license['licenseId'],
          'url' => license['seeAlso'].first }.compact
      else
        { 'rights' => name }
      end
    end

    def hsh_to_spdx(hsh)
      spdx = JSON.load(File.read(File.expand_path('../../resources/spdx/licenses.json',
                                                  __dir__))).fetch('licenses')
      hsh['rightsUri'] = hsh.delete('rightsURI') if hsh['rightsUri'].blank?
      license = spdx.find do |l|
        l['licenseId'].casecmp?(hsh['rightsIdentifier']) ||
          l['seeAlso'].first == normalize_cc_url(hsh['rightsUri']) ||
          l['name'] == hsh['rights'] ||
          l['seeAlso'].first == normalize_cc_url(hsh['rights'])
      end

      if license
        { 'id' => license['licenseId'],
          'url' => license['seeAlso'].first }.compact
      else
        { 'id' => hsh['rightsIdentifier'].present? ? hsh['rightsIdentifier'].downcase : nil,
          'url' => hsh['rightsURI'] || hsh['rightsUri'] }.compact
      end
    end

    def spdx_to_hsh(hsh)
      return nil unless hsh.present? && hsh.is_a?(Hash)

      spdx = JSON.load(File.read(File.expand_path('../../resources/spdx/licenses.json',
                                                  __dir__))).fetch('licenses')
      license = spdx.find { |l| l['licenseId'].casecmp?(hsh['id']) }

      if license
        [{ 'rightsIdentifier' => license['licenseId'].downcase,
           'rightsUri' => license['seeAlso'].first,
           'rights' => license['name'],
           'rightsIdentifierScheme' => 'SPDX',
           'schemeUri' => 'https://spdx.org/licenses/' }.compact]
      else
        [{ 'rightsIdentifier' => hsh['id'],
           'rightsURI' => hsh['url'] }.compact]
      end
    end
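    # Illustrative example: a license name is resolved against the bundled SPDX table
    # (resources/spdx/licenses.json). Output shown assuming that table carries the usual
    # CC-BY-4.0 entry; unknown names fall back to { 'rights' => name }.
    #
    #   name_to_spdx('Creative Commons Attribution 4.0 International')
    #   # => { "id" => "CC-BY-4.0",
    #   #      "url" => "https://creativecommons.org/licenses/by/4.0/legalcode" }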
    def name_to_fos(name)
      # first find subject in Fields of Science (OECD)
      fos = JSON.load(File.read(File.expand_path('../../resources/oecd/fos-mappings.json',
                                                 __dir__))).fetch('fosFields')
      subject = fos.find { |l| l['fosLabel'] == name || 'FOS: ' + l['fosLabel'] == name }
      if subject
        return [{ 'subject' => sanitize(name).downcase },
                { 'subject' => 'FOS: ' + subject['fosLabel'],
                  'subjectScheme' => 'Fields of Science and Technology (FOS)',
                  'schemeUri' => 'http://www.oecd.org/science/inno/38235147.pdf' }]
      end

      # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification)
      # and map to Fields of Science. Add an extra entry for the latter
      fores = JSON.load(File.read(File.expand_path('../../resources/oecd/for-mappings.json', __dir__)))
      for_fields = fores.fetch('forFields')
      for_disciplines = fores.fetch('forDisciplines')

      subject = for_fields.find { |l| l['forLabel'] == name } ||
                for_disciplines.find { |l| l['forLabel'] == name }
      if subject
        [{ 'subject' => sanitize(name).downcase },
         { 'subject' => 'FOS: ' + subject['fosLabel'],
           'subjectScheme' => 'Fields of Science and Technology (FOS)',
           'schemeUri' => 'http://www.oecd.org/science/inno/38235147.pdf' }]
      else
        [{ 'subject' => sanitize(name).downcase }]
      end
    end

    def hsh_to_fos(hsh)
      # first find subject in Fields of Science (OECD)
      fos = JSON.load(File.read(File.expand_path('../../resources/oecd/fos-mappings.json',
                                                 __dir__))).fetch('fosFields')
      subject = fos.find do |l|
        l['fosLabel'] == hsh['__content__'] || 'FOS: ' + l['fosLabel'] == hsh['__content__'] ||
          l['fosLabel'] == hsh['subject']
      end
      if subject
        return [{ 'subject' => sanitize(hsh['__content__'] || hsh['subject']),
                  'subjectScheme' => hsh['subjectScheme'],
                  'schemeUri' => hsh['schemeURI'] || hsh['schemeUri'],
                  'valueUri' => hsh['valueURI'] || hsh['valueUri'],
                  'classificationCode' => hsh['classificationCode'],
                  'lang' => hsh['lang'] }.compact,
                { 'subject' => 'FOS: ' + subject['fosLabel'],
                  'subjectScheme' => 'Fields of Science and Technology (FOS)',
                  'schemeUri' => 'http://www.oecd.org/science/inno/38235147.pdf' }.compact]
      end

      # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification)
      # and map to Fields of Science. Add an extra entry for the latter
      fores = JSON.load(File.read(File.expand_path('../../resources/oecd/for-mappings.json', __dir__)))
      for_fields = fores.fetch('forFields')
      for_disciplines = fores.fetch('forDisciplines')

      # try to extract forId
      if hsh['subjectScheme'] == 'FOR'
        for_id = hsh['__content__'].to_s.split(' ').first || hsh['subject'].to_s.split(' ').first
        for_id = for_id.rjust(6, '0')

        subject = for_fields.find { |l| l['forId'] == for_id } ||
                  for_disciplines.find { |l| l['forId'] == for_id[0..3] }
      else
        subject = for_fields.find do |l|
          l['forLabel'] == hsh['__content__'] || l['forLabel'] == hsh['subject']
        end || for_disciplines.find do |l|
          l['forLabel'] == hsh['__content__'] || l['forLabel'] == hsh['subject']
        end
      end

      if subject
        [{ 'subject' => sanitize(hsh['__content__'] || hsh['subject']),
           'subjectScheme' => hsh['subjectScheme'],
           'classificationCode' => hsh['classificationCode'],
           'schemeUri' => hsh['schemeURI'] || hsh['schemeUri'],
           'valueUri' => hsh['valueURI'] || hsh['valueUri'],
           'lang' => hsh['lang'] }.compact,
         { 'subject' => 'FOS: ' + subject['fosLabel'],
           'subjectScheme' => 'Fields of Science and Technology (FOS)',
           'schemeUri' => 'http://www.oecd.org/science/inno/38235147.pdf' }]
      else
        [{ 'subject' => sanitize(hsh['__content__'] || hsh['subject']),
           'subjectScheme' => hsh['subjectScheme'],
           'classificationCode' => hsh['classificationCode'],
           'schemeUri' => hsh['schemeURI'] || hsh['schemeUri'],
           'valueUri' => hsh['valueURI'] || hsh['valueUri'],
           'lang' => hsh['lang'] }.compact]
      end
    end

    def encode_doi(prefix)
      random_int = SecureRandom.random_number(2**63..(2**64) - 1)
      suffix = Base32::URL.encode(random_int)
      str = "#{suffix[0, 7]}-#{suffix[6, 7]}"

      "https://doi.org/#{prefix}/#{str}"
    end

    def decode_doi(doi)
      suffix = doi.split('/', 5).last
      Base32::URL.decode(suffix)
    end
  end
end
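# Minimal usage sketch (hypothetical class name): the helpers above are instance methods,
# so readers and writers typically mix them in via `include`.
#
#   class MyReader
#     include Commonmeta::Utils
#
#     def license_url(url)
#       normalize_cc_url(url)
#     end
#   end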