# frozen_string_literal: true
# (c) Copyright 2020 Ribose Inc.
#
# rubocop:todo Style/RedundantRegexpEscape
require "English"
module Iev
# Parses information from the spreadsheet's SOURCE column.
#
# @example
# SourceParser.new(cell_data_string).parsed_sources
class SourceParser
include Cli::Ui
include Utilities
using DataConversions
attr_reader :src_split, :parsed_sources, :raw_str, :src_str
def initialize(source_str, term_domain)
@raw_str = source_str.dup.freeze
@src_str = raw_str.decode_html.sanitize.freeze
@term_domain = term_domain
parse
end
private
def parse
@src_split = split_source_field(src_str)
@parsed_sources = src_split.map { |src| extract_single_source(src) }
end
def split_source_field(source)
# TODO: Calling String#gsub with a single hash argument would be probably
# better than calling that method multiple times. But change is
# not necessarily that easy to do.
# IEC 62047-22:2014, 3.1.1, modified – In the definition, ...
source = source
.gsub(/;\s?([A-Z][A-Z])/, ';; \1')
.gsub(/MOD[,\.]/, "MOD;;")
# 702-01-02 MOD,ITU-R Rec. 431 MOD
# 161-06-01 MOD. ITU RR 139 MOD
source = source
.gsub(/MOD,\s*([UIC\d])/, 'MOD;; \1')
.gsub(/MOD[,\.]/, "MOD;;")
# 702-09-44 MOD, 723-07-47, voir 723-10-91
source = source
.gsub(/MOD,\s*(\d{3})/, 'MOD;; \1')
.gsub(/,\s*see\s*(\d{3})/, ';;see \1')
.gsub(/,\s*voir\s*(\d{3})/, ';;voir \1')
# IEC 62303:2008, 3.1, modified and IEC 62302:2007, 3.2; IAEA 4
# CEI 62303:2008, 3.1, modifiée et CEI 62302:2007, 3.2; AIEA 4
source = source
.gsub(/modified and ([ISOECUT])/, 'modified;; \1')
.gsub(/modifiée et ([ISOECUT])/, 'modifiée;; \1')
# 725-12-50, ITU RR 11
source = source.gsub(/,\s+ITU/, ";; ITU")
# 705-02-01, 702-02-07
source = source.gsub(
/(\d{2,3}-\d{2,3}-\d{2,3}),\s*(\d{2,3}-\d{2,3}-\d{2,3})/, '\1;; \2'
)
source.split(";;").map(&:strip)
end
def extract_single_source(raw_ref)
relation_type = extract_source_relationship(raw_ref)
clean_ref = normalize_ref_string(raw_ref)
source_ref = extract_source_ref(clean_ref)
clause = extract_source_clause(clean_ref)
{
"ref" => source_ref,
"clause" => clause,
"link" => obtain_source_link(source_ref),
"relationship" => relation_type,
"original" => Iev::Converter.mathml_to_asciimath(
parse_anchor_tag(raw_ref, @term_domain),
),
}.compact
rescue ::RelatonBib::RequestError => e
warn e.message
end
def normalize_ref_string(str)
# définition 3.60 de la 62127-1
# definition 3.60 of 62127-1
# définition 3.60 de la 62127-1
# definition 3.7 of IEC 62127-1 MOD, adapted from 4.2.9 of IEC 61828 and 3.6 of IEC 61102
# définition 3.7 de la CEI 62127-1 MOD, adaptées sur la base du 4.2.9 de la CEI 61828 et du 3.6 de la CEI 61102
# definition 3.54 of 62127-1 MOD
# définition 3.54 de la CEI 62127-1 MOD
# IEC 62313:2009, 3.6, modified
# IEC 62313:2009, 3.6, modifié
str
.gsub(/CEI/, "IEC")
.gsub(/Guide IEC/, "IEC Guide")
.gsub(%r{Guide ISO/IEC}, "ISO/IEC Guide")
.gsub(/VEI/, "IEV")
.gsub(/UIT/, "ITU")
.gsub(/IUT-R/, "ITU-R")
.gsub(/UTI-R/, "ITU-R")
.gsub(/Recomm[ea]ndation ITU-T/, "ITU-T Recommendation")
.gsub(/ITU-T (\w.\d{3}):(\d{4})/, 'ITU-T Recommendation \1 (\2)')
.gsub(/ITU-R Rec. (\d+)/, 'ITU-R Recommendation \1')
.gsub(/[≈≠]\s+/, "")
.sub(/ИЗМ\Z/, "MOD")
.sub(/definition ([\d\.]+) of ([\d\-\:]+) MOD/, 'IEC \2, \1, modified - ')
.sub(/definition ([\d\.]+) of IEC ([\d\-\:]+) MOD/, 'IEC \2, \1, modified - ')
.sub(/définition ([\d\.]+) de la ([\d\-\:]+) MOD/, 'IEC \2, \1, modified - ')
.sub(/définition ([\d\.]+) de la IEC ([\d\-\:]+) MOD/, 'IEC \2, \1, modified - ')
.sub(/(\d{3})\ (\d{2})\ (\d{2})/, '\1-\2-\3') # for 221 04 03
# .sub(/\A(from|d'après|voir la|see|See|voir|Voir)\s+/, "")
end
def extract_source_ref(str)
match_source_ref_string(str)
.sub(/, modifi(ed|é)\Z/, "")
.strip
end
def match_source_ref_string(str)
case str
when /SI Brochure/, /Brochure sur le SI/
# SI Brochure, 9th edition, 2019, 2.3.1
# SI Brochure, 9th edition, 2019, Appendix 1
# Brochure sur le SI, 9e édition, 2019, Annexe 1
"BBIPM SI Brochure TEMP DISABLED DUE TO RELATON"
when /VIM/
"JCGM VIM"
# IEC 60050-121, 151-12-05
when /IEC 60050-(\d+), (\d{2,3}-\d{2,3}-\d{2,3})/
"IEC 60050-#{::Regexp.last_match(1)}"
when /IEC 60050-(\d+):(\d+), (\d{2,3}-\d{2,3}-\d{2,3})/
"IEC 60050-#{::Regexp.last_match(1)}:#{::Regexp.last_match(2)}"
when /(AIEA|IAEA) (\d+)/
"IAEA #{::Regexp.last_match(2)}"
when /IEC\sIEEE ([\d\:\-]+)/
"IEC/IEEE #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when /CISPR ([\d\:\-]+)/
"IEC CISPR #{::Regexp.last_match(1)}"
when /RR (\d+)/
"ITU-R RR"
# IEC 50(845)
when /IEC (\d+)\((\d+)\)/
"IEC 600#{::Regexp.last_match(1)}-#{::Regexp.last_match(1)}"
when %r{(ISO|IEC)[/\ ](PAS|TR|TS) ([\d\:\-]+)}
"#{::Regexp.last_match(1)}/#{::Regexp.last_match(2)} #{::Regexp.last_match(3)}".sub(
/:\Z/, ""
)
when %r{ISO/IEC ([\d\:\-]+)}
"ISO/IEC #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when %r{ISO/IEC/IEEE ([\d\:\-]+)}
"ISO/IEC/IEEE #{::Regexp.last_match(1)}".sub(/:\Z/, "")
# ISO 140/4
when %r{ISO (\d+)/(\d+)}
"ISO #{::Regexp.last_match(1)}-#{::Regexp.last_match(2)}"
when /Norme ISO (\d+)-(\d+)/
"ISO #{::Regexp.last_match(1)}:#{::Regexp.last_match(2)}"
when %r{ISO/IEC Guide ([\d\:\-]+)}i
"ISO/IEC Guide #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when /(ISO|IEC) Guide ([\d\:\-]+)/i
"#{::Regexp.last_match(1)} Guide #{::Regexp.last_match(2)}".sub(/:\Z/,
"")
# ITU-T Recommendation F.791 (11/2015)
when %r{ITU-T Recommendation (\w.\d+) \((\d+/\d+)\)}i
"ITU-T Recommendation #{::Regexp.last_match(1)} (#{::Regexp.last_match(2)})"
# ITU-T Recommendation F.791:2015
when /ITU-T Recommendation (\w.\d+):(\d+)/i
"ITU-T Recommendation #{::Regexp.last_match(1)} (#{::Regexp.last_match(2)})"
when /ITU-T Recommendation (\w\.\d+)/i
"ITU-T Recommendation #{::Regexp.last_match(1)}"
# ITU-R Recommendation 592 MOD
when /ITU-R Recommendation (\d+)/i
"ITU-R Recommendation #{::Regexp.last_match(1)}"
# ISO 669: 2000 3.1.16
when /ISO ([\d\-]+:\s?\d{4})/
"ISO #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when /ISO ([\d\:\-]+)/
"ISO #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when /IEC ([\d\:\-]+)/
"IEC #{::Regexp.last_match(1)}".sub(/:\Z/, "")
when /definition (\d\.[\d\.]+) of ([\d\-]*)/,
/définition (\d\.[\d\.]+) de la ([\d\-]*)/
"IEC #{::Regexp.last_match(2)}".sub(/:\Z/, "")
when /IEV (\d{2,3}-\d{2,3}-\d{2,3})/, /(\d{2,3}-\d{2,3}-\d{2,3})/
"IEV"
when /IEV part\s+(\d+)/, /partie\s+(\d+)\s+de l'IEV/
"IEC 60050-#{::Regexp.last_match(1)}"
when /International Telecommunication Union (ITU) Constitution/,
/Constitution de l’Union internationale des télécommunications (UIT)/
"International Telecommunication Union (ITU) Constitution (Ed. 2015)"
else
debug :sources, "Failed to parse source: '#{str}'"
str
end
end
def extract_source_clause(str)
# Strip out the modifications
str = str.sub(/[,\ ]*modif.+\s[-–].*\Z/, "")
# Strip these:
# see figure 466-6
# voir fig. 4.9
str = str.gsub(/\A(see|voir) fig. [\d\.]+/, "")
str = str.gsub(/\A(see|voir) figure [\d\.]+/, "")
# str = 'ITU-T Recommendation F.791:2015, 3.14,'
results = [
[/RR (\d+)/, "1"],
[/VIM (.+)/, "1"],
[/item (\d\.[\d\.]+)/, "1"],
[/d[eé]finition (\d[\d\.]+)/, "1"],
[/figure ([\d\.\-]+)/, "figure 1"],
[/fig\. ([\d\.\-]+)/, "figure 1"],
[/IEV (\d{2,3}-\d{2,3}-\d{2,3})/, "1"],
[/(\d{2,3}-\d{2,3}-\d{2,3})/, "1"],
# 221 04 03
[/(\d{3}\ \d{2}\ \d{2})/, "1"],
# ", 1.1"
# "SI Brochure, 9th edition, 2019, 2.3.1,"
[/,\s?(\d+\.[\d\.]+)/, "1"],
# SI Brochure, 9th edition, 2019, Appendix 1, modified
# Brochure sur le SI, 9e édition, 2019, Annexe 1,
[/\d{4}, (Appendix \d)/, "1"],
[/\d{4}, (Annexe \d)/, "1"],
# International Telecommunication Union (ITU) Constitution (Ed. 2015), No. 1012 of the Annex,
# Constitution de l’Union internationale des télécommunications (UIT) (Ed. 2015), N° 1012 de l’Annexe,
[/, (No. \d{4} of the Annex)/, "1"],
[/, (N° \d{4} 1012 de l’Annexe)/, "1"],
# ISO/IEC 2382:2015 (https://www.iso.org/obp/ui/#iso:std:iso-iec:2382:ed-1:v1:en), 2126371
[/\), (\d{7}),/, "1"],
# " 1.1 "
[/\s(\d+\.[\d\.]+)\s?/, "1"],
# "ISO/IEC Guide 2 (14.1)"
[/\((\d+\.[\d\.]+)\)/, "1"],
# "ISO/IEC Guide 2 (14.5 MOD)"
[/\((\d+\.[\d\.]+)\ MOD\)/, "1"],
# ISO 80000-10:2009, item 10-2.b,
# ISO 80000-10:2009, point 10-2.b,
[/\AISO 80000-10:2009, (item [\d\.\-]+\w?)/, "1"],
[/\AISO 80000-10:2009, (point [\d\.\-]+\w?)/, "1"],
# IEC 80000-13:2008, 13-9,
[/\AIEC 80000-13:2008, ([\d\.\-]+\w?),/, "1"],
[/\AIEC 80000-13:2008, ([\d\.\-]+\w?)\Z/, "1"],
# ISO 921:1997, definition 6,
# ISO 921:1997, définition 6,
[/\AISO [\d:]+, (d[ée]finition \d+)/, "1"],
# "ISO/IEC/IEEE 24765:2010, Systems and software engineering – Vocabulary, 3.234 (2)
[/, ([\d\.\w]+ \(\d+\))/, "1"],
].map do |regex, _rule|
# TODO: Rubocop complains about unused rule -- need to make sure
# that no one forgot about something.
res = []
# puts "str is '#{str}'"
# puts "regex is '#{regex.to_s}'"
str.scan(regex).each do |result|
# puts "result is #{result.first}"
res << {
index: $LAST_MATCH_INFO.offset(0)[0],
clause: result.first.strip,
}
end
res
# sort by index and also the length of match
end.flatten.sort_by { |hash| [hash[:index], -hash[:clause].length] }
# pp results
results.dig(0, :clause)
end
def extract_source_relationship(str)
type = case str
when /≠/
:not_equal
when /≈/
:similar
when /^([Ss]ee)|([Vv]oir)/
:related
when /MOD/, /ИЗМ/
:modified
when /modified/, /modifié/
:modified
when /^(from|d'après)/,
/^(definition (.+) of)|(définition (.+) de la)/
:identical
else
:identical
end
case str
when /^MOD ([\d\-])/
{
"type" => type.to_s,
}
when /(modified|modifié|modifiée|modifiés|MOD)\s*[–-]?\s+(.+)\Z/
{
"type" => type.to_s,
"modification" => Iev::Converter.mathml_to_asciimath(
parse_anchor_tag(::Regexp.last_match(2), @term_domain),
).strip,
}
else
{
"type" => type.to_s,
}
end
end
# Uses Relaton to obtain link for given source ref.
def obtain_source_link(ref)
RelatonDb.instance.fetch(ref)&.url
end
end
end
# rubocop:enable Style/RedundantRegexpEscape