# Fast, simple pseudonymisation of prescription data with a very controlled
# format.
# Only the first 2 fields are potentially identifiable: nhs number and date of
# birth.

require 'ndr_pseudonymise/simple_pseudonymisation'
require 'ndr_pseudonymise/pseudonymisation_specification'
require 'json'

module NdrPseudonymise
  # Pseudonymise prescription data.
  #
  # Each input row is expected to carry the NHS number in column 0 and the
  # date of birth in column 1; every later column is passed through untouched.
  class PrescriptionPseudonymiser < PseudonymisationSpecification
    PREAMBLE_V2_DEMOG_ONLY = 'Pseudonymised matching data v2.0-demog-only'.freeze

    # Once the key cache grows beyond this many entries it is emptied, which
    # keeps memory use bounded on large imports.
    MAX_KEY_CACHE_SIZE = 10_000

    # Set up the pseudonymiser, forwarding both arguments to the superclass.
    #
    # Raises a RuntimeError unless the specification declares the demographics
    # columns to be exactly [0, 1] (nhsnumber, birthdate).
    # NOTE(review): @format_spec is presumably assigned by the superclass
    # initializer -- confirm against PseudonymisationSpecification.
    def initialize(format_spec, key_bundle)
      super
      return if @format_spec[:demographics] == [0, 1]

      raise 'Invalid specification: expected nhsnumber and birthdate in first 2 columns'
    end

    # Validate a row of prescription data.
    #
    # Returns nil if the row is a valid data row; otherwise raises a
    # RuntimeError describing the first problem found:
    # * 'Invalid NHS number' - not a String, or not empty / exactly 10 digits
    # * 'Missing NHS number' - an empty NHS number
    # * 'Invalid birthdate'  - not a String, or not empty / YYYY-MM-DD
    def row_errors2(row)
      # Not significantly faster than optimised general #row_errors method
      nhsnumber, birthdate = row[0..1]
      # \z (not \Z) anchors at the true end of the string, so a value with a
      # trailing newline is rejected rather than slipping through validation.
      unless nhsnumber.is_a?(String) && nhsnumber =~ /\A([0-9]{10})?\z/
        raise 'Invalid NHS number'
      end
      raise 'Missing NHS number' if nhsnumber.size < 10
      unless birthdate.is_a?(String) &&
             birthdate =~ /\A([0-9]{4}-[0-9]{2}-[0-9]{2}|)\z/
        raise 'Invalid birthdate'
      end

      nil
    end

    # Pseudonymise a row of prescription data, returning an array of a single row:
    # [[packed_pseudoid_and_demographics, clinical_data1, ...]]
    # Where packed_pseudoid_and_demographics consists of
    # "pseudo_id1 (key_bundle) encrypted_demographics"
    def pseudonymise_row(row)
      @key_cache ||= {} # Cache pseudonymisation keys for more compact import
      all_demographics = { 'nhsnumber' => row[0], 'birthdate' => row[1] }
      # The JSON form doubles as the cache key and as the payload to encrypt.
      demographics_json = all_demographics.to_json
      if @key_cache.key?(demographics_json)
        pseudo_id1, key_bundle, demog_key = @key_cache[demographics_json]
      else
        # NOTE(review): @salt1 / @salt2 are presumably set up by the
        # superclass from the key bundle -- confirm.
        pseudo_id1, key_bundle, demog_key =
          NdrPseudonymise::SimplePseudonymisation.
          generate_keys_nhsnumber_demog_only(@salt1, @salt2, row[0])
        # Only rows with both demographics present are cached.
        # (Append `&& false` to this condition to stop caching.)
        if !row[0].to_s.empty? && !row[1].to_s.empty?
          @key_cache = {} if @key_cache.size > MAX_KEY_CACHE_SIZE # Limit cache size
          @key_cache[demographics_json] = [pseudo_id1, key_bundle, demog_key]
        end
      end
      encrypted_demographics = NdrPseudonymise::SimplePseudonymisation.
                               encrypt_data64(demog_key, demographics_json)
      packed_pseudoid_and_demographics = format('%s (%s) %s', pseudo_id1, key_bundle,
                                                encrypted_demographics)
      [[packed_pseudoid_and_demographics] + row[2..-1]]
    end

    # Header row for CSV data
    def csv_header_row
      [PREAMBLE_V2_DEMOG_ONLY]
    end

    # Append the output of pseudonymise_row to a CSV file.
    # Only the first (and only) row of pseudonymised_row is written.
    def emit_csv_rows(out_csv, pseudonymised_row)
      out_csv << pseudonymised_row[0]
    end
  end
end