# # colspecs.rb # # Specification of the column names that have predefined meaning # when encountered in a input file. # require 'set' require 'yaml' require 'nodepile/keyed_array.rb' require 'nodepile/base_structs.rb' module Nodepile # This class provides information about the valid columns for potential use # in documentation and also provides facilities for doing per-line verification # of column values within a single line. # that can appear on a non-header line of an input file. # Note that the best way to think of this class is as a scanner which is in some sense # stateless # # Records generated by the #parse method and related methods will by default # set metadata fields, particularly including: # '@type' = :node, :edge, :rule, :pragma # '@key' = String or [String,String] for node or edge respectively class InputColumnSpecs class InvalidRecordError < StandardError attr_accessor :rec_num,:file_path # use to add error detail def initialize(msg) = @msg = msg def message prefix = "Nodepile parsing error at record [#{self.rec_num||'?'}] from source [#{self.file_path||'?'}]: " return (!self.rec_num.nil? || !self.file_path.nil?) ? (prefix + @msg) : @msg end end # parsing errors throw this DEFAULT_ID_DELIMITER = ',' # may be used in _link_from and _link_to for multiple edges DEFAULT_PRAGMA_MARKER = "#pragma " public # Provide a simple hash of field names and their meaning/use. def self.coldefs @@class_mcache.cache(__method__){|| h = YAML.load(defined?(DATA) ? DATA.read : /__END__\s+(.*)/m.match(File.read(__FILE__))[1] )['data']['fields'] h # this value is cached } end def self.val_is_pattern?(s) s[0] == '/' ? :pattern : nil end # List the most crucial columns that indicate the existence of # nodes, edges, and styling instructions. def self.id_cols; %w(_id _links_from _links_to); end #do not reorder def self.all_cols; coldefs().keys; end # Defines the characters that will be interpreted as delimiting entity "id" # values. 
attr_accessor :id_delimiter # Creates a customized InputColumnSpecs object based on the column names and order # that are included in one specific file. That object can then be used # ONLY to validate that specific file. See the #coldefs # @param col_names[Array] Order of column data expected from calls to # #validate # @param id_delimiter[String] Indicates a character that will be considered # a delimiter between ids so that multiple may occupy the # field # @param pragmas[String,nil] If nil, "pragmas" are not identified. # If true, then when the _id field is started with the # "#pragma", it is identified as a pragma and made available # through the #each_pragma method. If a string, then # any record whose _id column starts with that string is # considered a pragma. Note that ONLY the _id column of # a pragma record is captured. # @param metadata_key_prefix [String,nil] During #parse and related methods # records are yielded in the form of KeyedArrayAccessor objects # that have both the loaded data and also metadata about the # records such as the type of the entity and whether its # existence was triggered explicitly or implicitly. This # value is is passed to the KeyedArrayAccessor. # @raise InvalidRecordError def initialize(col_names,id_delimiter: DEFAULT_ID_DELIMITER,pragmas: DEFAULT_PRAGMA_MARKER, metadata_key_prefix: '@') @col_names = col_names.dup.freeze @id_cols_indices = self.class.id_cols.map{|cnm| @col_names.find_index(cnm)}.freeze @id_delimiter = id_delimiter @pragma_marker = pragmas @empty_kv_array = KeyedArrayAccessor.new(@col_names,Array.new(@col_names.length).freeze) raise InvalidRecordError.new(<<~ERRMSG) if @id_cols_indices[0].nil? 
A valid record set must contain an '_id' column ERRMSG @metadata_key_prefix = metadata_key_prefix @md_pfxs = [(@metadata_key_prefix||'')+'type', (@metadata_key_prefix||'')+'key', (@metadata_key_prefix||'')+'is_implied', ] @mc = CrudeCalculationCache.new end # Given a string representing the contents of the "_id", "_links_to", or "_links_from" field, # this method will split it into zero or more tokens representing either ids or # or else patterns. Patterns start with the question mark character. # Leading and trailing spaces are stripped before return. # @param id_containing_field [String] Any of the possible id containing fields # @return [Array] zero or more def split_ids(id_containing_field, &block) # very simple implementation (make smarter later???) return [] if id_containing_field.nil? return enum_for(:split_ids,id_containing_field) unless block_given? raise "A field containing a rule calculation may not contain other ids" if /,\s*\?/ =~ id_containing_field id_containing_field.split(@id_delimiter).tap{|a2| a2.each{|s| s.strip! yield s unless s == '' } } end # Given a single "record" (which may define zero or more entities or contain errors) # this method will yield once for each "entity" or "rule" that may be inferred # by that record. The "entities" defined by a given record are determined by # three fields: _id, _links_from, and _links_to. 
# # The entries in these fields can indicate several things: # 1) The explicit existence and attribute values for a node # 2) Override values for a node or pattern of nodes # 3) The implicit existence of a node (because an explicit node links explicitly to/from it) # 4) The explicit existence of an edge (because an edge is explicitly in the to/from fields) # and attribute values for the edge # 5) The implicit existence of an edge (because an edge is implied by a rule in the to/from) # and attribute values for the edge # # Note that when metadata is attached to the KeyedArrayAccessors, it the metada will # be updated to include the following key-values. # * 'type' = :node, :edge, :rule, :pragma # * 'key' = either a single String of nodes/node-rules or an array of two strings for edges # and edge-rules # * 'is_implied' = true,nil to indicate whether the entity is implied # # @param col_value_array [Array] Column values in exact order of column names # provided when this object was constructed. # @return [Integer] Number of entities encountered. Note that zero is valid. # @param metadata [Hash,nil] If provided, the given metadata will be attached to each of the # KeyedArrayAccessors that are yielded along with metadata about # this particular entity. Note that the hash passed in will be altered # in two ways. Firstly, if a @metadata_key_prefix is specified, all keys # will be changed to include this prefix (if they aren't already). # Secondly, the three additional metadata key-values will be added # (type, key, is_implied). # @param metadata_key_prefix [String,nil] See KeyedArrayAccessor#initialize for detail. # If provided, this string will be foreced to appear at the beginning # of every metadata key. 
# @param source [String,nil,Object] see KeyedArrayAccessor#initialize for detail # @param ref_num [Integer,nil] see KeyedArrayAccessor#iniialize for detail # @raise [InvalidRecordError] If errors or omissions in data make it uninterpretable # @yieldparam [Nodepile::KeyedArrayAccessor] A single node, edge, or rule taken extracted # from the record. Note that the id, links_to, and links_from # fields may be altered in the return value. def parse(col_value_array,source: nil, ref_num: nil,metadata: nil,&entity_receiver) #see below in this file for the various preprocessing defined _preprocs.each{|(ix,preproc_block)| col_value_array[ix] = preproc_block.call(col_value_array[ix]) } _validators.each{|(vl_col_nums,val_block)| errmsg = val_block.call(*vl_col_nums.map{|i| i && col_value_array[i]}) # test the specified column values raise InvalidRecordError.new(errmsg) if errmsg } if metadata && (@metadata_key_prefix||'') != '' # if necessary, facilitate quick attachment of metadata to KeyedArrayAccessor metadata.transform_keys{|k| k.start_with?(@metadata_key_prefix) ? k : @metadata_key_prefix + k} end metadata ||= Hash.new # following proc is used to package up the return value at multiplel places below yieldval_bldr = Proc.new{|kaa,*three_md_fields| (0..(@md_pfxs.length-1)).each{|i| metadata[@md_pfxs[i]] = three_md_fields[i]} kaa.reset_metadata(metadata,metadata_key_prefix: @metadata_key_prefix) kaa } ids, links_from, links_to = @id_cols_indices.map{|i| i && col_value_array[i]} return 0 if ids&.start_with?('#') # ignore these records base_kva = KeyedArrayAccessor.new(@col_names, col_value_array, source: source, ref_num: ref_num) if @pragma_marker && ids&.start_with?(@pragma_marker) # pragmas get shortcut treatment, not keyed, ignore all other columns yield yieldval_bldr(base_kva,:pragma,nil,false) if block_given? return 1 # pragmas do not have links, or multiple ids end entity_count = 0 lf_list = split_ids(links_from).to_a lt_list = split_ids(links_to).to_a if !ids.nil? 
edge_list = Array.new else # for pure edges, add them to list for later yielding edge_list = lf_list.to_a.product(lt_list.to_a) .map{|(lf,lt)| kva = base_kva.dup kva['_links_from'] = lf kva['_links_to'] = lt [lf,lt,kva ] } end #detecting pure edges split_ids(ids).each{|id| kva = base_kva.dup.tap{|kva| kva['_id'] = id kva['_links_from'] = nil kva['_links_to'] = nil } entity_count += 1 yield yieldval_bldr.call(kva,id[0] == '?' ? :rule : :node,id.freeze,false) if block_given? # emit any implicitly existing nodes (lf_list + lt_list).each{|link| if !link.start_with?('?') entity_count += 1 # implied nodes have cleared value except their key kva = base_kva.dup.tap{|x| x['_id'] = link x['_links_from'] = nil x['_links_to'] = nil } yield yieldval_bldr.call(kva,:node,link.freeze,true) if block_given? end } # Flag edges the go from/to _id. Note, you can't define rules this way. (lf_list.product([id]) + [id].product(lt_list)).each{|a| next if a.any?{|v| v.start_with?('?')} # rules can't imply an edge kva = @empty_kv_array.dup kva['_links_from'] = a[0] kva['_links_to'] = a[1] kva.source = base_kva.source kva.ref_num = base_kva.ref_num edge_list << [a[0],a[1],kva] } } edge_list.each{|(n1,n2,kva)| entity_count += 1 et = (n1.start_with?('?') || n2.start_with?('?')) ? :rule : :edge yield yieldval_bldr.call(kva,et,[n1,n2].freeze,false) if block_given? } return entity_count end # Bulk parse is a convenience method for parsing a source of records. It is essentially # the same as instantiating an object using the first record and then calling parse multiple times # # For information on most of the parameters, see the #parse method # # @param rec_source [Enumerable>] first record is presumed to be # the header and all other lines will be forced into the #parse # method. # @return [Integer, Enumerator] If a block is passed in, returns the total of all # entities that were yielded from the source. Otherwise # returns an enumerator. 
# def self.bulk_parse(rec_source,source: nil,metadata: nil, metadata_key_prefix: nil, &entity_receiver) return enum_for(:bulk_parse,rec_source, source:, metadata:, metadata_key_prefix:) unless block_given? hdr_vals = rec_source.next specs = InputColumnSpecs.new(hdr_vals) rec_count = 0 begin loop do next_rec = rec_source.next rec_count += specs.parse(next_rec,source:, ref_num: rec_count+2,metadata:,&entity_receiver) end rescue StopIteration #no-op end return rec_count end # Utility class returned by the #make_pattern_match_verifier() method # # It holds tests that can be used to confirm whether a pattern matches # aspects of a given node. # # Example Pattern Strings: # 1) "?/^alpha/" matches type == :node where key starts with "alpha" # 2) "beta" mates type == :node where key is exactly "beta" # class PatternMatchVerifier ALWAYS_TRUE_PROC = Proc.new{true} def initialize(pattern_string) @non_id_test = ALWAYS_TRUE_PROC @id_test = nil @pattern_string = pattern_string case pattern_string when /^\s*\?\s*\/(.*)\/\s*$/ rx = Regexp.new($1) @id_test = Proc.new{|id| rx.match?(id)} else exact_id = pattern_string.strip # match with the exact (trimmed) string @id_test = Proc.new{|id| id == exact_id } end end #initialize def inspect = "#<#{self.class} 0x#{object_id} pattern_string=#{@pattern_string.inspect}> " # Exclusively test whether the given node id would be acceptable for this # verifier. # # @param test_id_string [String] def id_match?(test_id_string) = @id_test.call(test_id_string) # Exclusively test whether any of the non-id aspects of the node would be # acceptable for this verifier. 
# @param node_entity_packet [Nodepile::EntityPacket] def non_id_match?(node_entity_packet) = @non_id_test.call(node_entity_packet) # Perform both the id_match?() and return their logical AND def match?(nep) = id_match?(nep.key) && non_id_match?(nep) end #class PatternMatchVerifier # "Rule" type entities are characterized by having one or more "patterns" # that are used to determine which of the nodes a given rule should apply to. # Most often, the patterns specify sets of node IDs would satisfy them # such as through regular expression matching. However, future instances # may use field values to determine matching. # # For explanation of pattern logic see the PatternMatchVerifier class # def self.make_pattern_match_verifier(pattern_string) return PatternMatchVerifier.new(pattern_string) end private def _preprocs @mc.cache(__method__){|| # collect preproc relevant for the columns present my_preprocs = Array.new @col_names.each_with_index{|nm,ix| self.class._all_preprocs[nm]&.tap{|(skip_nil,block)| my_preprocs << [ix,block].freeze } } my_preprocs.freeze # will get cached } # end cache calculator end # _preprocs() def _validators @mc.cache(__method__){|| my_validators = Array.new # collect validators relevant for the columns present self.class._all_validators.each{|(always,vl_col_names,block)| vl_col_nums = vl_col_names.map{|nm| @col_names.find_index(nm) } if always || vl_col_nums.none?{|v| v.nil?} my_validators << [vl_col_nums.freeze,block].freeze end } my_validators.freeze # this should get cached } # end cache calculator end # A validator is a block used to verify the values in a specific # set of fields. The blocks registered here are compiled into calls # to InputColumnSpecs#new. A validator block should evaluate to nil # if everything is okay. If it evaluates to a string, that string may be # communicated to users as a validation failure. 
# # @param always [true,false,nil] Indicates that the validator should # be run regardless of whether all fields # are present. Nils will be passed # to the validator for missing fields. # @param col_name_array [Array] These fields must be passed # in this order to the block to # perform the validation. def self._make_validator(col_name_array,always: nil, &validator_block) [always,col_name_array.dup.freeze,validator_block].freeze end # Package up field preprocessing into a record for later use # @param skip_nil [Boolean] if true, does no preprocessing if the field value # is nil def self._make_field_preproc(col_name, skip_nil: true, &preproc_block) return [col_name,skip_nil ? Proc.new{|s| s && preproc_block.(s)} : preproc_block] end # Package up preprocs for a field using some standard rules. Multiple # rules may apply to the same field. # # * :strip will cause leading and trailing spaces to be removed and # blank fields will be set to nil # * :downcase will cause field contents to be downcased # @param col_name [String] name of the column the preproc applies to # @param std_syms [Array,nil] one or more symbols representing # the preprocs that should be combined. They area applied # in the specified order although is :strip is present it # must appear first. Method is a no-op of the std_syms is # nil def self._make_standard_preproc(col_name,std_syms) return nil if std_syms.nil? nproc = Proc.new{|s| std_syms.each{|instr| case instr when :downcase then s&.downcase! 
when :strip then s = nil if (s.strip!||s) == '' else raise "Unrecognized preproc found [#{proc_sym.inspect}]" end } #each instruction next s # "return value" } #nproc _make_field_preproc(col_name,skip_nil: true,&nproc) end @@class_mcache = CrudeCalculationCache.new def self._all_preprocs @@class_mcache.cache(__method__){|| h = Hash.new # append to this array # generate preprocs using the flags in the YAML at bottom of this file coldefs.each_pair{|fieldname,fielddata| h[fieldname] = _make_standard_preproc(fieldname,fielddata['preproc']&.map(&:to_sym)) } h.freeze # this Hash will get cached } # end cache calculator end # _all_preprocs() def self._all_validators @@class_mcache.cache(__method__){|| a = Array.new a << _make_validator(['_id','_links_from','_links_to'],always: true){|id,lf,lt| if id.nil? && (lf.nil? ^ lt.nil?) next 'If the _id field is blank, both _links_from and links_to fields must be blank or both populated' end if id&.start_with?('?') && (lf || lt) next 'If the _id field indicates a :rule, _links_from and _links_to must be blank' end if id && (lf&.start_with?('?') || lt&.start_with?('?')) next "If the _id field is populated, you may not put a rule formula in _links_from or _links_to" end next nil } a.freeze # this Array will get cached } # end cache calculator end # _all_validators() end # class InputColumnSpecs end # module Nodepile # Below are the column spec to be used for documentation and to some degree __END__ --- data: fields: _id: description: > Required column in any input file. Can be one of three value types. If it starts with a literal asterisk character or with a literal forward slash character, it indicates the line is a style instruction. If it is blank or whitespace, it indicates the line defines an edge or edge style instruction. Any other value indicates that this line defines a node and the value in this column is interpreted as a unique identifier (node_id) that can be used on other lines to reference this node. 
        Unless otherwise overridden, the _id is used to label the node.
        Note that your life may be happier if you forbid using commas as
        part of _id values although it is not forbidden.
      preproc:
        - strip
    _links_from:
      description: >
        Required if the _id field has been left blank. Specifies one or more
        node_id values separated by valid delimiter characters. If the first
        character is an asterisk or forward slash, it indicates that this is
        an edge styling instruction. Otherwise, this is used to indicate the
        existence of one or more edges originating from the specified node.
      preproc:
        - strip
    _links_to:
      description: Follows same protocol as _links_from.
      preproc:
        - strip
    _label:
      description: >
        Indicates a (typically short) label that should appear rather than
        value of the _id for nodes and edges. For nodes, see also _labelNN
        which allows specifying node labels in a line-by-line format.
      #preproc:  # no preproc for this one... deliberate blanks may be meaningful
    _labelNN:
      description: >
        If present and non-blank, this value supersedes any text in _label
        column. When a column with this pattern is specified it should
        replace _labelNN with an integer such as _label3 or _label22 to
        indicate that the provided text appears on line 3 or line 22
        respectively.
    _color:
      description: >
        For nodes, color is the border color of the shape. For edges, this
        is the actual edge color. There are a very wide variety of ways that
        color can be specified. Any format supported by the DOT language is
        permitted. The rock bottom simplest is to use the supported set of
        simple color words like red, blue, etc.
      dot_ref: https://graphviz.org/docs/attrs/color/
      preproc:
        - strip
        - downcase
    _fillcolor:
      description: >
        For nodes, fillcolor is the background color of the shape.
      dot_ref: https://graphviz.org/docs/attrs/fillcolor/
      preproc:
        - strip
        - downcase
    _fontcolor:
      description: >
        For many entities, defines the text color.
dot_ref: https://graphviz.org/docs/attrs/fontcolor/ preproc: - strip - downcase _shape: description: > For nodes, determines the shape of the node. Shape names tend to be either simple things like (box, plain, plaintext, circle, ellipse, etc.) or else it is a record type that is meant to render data in a structured layout. dot_ref: https://graphviz.org/docs/attr-types/shape/ preproc: - strip - downcase