lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.2 vs lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.3

- old
+ new

@@ -1,6 +1,48 @@ class TSV + def self.merge_rows(input, output, sep = "\t") + is = case + when (String === input and not input.index("\n") and input.length < 250 and File.exists?(input)) + CMD.cmd("sort -k1,1 -t'#{sep}' #{ input } | grep -v '^#{sep}' ", :pipe => true) + when (String === input or StringIO === input) + CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => input, :pipe => true) + else + input + end + + current_key = nil + current_parts = [] + + done = false + Open.write(output) do |os| + + done = is.eof? + while not done + key, *parts = is.gets.sub("\n",'').split(sep, -1) + current_key ||= key + case + when key.nil? + when current_key == key + parts.each_with_index do |part,i| + if current_parts[i].nil? + current_parts[i] = part + else + current_parts[i] = current_parts[i] << "|" << part + end + end + when current_key != key + os.puts [current_key, current_parts].flatten * sep + current_key = key + current_parts = parts + end + + done = is.eof? + end + + end + end + def self.paste_merge(file1, file2, output, sep = "\t") case when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1)) file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true) when (String === file1 or StringIO === file1) @@ -184,11 +226,11 @@ through do |key, values| if other.include? key new_values = other[key].values_at *fields new_values.collect!{|v| [v]} if type == :double and not other.type == :double - new_values.collect!{|v| v.first} if not type == :double and other.type == :double + new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double self[key] = self[key].concat new_values else if type == :double self[key] = self[key].concat [[]] * fields.length else @@ -221,12 +263,12 @@ else other[source_key][pos] end end - new_values.collect!{|v| [v]} if type == :double and not other.type == :double - new_values.collect!{|v| v.first} if not type == :double and other.type == :double + new_values.collect!{|v| [v]} if type == :double and not other.type == :double + new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double all_new_values << new_values end end if all_new_values.empty? @@ -272,11 +314,11 @@ else other[source_key][pos] end end new_values.collect!{|v| [v]} if type == :double and not other.type == :double - new_values.collect!{|v| v.first} if not type == :double and other.type == :double + new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double all_new_values << new_values end end if all_new_values.empty? @@ -328,39 +370,55 @@ id_list.zip(files[0..-1]) end end def self.build_traverse_index(files, options = {}) - options = Misc.add_defaults options, :in_namespace => false, :persist_input => false - in_namespace = options[:in_namespace] + options = Misc.add_defaults options, :in_namespace => false, :persist_input => false + in_namespace = options[:in_namespace] persist_input = options[:persist_input] path = find_path(files, options) return nil if path.nil? traversal_ids = path.collect{|p| p.first} - + Log.medium "Found Traversal: #{traversal_ids * " => "}" + + data_key, data_file = path.shift + if data_key == data_file.key_field + Log.debug "Data index not required '#{data_file.key_field}' => '#{data_key}'" + data_index = nil + else + Log.debug "Data index required" + data_index = data_file.index :target => data_key, :fields => data_file.key_field, :persistence => false + end - current_id, current_file = path.shift - current_key = current_file.all_fields.first - - index = current_file.index :target => current_id, :fields => current_key, :persistence => persist_input - + current_index = data_index + current_key = data_key while not path.empty? - current_id, current_file = path.shift - current_index = current_file.index :target => current_id, :fields => index.fields.first, :persistence => true - index.process 0 do |value| - current_index.values_at(*value).flatten.uniq + next_key, next_file = path.shift + + if current_index.nil? + current_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input + else + next_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input + current_index.process current_index.fields.first do |key, values, values| + if values.nil? + nil + else + next_index.values_at(*values).flatten.collect + end + end + current_index.fields = [next_key] end - index.fields = current_index.fields end - index + current_index end + def self.find_traversal(tsv1, tsv2, options = {}) options = Misc.add_defaults options, :in_namespace => false in_namespace = options[:in_namespace] identifiers1 = tsv1.identifier_files || [] @@ -386,17 +444,26 @@ def attach(other, fields = nil, options = {}) options = Misc.add_defaults options, :in_namespace => false in_namespace = options[:in_namespace] fields = other.fields - [key_field].concat(self.fields) if fields == :all - fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil? + if in_namespace + fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil? + else + fields = other.fields - [key_field].concat(self.fields) if fields.nil? + end + Log.high("Attaching fields:#{fields.inspect} from #{other.filename.inspect}.") + + other = other.tsv(:persistence => options[:persist_input] == true) unless TSV === other case when key_field == other.key_field attach_same_key other, fields when (not in_namespace and self.fields.include?(other.key_field)) + Log.medium "Found other's key field: #{other.key_field}" attach_source_key other, other.key_field, fields when (in_namespace and self.fields_in_namespace.include?(other.key_field)) + Log.medium "Found other's key field in #{in_namespace}: #{other.key_field}" attach_source_key other, other.key_field, fields else index = TSV.find_traversal(self, other, options) raise "Cannot traverse identifiers" if index.nil? attach_index other, index, fields