lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.2 vs lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.3
- old
+ new
@@ -1,6 +1,48 @@
class TSV
+ # Collapses rows that share a key (their first field) into a single row and
+ # writes the result to +output+.
+ #
+ # input  - the path of an existing file, a String/StringIO holding the rows,
+ #          or any other object, which is then read directly through +eof?+
+ #          and +gets+. Paths and in-memory content are first piped through
+ #          `sort -k1,1 | grep -v '^<sep>'`, i.e. sorted on the key with rows
+ #          that start with the separator dropped.
+ # output - destination handed to Open.write.
+ # sep    - field separator (defaults to a tab).
+ #
+ # Only adjacent rows with equal keys are merged, so a stream passed in
+ # directly is presumably expected to be sorted by key already -- TODO confirm
+ # against callers. Values found at the same position are joined with "|".
+ def self.merge_rows(input, output, sep = "\t")
+   is = case
+        when (String === input and not input.index("\n") and input.length < 250 and File.exist?(input))
+          # NOTE(review): +input+ and +sep+ are interpolated into the shell
+          # command unescaped; only safe for trusted paths and separators.
+          CMD.cmd("sort -k1,1 -t'#{sep}' #{ input } | grep -v '^#{sep}' ", :pipe => true)
+        when (String === input or StringIO === input)
+          CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => input, :pipe => true)
+        else
+          input
+        end
+
+   current_key = nil
+   current_parts = []
+
+   Open.write(output) do |os|
+     until is.eof?
+       key, *parts = is.gets.sub("\n",'').split(sep, -1)
+
+       # An empty line splits into no fields at all, so it carries no key.
+       next if key.nil?
+
+       current_key ||= key
+       if current_key == key
+         parts.each_with_index do |part, i|
+           if current_parts[i].nil?
+             current_parts[i] = part
+           else
+             current_parts[i] = current_parts[i] << "|" << part
+           end
+         end
+       else
+         os.puts [current_key, current_parts].flatten * sep
+         current_key = key
+         current_parts = parts
+       end
+     end
+
+     # A group is only written when the following key differs, so the last
+     # group has to be flushed explicitly once the input is exhausted.
+     os.puts [current_key, current_parts].flatten * sep unless current_key.nil?
+   end
+ end
+
def self.paste_merge(file1, file2, output, sep = "\t")
case
when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1))
file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true)
when (String === file1 or StringIO === file1)
@@ -184,11 +226,11 @@
through do |key, values|
if other.include? key
new_values = other[key].values_at *fields
new_values.collect!{|v| [v]} if type == :double and not other.type == :double
- new_values.collect!{|v| v.first} if not type == :double and other.type == :double
+ new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double
self[key] = self[key].concat new_values
else
if type == :double
self[key] = self[key].concat [[]] * fields.length
else
@@ -221,12 +263,12 @@
else
other[source_key][pos]
end
end
- new_values.collect!{|v| [v]} if type == :double and not other.type == :double
- new_values.collect!{|v| v.first} if not type == :double and other.type == :double
+ new_values.collect!{|v| [v]} if type == :double and not other.type == :double
+ new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double
all_new_values << new_values
end
end
if all_new_values.empty?
@@ -272,11 +314,11 @@
else
other[source_key][pos]
end
end
new_values.collect!{|v| [v]} if type == :double and not other.type == :double
- new_values.collect!{|v| v.first} if not type == :double and other.type == :double
+ new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double
all_new_values << new_values
end
end
if all_new_values.empty?
@@ -328,39 +370,55 @@
id_list.zip(files[0..-1])
end
end
+ # Builds one index by chaining the files along the path returned by
+ # find_path(files, options). Returns nil when no path is found; otherwise
+ # returns the chained index, which is still nil when the path has a single
+ # step whose key already is that file's key field.
def self.build_traverse_index(files, options = {})
- options = Misc.add_defaults options, :in_namespace => false, :persist_input => false
- in_namespace = options[:in_namespace]
+ options = Misc.add_defaults options, :in_namespace => false, :persist_input => false
+ in_namespace = options[:in_namespace]
persist_input = options[:persist_input]
path = find_path(files, options)
return nil if path.nil?
traversal_ids = path.collect{|p| p.first}
-
+
Log.medium "Found Traversal: #{traversal_ids * " => "}"
+
+ data_key, data_file = path.shift
+ if data_key == data_file.key_field
+ Log.debug "Data index not required '#{data_file.key_field}' => '#{data_key}'"
+ data_index = nil
+ else
+ Log.debug "Data index required"
+ data_index = data_file.index :target => data_key, :fields => data_file.key_field, :persistence => false
+ end
- current_id, current_file = path.shift
- current_key = current_file.all_fields.first
-
- index = current_file.index :target => current_id, :fields => current_key, :persistence => persist_input
-
+ current_index = data_index
+ current_key = data_key
while not path.empty?
- current_id, current_file = path.shift
- current_index = current_file.index :target => current_id, :fields => index.fields.first, :persistence => true
- index.process 0 do |value|
- current_index.values_at(*value).flatten.uniq
+ next_key, next_file = path.shift
+
+ if current_index.nil?
+ current_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input
+ else
+ next_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input
+ # The parameter list used to name +values+ twice, which is a SyntaxError
+ # on Ruby >= 1.9. On 1.8 the last binding won, so the middle parameter is
+ # renamed and +values+ keeps the third yielded argument; arity stays 3.
+ current_index.process current_index.fields.first do |key, _shadowed, values|
+ if values.nil?
+ nil
+ else
+ # A block-less +collect+ was chained here; on Rubies where that returns
+ # an Enumerator the index would store it instead of the flattened Array.
+ next_index.values_at(*values).flatten
+ end
+ end
+ current_index.fields = [next_key]
end
- index.fields = current_index.fields
end
- index
+ current_index
end
+
def self.find_traversal(tsv1, tsv2, options = {})
options = Misc.add_defaults options, :in_namespace => false
in_namespace = options[:in_namespace]
identifiers1 = tsv1.identifier_files || []
@@ -386,17 +444,26 @@
def attach(other, fields = nil, options = {})
options = Misc.add_defaults options, :in_namespace => false
in_namespace = options[:in_namespace]
fields = other.fields - [key_field].concat(self.fields) if fields == :all
- fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil?
+ if in_namespace
+ fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil?
+ else
+ fields = other.fields - [key_field].concat(self.fields) if fields.nil?
+ end
+
Log.high("Attaching fields:#{fields.inspect} from #{other.filename.inspect}.")
+
+ other = other.tsv(:persistence => options[:persist_input] == true) unless TSV === other
case
when key_field == other.key_field
attach_same_key other, fields
when (not in_namespace and self.fields.include?(other.key_field))
+ Log.medium "Found other's key field: #{other.key_field}"
attach_source_key other, other.key_field, fields
when (in_namespace and self.fields_in_namespace.include?(other.key_field))
+ Log.medium "Found other's key field in #{in_namespace}: #{other.key_field}"
attach_source_key other, other.key_field, fields
else
index = TSV.find_traversal(self, other, options)
raise "Cannot traverse identifiers" if index.nil?
attach_index other, index, fields