lib/rbbt/util/tsv/attach.rb in rbbt-util-2.1.0 vs lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.2

- old
+ new

@@ -1,7 +1,184 @@ class TSV + def self.paste_merge(file1, file2, output, sep = "\t") + case + when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1)) + file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true) + when (String === file1 or StringIO === file1) + file1 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1, :pipe => true) + end + case + when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exists?(file2)) + file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } | grep -v '^#{sep}' ", :pipe => true) + when (String === file2 or StringIO === file2) + file2 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2, :pipe => true) + end + + output = File.open(output, 'w') if String === output + + cols1 = nil + cols2 = nil + + done1 = false + done2 = false + + key1 = key2 = nil + while key1.nil? + while (line1 = file1.gets) =~ /#/; end + key1, *parts1 = line1.sub("\n",'').split(sep, -1) + cols1 = parts1.length + end + + while key2.nil? + while (line2 = file2.gets) =~ /#/; end + key2, *parts2 = line2.sub("\n",'').split(sep, -1) + cols2 = parts2.length + end + + key = key1 < key2 ? key1 : key2 + parts = [""] * (cols1 + cols2) + while not (done1 and done2) + while (not done1 and key1 == key) + parts1.each_with_index do |part, i| + parts[i] = (parts[i].nil? or parts[i].empty?) ? part : parts[i] << "|" << part + end + key1 = nil + while key1.nil? and not done1 + if file1.eof?; done1 = true; else key1, *parts1 = file1.gets.sub("\n",'').split(sep, -1) end + end + end + while (not done2 and key2 == key) + parts2.each_with_index do |part, i| + i += cols1 + parts[i] = (parts[i].nil? or parts[i].empty?) ? part : parts[i] << "|" << part + end + key2 = nil + while key2.nil? and not done2 + if file2.eof?; done2 = true; else key2, *parts2 = file2.gets.sub("\n",'').split(sep, -1) end + end + end + + output.puts [key, parts].flatten * sep + parts = [""] * (cols1 + cols2) + + case + when done1 + key = key2 + when done2 + key = key1 + else + key = key1 < key2 ? key1 : key2 + end + end + + output.close + end + + def self.paste(file1, file2, output, sep = "\t") + case + when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1)) + file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } ", :pipe => true) + when String === file1 + file1 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file1, :pipe => true) + end + + case + when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exists?(file2)) + file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } ", :pipe => true) + when String === file2 + file2 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file2, :pipe => true) + end + + output = File.open(output, 'w') if String === output + + cols1 = nil + cols2 = nil + + done1 = false + done2 = false + + while (line1 = file1.gets) =~ /#/; end + line1.strip! + parts1 = line1.split(sep) + key1 = parts1.shift + cols1 = parts1.length + + while (line2 = file2.gets) =~ /#/; end + line2.strip! + parts2 = line2.split(sep) + key2 = parts2.shift + cols2 = parts2.length + while not (done1 or done2) + case + when key1 < key2 + output.puts [key1, parts1, [""] * cols2] * sep + if file1.eof? + done1 = true + else + line1 = file1.gets + line1.strip! + parts1 = line1.split(sep) + key1 = parts1.shift + end + when key2 < key1 + output.puts [key2, [""] * cols1, parts2] * sep + if file2.eof? + done2 = true + else + line2 = file2.gets + line2.strip! + parts2 = line2.split(sep) + key2 = parts2.shift + end + when key1 == key2 + output.puts [key1, parts1, parts2] * sep + if file1.eof? + done1 = true + else + line1 = file1.gets + line1.strip! + parts1 = line1.split(sep) + key1 = parts1.shift + end + if file2.eof? + done2 = true + else + line2 = file2.gets + line2.strip! + parts2 = line2.split(sep) + key2 = parts2.shift + end + end + end + + while not done1 + output.puts [key1, parts1, [""] * cols2] * sep + if file1.eof? + done1 = true + else + line1 = file1.gets + line1.strip! + parts1 = line1.split(sep) + key1 = parts1.shift + end + end + + while not done2 + output.puts [key2, [""] * cols1, parts2] * sep + if file2.eof? + done2 = true + else + line2 = file2.gets + line2.strip! + parts2 = line2.split(sep) + key2 = parts2.shift + end + end + + output.close + end #{{{ Attach Methods def attach_same_key(other, fields = nil) fields = other.fields - [key_field].concat(self.fields) if fields.nil? @@ -24,11 +201,11 @@ end def attach_source_key(other, source, fields = nil) fields = other.fields - [key_field].concat(self.fields) if fields.nil? - other = other.tsv unless TSV === other + other = other.tsv(:persistence => :no_create) unless TSV === other field_positions = fields.collect{|field| other.identify_field field} field_names = field_positions.collect{|pos| pos == :key ? other.key_field : other.fields[pos] } through do |key, values| source_keys = values[source] @@ -85,11 +262,15 @@ all_new_values = [] source_keys.each do |source_key| next unless other.include? source_key new_values = field_positions.collect do |pos| if pos == :key - source_key + if other.type == :double + [source_key] + else + source_key + end else other[source_key][pos] end end new_values.collect!{|v| [v]} if type == :double and not other.type == :double @@ -117,22 +298,22 @@ end #{{{ Attach Helper # May make an extra index! - def self.find_path(files, in_namespace = false) + def self.find_path(files, options = {}) + options = Misc.add_defaults options, :in_namespace => false + in_namespace = options[:in_namespace] + if in_namespace ids = [files.first.all_namespace_fields(in_namespace)] ids += files[1..-1].collect{|f| f.all_fields} else ids = files.collect{|f| f.all_fields} end id_list = [] - ids.flatten.each do |field| - end - ids.each_with_index do |list, i| break if i == ids.length - 1 match = list.select{|field| ids[i + 1].select{|f| field == f}.any? } @@ -146,25 +327,28 @@ else id_list.zip(files[0..-1]) end end - def self.build_traverse_index(files, in_namespace = false) - path = find_path(files, in_namespace) + def self.build_traverse_index(files, options = {}) + options = Misc.add_defaults options, :in_namespace => false, :persist_input => false + in_namespace = options[:in_namespace] + persist_input = options[:persist_input] + path = find_path(files, options) + return nil if path.nil? traversal_ids = path.collect{|p| p.first} Log.medium "Found Traversal: #{traversal_ids * " => "}" - current_key = files.first.all_fields.first - target = files.last.all_fields.first - target = nil current_id, current_file = path.shift - index = current_file.index :target => current_id, :fields => current_key, :persistence => false + current_key = current_file.all_fields.first + index = current_file.index :target => current_id, :fields => current_key, :persistence => persist_input + while not path.empty? current_id, current_file = path.shift current_index = current_file.index :target => current_id, :fields => index.fields.first, :persistence => true index.process 0 do |value| current_index.values_at(*value).flatten.uniq @@ -173,11 +357,14 @@ end index end - def self.find_traversal(tsv1, tsv2, in_namespace = false) + def self.find_traversal(tsv1, tsv2, options = {}) + options = Misc.add_defaults options, :in_namespace => false + in_namespace = options[:in_namespace] + identifiers1 = tsv1.identifier_files || [] identifiers2 = tsv2.identifier_files || [] identifiers1.unshift tsv1 identifiers2.unshift tsv2 @@ -186,21 +373,21 @@ files2 = [] while identifiers1.any? files1.push identifiers1.shift identifiers2.each_with_index do |e,i| files2 = identifiers2[(0..i)] - index = build_traverse_index(files1 + files2.reverse, in_namespace) + index = build_traverse_index(files1 + files2.reverse, options) return index if not index.nil? end end return nil end def attach(other, fields = nil, options = {}) - options = Misc.add_defaults options, :in_namespace => true - in_namespace = Misc.process_options options, :in_namespace + options = Misc.add_defaults options, :in_namespace => false + in_namespace = options[:in_namespace] fields = other.fields - [key_field].concat(self.fields) if fields == :all fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil? Log.high("Attaching fields:#{fields.inspect} from #{other.filename.inspect}.") case @@ -209,11 +396,11 @@ when (not in_namespace and self.fields.include?(other.key_field)) attach_source_key other, other.key_field, fields when (in_namespace and self.fields_in_namespace.include?(other.key_field)) attach_source_key other, other.key_field, fields else - index = TSV.find_traversal(self, other, in_namespace) + index = TSV.find_traversal(self, other, options) raise "Cannot traverse identifiers" if index.nil? attach_index other, index, fields end Log.medium("Attachment of fields:#{fields.inspect} from #{other.filename.inspect} finished.") end @@ -221,8 +408,24 @@ def detach(file) file_fields = file.fields.collect{|field| field.fullname} detached_fields = [] self.fields.each_with_index{|field,i| detached_fields << i if file_fields.include? field.fullname} reorder :key, detached_fields + end + + def paste(other, options = {}) + tmpfile = TmpFile.tmp_file + TSV.paste(self.to_s, other.to_s, tmpfile) + + new = TSV.new(tmpfile, options) + + new.key_field = self.key_field unless self.key_field.nil? + if self.fields and other.fields + new.fields = self.fields + other.fields + end + + FileUtils.rm tmpfile if File.exists? tmpfile + + new end end