class TSV

  # Merges two key-sorted, +sep+-separated streams into +output+, writing one
  # line per distinct key. When a key occurs on several rows of the same
  # input, the values of each column are joined with "|".
  #
  # file1, file2:: a String holding a path to an existing file, a String or
  #                StringIO holding the content itself (both are piped through
  #                `sort` and have lines with an empty key removed), or any
  #                other object responding to +gets+, which is read as-is
  #                (presumably already sorted by key -- the merge compares keys
  #                with String#<).
  # output::       a path (String) to open for writing, or an object responding
  #                to +puts+ and +close+. It is closed before returning.
  # sep::          field separator.
  #
  # Leading lines that start with "#" are treated as headers and skipped. The
  # number of columns of each input is taken from its first data row; an input
  # without data rows contributes no columns instead of raising.
  def self.paste_merge(file1, file2, output, sep = "\t")
    drop_empty_keys = " | grep -v '^#{sep}'"
    file1 = paste_sorted_stream(file1, sep, drop_empty_keys, true)
    file2 = paste_sorted_stream(file2, sep, drop_empty_keys, true)

    output = File.open(output, 'w') if String === output

    key1, parts1 = paste_next_row(file1, sep, true)
    key2, parts2 = paste_next_row(file2, sep, true)
    done1 = key1.nil?
    done2 = key2.nil?
    cols1 = done1 ? 0 : parts1.length
    cols2 = done2 ? 0 : parts2.length

    until done1 && done2
      # The next key to emit is the smallest pending one.
      key = if done1
              key2
            elsif done2
              key1
            else
              key1 < key2 ? key1 : key2
            end

      merged = Array.new(cols1 + cols2) { "" }

      # Consume every consecutive row of file1 carrying this key.
      while !done1 && key1 == key
        paste_merge_values(merged, parts1, 0)
        key1, parts1 = paste_next_row(file1, sep)
        done1 = key1.nil?
      end

      # Same for file2; its values live after the columns of file1.
      while !done2 && key2 == key
        paste_merge_values(merged, parts2, cols1)
        key2, parts2 = paste_next_row(file2, sep)
        done2 = key2.nil?
      end

      output.puts [key, merged].flatten * sep
    end

    output.close
  end

  # Pastes two key-sorted, +sep+-separated streams side by side into +output+:
  # each output line holds a key, the values from file1 and the values from
  # file2. A key present in only one input gets empty strings for the columns
  # of the other one.
  #
  # file1, file2:: a String holding a path to an existing file or the content
  #                itself (both are piped through `sort`), or any other object
  #                responding to +gets+, which is read as-is (presumably
  #                already sorted by key).
  # output::       a path (String) to open for writing, or an object responding
  #                to +puts+ and +close+. It is closed before returning.
  # sep::          field separator.
  #
  # Leading lines that start with "#" are treated as headers and skipped.
  # Trailing empty fields are preserved and short rows are padded to the
  # column count of the first data row, so values never shift columns.
  def self.paste(file1, file2, output, sep = "\t")
    file1 = paste_sorted_stream(file1, sep, "", false)
    file2 = paste_sorted_stream(file2, sep, "", false)

    output = File.open(output, 'w') if String === output

    key1, parts1 = paste_next_row(file1, sep, true)
    key2, parts2 = paste_next_row(file2, sep, true)
    cols1 = key1.nil? ? 0 : parts1.length
    cols2 = key2.nil? ? 0 : parts2.length
    blank1 = [""] * cols1
    blank2 = [""] * cols2

    # A nil key means the corresponding input is exhausted.
    until key1.nil? && key2.nil?
      take1 = !key1.nil? && (key2.nil? || key1 <= key2)
      take2 = !key2.nil? && (key1.nil? || key2 <= key1)

      row = [take1 ? key1 : key2]
      row.concat(take1 ? paste_pad_row(parts1, cols1) : blank1)
      row.concat(take2 ? paste_pad_row(parts2, cols2) : blank2)
      output.puts row * sep

      key1, parts1 = paste_next_row(file1, sep) if take1
      key2, parts2 = paste_next_row(file2, sep) if take2
    end

    output.close
  end

  # Turns +source+ into a stream of lines sorted by their first field.
  # A short, single-line String naming an existing file is sorted from disk;
  # any other String (and a StringIO when +accept_stringio+ is set) is fed to
  # `sort` on standard input; anything else is returned untouched.
  # +filter+ is appended verbatim to the shell pipeline.
  #
  # NOTE(review): the path is interpolated into the shell command unquoted, as
  # in the original code -- paths with spaces or shell metacharacters will
  # misbehave. CMD.cmd is defined elsewhere; it is assumed to return an object
  # responding to +gets+ when called with :pipe => true.
  def self.paste_sorted_stream(source, sep, filter, accept_stringio)
    sort = "sort -k1,1 -t'#{sep}'"
    if String === source && !source.index("\n") && source.length < 250 && File.exist?(source)
      CMD.cmd("#{sort} #{ source }#{filter} ", :pipe => true)
    elsif String === source || (accept_stringio && StringIO === source)
      CMD.cmd("#{sort}#{filter}", :in => source, :pipe => true)
    else
      source
    end
  end

  # Reads the next data row from +io+ and returns [key, values], or nil once
  # the stream is exhausted. Blank lines are skipped; with +skip_header+ set,
  # lines starting with "#" are skipped as well.
  def self.paste_next_row(io, sep, skip_header = false)
    while (line = io.gets)
      next if skip_header && line.start_with?("#")
      # limit -1 keeps trailing empty fields
      key, *parts = line.chomp.split(sep, -1)
      return [key, parts] unless key.nil?
    end
    nil
  end

  # Folds +values+ into +merged+ starting at +offset+: an empty slot takes the
  # value, an occupied one gets "|" and the value appended.
  def self.paste_merge_values(merged, values, offset)
    values.each_with_index do |value, i|
      current = merged[offset + i]
      merged[offset + i] = (current.nil? || current.empty?) ? value : current << "|" << value
    end
  end

  # Returns +parts+ extended with empty strings up to +cols+ entries.
  def self.paste_pad_row(parts, cols)
    missing = cols - parts.length
    missing > 0 ? parts + [""] * missing : parts
  end

  private_class_method :paste_sorted_stream, :paste_next_row,
                       :paste_merge_values, :paste_pad_row

  #{{{ Attach Methods

  # Appends the selected +fields+ of +other+ to every entry of this TSV,
  # matching entries by key. Entries whose key is missing from +other+ get one
  # empty placeholder per field ([] for :double tables, "" otherwise).
  #
  # fields:: selectors passed to +values_at+ on the rows and on the field list
  #          of +other+; defaults to the fields of +other+ that are neither
  #          this TSV's key field nor one of its fields.
  def attach_same_key(other, fields = nil)
    fields = other.fields - [key_field].concat(self.fields) if fields.nil?

    through do |key, values|
      if other.include? key
        new_values = attach_coerce_values(other[key].values_at(*fields), other)
        self[key] = self[key].concat new_values
      else
        self[key] = self[key].concat attach_blank_values(fields.length)
      end
    end

    self.fields = self.fields.concat other.fields.values_at(*fields)
  end

  # Appends +fields+ of +other+ to every entry, looking entries of +other+ up
  # through the keys found in this TSV's +source+ field.
  #
  # +other+ is converted with +tsv+ (:persistence => :no_create) unless it
  # already is a TSV.
  def attach_source_key(other, source, fields = nil)
    fields = other.fields - [key_field].concat(self.fields) if fields.nil?

    other = other.tsv(:persistence => :no_create) unless TSV === other

    attach_linked_values(other, fields) { |key, values| values[source] }
  end

  # Appends +fields+ of +other+ to every entry, translating each key of this
  # TSV into keys of +other+ through +index+ (index[key] gives the list of
  # keys to look up).
  #
  # +other+ is converted with +tsv+ unless it already is a TSV.
  def attach_index(other, index, fields = nil)
    fields = other.fields - [key_field].concat(self.fields) if fields.nil?

    other = other.tsv unless TSV === other

    attach_linked_values(other, fields) { |key, values| index[key] }
  end

  #{{{ Attach Helper

  # May make an extra index!
  #
  # Looks for a chain of shared fields connecting consecutive +files+.
  # Returns nil when two consecutive files have no field in common; otherwise
  # returns the chosen fields zipped with +files+.
  #
  # With :in_namespace set, the first file contributes
  # all_namespace_fields(in_namespace) instead of all_fields.
  def self.find_path(files, options = {})
    options      = Misc.add_defaults options, :in_namespace => false
    in_namespace = options[:in_namespace]

    if in_namespace
      ids = [files.first.all_namespace_fields(in_namespace)]
      ids += files[1..-1].collect { |f| f.all_fields }
    else
      ids = files.collect { |f| f.all_fields }
    end

    id_list = []
    ids.each_cons(2) do |list, next_list|
      match = list.select { |field| next_list.select { |f| field == f }.any? }
      return nil if match.empty?
      id_list << match.first
    end

    # NOTE(review): +id_list.last.first+ is kept exactly as found; it requires
    # the field objects to respond to +first+ and +id_list+ to be non-empty --
    # confirm against the field type used by the project.
    if id_list.last.first != files.last.all_fields.first
      id_list << files.last.all_fields.first
    end

    id_list.zip(files)
  end

  # Builds an index that follows the path found by find_path across +files+.
  # Returns nil when no path exists.
  #
  # Options: :in_namespace (forwarded to find_path) and :persist_input
  # (persistence of the index built on the first file; the indices of the
  # remaining files are always requested with :persistence => true).
  def self.build_traverse_index(files, options = {})
    options       = Misc.add_defaults options, :in_namespace => false, :persist_input => false
    persist_input = options[:persist_input]

    path = find_path(files, options)
    return nil if path.nil?

    traversal_ids = path.collect { |p| p.first }
    Log.medium "Found Traversal: #{traversal_ids * " => "}"

    current_id, current_file = path.shift
    current_key = current_file.all_fields.first

    index = current_file.index :target => current_id, :fields => current_key, :persistence => persist_input

    until path.empty?
      current_id, current_file = path.shift
      current_index = current_file.index :target => current_id, :fields => index.fields.first, :persistence => true
      # Re-map every value of the running index through the next file's index.
      index.process 0 do |value|
        current_index.values_at(*value).flatten.uniq
      end
      index.fields = current_index.fields
    end

    index
  end

  # Searches for an index connecting +tsv1+ to +tsv2+, trying progressively
  # longer chains built from each TSV followed by its identifier files.
  # Returns the first index build_traverse_index produces, or nil.
  def self.find_traversal(tsv1, tsv2, options = {})
    options = Misc.add_defaults options, :in_namespace => false

    # Work on copies: the lists are modified below (unshift/shift) and must
    # not alter whatever array +identifier_files+ hands back.
    identifiers1 = (tsv1.identifier_files || []).dup
    identifiers2 = (tsv2.identifier_files || []).dup

    identifiers1.unshift tsv1
    identifiers2.unshift tsv2

    files1 = []
    while identifiers1.any?
      files1.push identifiers1.shift
      identifiers2.each_with_index do |e, i|
        files2 = identifiers2[(0..i)]
        index = build_traverse_index(files1 + files2.reverse, options)
        return index unless index.nil?
      end
    end

    nil
  end

  # Attaches +fields+ of +other+ to this TSV, picking the strategy from how
  # the two tables relate: same key field, key of +other+ present among this
  # TSV's fields, or -- failing both -- a traversal through identifier files.
  #
  # fields::  :all selects every field of +other+ not already present; nil
  #           selects the same from +other.fields_in_namespace+.
  # options:: :in_namespace, forwarded to the traversal search.
  #
  # Raises RuntimeError ("Cannot traverse identifiers") when no relation is
  # found.
  def attach(other, fields = nil, options = {})
    options      = Misc.add_defaults options, :in_namespace => false
    in_namespace = options[:in_namespace]

    fields = other.fields - [key_field].concat(self.fields) if fields == :all
    fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil?

    Log.high("Attaching fields:#{fields.inspect} from #{other.filename.inspect}.")

    if key_field == other.key_field
      attach_same_key other, fields
    elsif !in_namespace && self.fields.include?(other.key_field)
      attach_source_key other, other.key_field, fields
    elsif in_namespace && self.fields_in_namespace.include?(other.key_field)
      attach_source_key other, other.key_field, fields
    else
      index = TSV.find_traversal(self, other, options)
      raise "Cannot traverse identifiers" if index.nil?
      attach_index other, index, fields
    end

    Log.medium("Attachment of fields:#{fields.inspect} from #{other.filename.inspect} finished.")
  end

  # Reorders this TSV keeping only the fields whose +fullname+ matches one of
  # the fields of +file+; returns the result of +reorder+.
  def detach(file)
    file_fields = file.fields.collect { |field| field.fullname }

    detached_fields = []
    self.fields.each_with_index do |field, i|
      detached_fields << i if file_fields.include? field.fullname
    end

    reorder :key, detached_fields
  end

  # Pastes +other+ next to this TSV (see TSV.paste) and returns the result as
  # a new TSV built with +options+. The key field is inherited from this TSV
  # and, when both tables have fields, the field lists are concatenated.
  def paste(other, options = {})
    tmpfile = TmpFile.tmp_file
    begin
      TSV.paste(self.to_s, other.to_s, tmpfile)

      pasted = TSV.new(tmpfile, options)
      pasted.key_field = self.key_field unless self.key_field.nil?
      pasted.fields = self.fields + other.fields if self.fields && other.fields
      pasted
    ensure
      # Remove the temporary file even when pasting or loading fails.
      FileUtils.rm tmpfile if File.exist? tmpfile
    end
  end

  private

  # Shared body of attach_source_key and attach_index. For every entry the
  # block receives (key, values) and returns the keys of +other+ to look up
  # (nil or empty for none); the selected +fields+ of the matches are appended
  # to the entry.
  #
  # NOTE(review): the key of +other+ is wrapped in an array when +other+ is
  # :double, as attach_index always did; attach_source_key previously passed
  # it bare. TSV.zip_fields is defined elsewhere and is assumed to combine the
  # per-match rows column by column.
  def attach_linked_values(other, fields)
    field_positions = fields.collect { |field| other.identify_field field }
    field_names     = field_positions.collect { |pos| pos == :key ? other.key_field : other.fields[pos] }

    through do |key, values|
      source_keys    = yield(key, values)
      all_new_values = []

      unless source_keys.nil? || source_keys.empty?
        source_keys.each do |source_key|
          next unless other.include? source_key
          new_values = field_positions.collect do |pos|
            if pos == :key
              other.type == :double ? [source_key] : source_key
            else
              other[source_key][pos]
            end
          end
          all_new_values << attach_coerce_values(new_values, other)
        end
      end

      if all_new_values.empty?
        self[key] = self[key].concat attach_blank_values(field_positions.length)
      elsif type == :double
        self[key] = self[key].concat TSV.zip_fields(all_new_values).collect { |l| l.flatten }
      else
        self[key] = self[key].concat all_new_values.first
      end
    end

    self.fields = self.fields.concat field_names
  end

  # Adapts values read from +other+ to this TSV's type: wraps each one in an
  # array when only this TSV is :double, unwraps the first element when only
  # +other+ is.
  def attach_coerce_values(values, other)
    if type == :double
      other.type == :double ? values : values.collect { |v| [v] }
    else
      other.type == :double ? values.collect { |v| v.first } : values
    end
  end

  # Returns +count+ independent placeholders ([] for :double tables, ""
  # otherwise). Each one is a distinct object so that modifying one column in
  # place cannot leak into the others.
  def attach_blank_values(count)
    Array.new(count) { type == :double ? [] : "" }
  end
end