lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.3 vs lib/rbbt/util/tsv/attach.rb in rbbt-util-3.1.0

- old
+ new

@@ -45,17 +45,21 @@ case when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1)) file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true) when (String === file1 or StringIO === file1) file1 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1, :pipe => true) + when TSV === file1 + file1 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1.to_s(:sort, true), :pipe => true) end case when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exists?(file2)) file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } | grep -v '^#{sep}' ", :pipe => true) when (String === file2 or StringIO === file2) file2 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2, :pipe => true) + when TSV === file2 + file2 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2.to_s(:sort, true), :pipe => true) end output = File.open(output, 'w') if String === output cols1 = nil @@ -113,114 +117,10 @@ end end output.close end - - def self.paste(file1, file2, output, sep = "\t") - case - when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exists?(file1)) - file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } ", :pipe => true) - when String === file1 - file1 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file1, :pipe => true) - end - - case - when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exists?(file2)) - file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } ", :pipe => true) - when String === file2 - file2 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file2, :pipe => true) - end - - output = File.open(output, 'w') if String === output - - cols1 = nil - cols2 = nil - - done1 = false - done2 = false - - while (line1 = file1.gets) =~ /#/; end - line1.strip! - parts1 = line1.split(sep) - key1 = parts1.shift - cols1 = parts1.length - - while (line2 = file2.gets) =~ /#/; end - line2.strip! - parts2 = line2.split(sep) - key2 = parts2.shift - cols2 = parts2.length - while not (done1 or done2) - case - when key1 < key2 - output.puts [key1, parts1, [""] * cols2] * sep - if file1.eof? - done1 = true - else - line1 = file1.gets - line1.strip! - parts1 = line1.split(sep) - key1 = parts1.shift - end - when key2 < key1 - output.puts [key2, [""] * cols1, parts2] * sep - if file2.eof? - done2 = true - else - line2 = file2.gets - line2.strip! - parts2 = line2.split(sep) - key2 = parts2.shift - end - when key1 == key2 - output.puts [key1, parts1, parts2] * sep - if file1.eof? - done1 = true - else - line1 = file1.gets - line1.strip! - parts1 = line1.split(sep) - key1 = parts1.shift - end - if file2.eof? - done2 = true - else - line2 = file2.gets - line2.strip! - parts2 = line2.split(sep) - key2 = parts2.shift - end - end - end - - while not done1 - output.puts [key1, parts1, [""] * cols2] * sep - if file1.eof? - done1 = true - else - line1 = file1.gets - line1.strip! - parts1 = line1.split(sep) - key1 = parts1.shift - end - end - - while not done2 - output.puts [key2, [""] * cols1, parts2] * sep - if file2.eof? - done2 = true - else - line2 = file2.gets - line2.strip! - parts2 = line2.split(sep) - key2 = parts2.shift - end - end - - output.close - end #{{{ Attach Methods def attach_same_key(other, fields = nil) fields = other.fields - [key_field].concat(self.fields) if fields.nil? @@ -294,10 +194,12 @@ other = other.tsv unless TSV === other field_positions = fields.collect{|field| other.identify_field field} field_names = field_positions.collect{|pos| pos == :key ? other.key_field : other.fields[pos] } + + length = self.fields.length through do |key, values| source_keys = index[key] if source_keys.nil? or source_keys.empty? all_new_values = [] else @@ -313,29 +215,40 @@ end else other[source_key][pos] end end - new_values.collect!{|v| [v]} if type == :double and not other.type == :double + new_values.collect!{|v| v.nil? ? [[]] : [v]} if type == :double and not other.type == :double new_values.collect!{|v| v.nil? ? nil : v.first} if not type == :double and other.type == :double all_new_values << new_values end end if all_new_values.empty? if type == :double - self[key] = self[key].concat [[]] * field_positions.length + all_new_values = [[[]] * field_positions.length] else - self[key] = self[key].concat [""] * field_positions.length + all_new_values = [[""] * field_positions.length] end + end + + current = self[key] + + if current.length > length + all_new_values << current.slice!(length..current.length - 1) + end + + if type == :double + all_new_values = TSV.zip_fields(all_new_values).collect{|l| l.flatten} else - if type == :double - self[key] = self[key].concat TSV.zip_fields(all_new_values).collect{|l| l.flatten} - else - self[key] = self[key].concat all_new_values.first - end + all_new_values = all_new_values.first end + + current += all_new_values + + self[key] = current + end self.fields = self.fields.concat field_names end @@ -383,36 +296,37 @@ traversal_ids = path.collect{|p| p.first} Log.medium "Found Traversal: #{traversal_ids * " => "}" data_key, data_file = path.shift - if data_key == data_file.key_field - Log.debug "Data index not required '#{data_file.key_field}' => '#{data_key}'" - data_index = nil - else - Log.debug "Data index required" - data_index = data_file.index :target => data_key, :fields => data_file.key_field, :persistence => false - end + data_index = if data_key == data_file.key_field + Log.debug "Data index not required '#{data_file.key_field}' => '#{data_key}'" + nil + else + Log.debug "Data index required" + data_file.index :target => data_key, :fields => data_file.key_field, :persistence => false + end current_index = data_index current_key = data_key while not path.empty? next_key, next_file = path.shift if current_index.nil? current_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input else next_index = next_file.index :target => next_key, :fields => current_key, :persistence => persist_input - current_index.process current_index.fields.first do |key, values, values| + current_index.process current_index.fields.first do |values| if values.nil? nil else next_index.values_at(*values).flatten.collect end end current_index.fields = [next_key] end + current_key = next_key end current_index end @@ -468,10 +382,12 @@ index = TSV.find_traversal(self, other, options) raise "Cannot traverse identifiers" if index.nil? attach_index other, index, fields end Log.medium("Attachment of fields:#{fields.inspect} from #{other.filename.inspect} finished.") + + self end def detach(file) file_fields = file.fields.collect{|field| field.fullname} detached_fields = [] @@ -487,12 +403,23 @@ new.key_field = self.key_field unless self.key_field.nil? if self.fields and other.fields new.fields = self.fields + other.fields end - + FileUtils.rm tmpfile if File.exists? tmpfile new end + + def paste(other, options = {}) + TmpFile.with_file do |output| + TSV.paste_merge(self, other, output, options[:sep] || "\t") + TSV.new output, options + end + end + + def self.fast_paste(files, delim = "$") + CMD.cmd("paste #{ files.collect{|f| "'#{f}'"} * " "} -d'#{delim}' |sed 's/#{delim}[^\\t]*//g'", :pipe => true) + end end