lib/rbbt/tsv/attach.rb in rbbt-util-5.6.0 vs lib/rbbt/tsv/attach.rb in rbbt-util-5.6.1

- old
+ new

@@ -1,29 +1,42 @@ require 'rbbt/tsv' require 'rbbt/tsv/attach/util' module TSV # Merge columns from different rows of a file - def self.merge_row_fields(input, output, sep = "\t") + def self.merge_row_fields(input, output, options = {}) + options = Misc.add_defaults options, :sep => "\t" + key_field, fields = Misc.process_options options, :key_field, :fields + sep = options[:sep] + is = case when (String === input and not input.index("\n") and input.length < 250 and File.exists?(input)) CMD.cmd("sort -k1,1 -t'#{sep}' #{ input } | grep -v '^#{sep}' ", :pipe => true) when (String === input or StringIO === input) CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => input, :pipe => true) else input end + + if key_field.nil? or fields.nil? + parser = TSV::Parser.new(is, options.dup) + fields ||= parser.fields + key_field ||= parser.key_field + line = parser.first_line + else + line = is.gets + end current_key = nil current_parts = [] done = false Open.write(output) do |os| + os.puts TSV.header_lines(key_field, fields, options) - done = is.eof? - while not done - key, *parts = is.gets.sub("\n",'').split(sep, -1) + while line + key, *parts = line.sub("\n",'').split(sep, -1) current_key ||= key case when key.nil? when current_key == key parts.each_with_index do |part,i| @@ -37,23 +50,27 @@ os.puts [current_key, current_parts].flatten * sep current_key = key current_parts = parts end - done = is.eof? + line = is.gets end os.puts [current_key, current_parts].flatten * sep unless current_key.nil? end end # Merge two files with the same keys and different fields - def self.merge_different_fields(file1, file2, output, sep = "\t", monitor = false) + def self.merge_different_fields(file1, file2, output, options = {}) + options = Misc.add_defaults options, :sep => "\t" + monitor, key_field, fields = Misc.process_options options, :monitor, :key_field, :fields + sep = options[:sep] || "\t" + case when (String === file1 and not file1 =~ /\n/ and file1.length < 250 and File.exists?(file1)) - size = CMD.cmd("wc -l '#{file1}'").read.to_f if monitor + size = CMD.cmd("wc -c '#{file1}'").read.to_f if monitor file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true) when (String === file1 or StringIO === file1) size = file1.length if monitor file1 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1, :pipe => true) when TSV === file1 @@ -78,23 +95,31 @@ done1 = false done2 = false key1 = key2 = nil while key1.nil? - while (line1 = file1.gets) =~ /#/; end + while (line1 = file1.gets) =~ /^#/ + key_field1, *fields1 = line1.strip.sub('#','').split(sep) + end key1, *parts1 = line1.sub("\n",'').split(sep, -1) cols1 = parts1.length end while key2.nil? - while (line2 = file2.gets) =~ /#/; end + while (line2 = file2.gets) =~ /^#/ + key_field2, *fields2 = line2.strip.sub('#','').split(sep) + end key2, *parts2 = line2.sub("\n",'').split(sep, -1) cols2 = parts2.length end progress_monitor = Progress::Bar.new(size, 0, 100, "Merging fields") if monitor + entry_hash = options + entry_hash.delete :sep if entry_hash[:sep] == "\t" + output.puts TSV.header_lines key_field1, fields1 + fields2, entry_hash if key_field1 and fields1 and fields2 + key = key1 < key2 ? key1 : key2 parts = [""] * (cols1 + cols2) while not (done1 and done2) while (not done1 and key1 == key) parts1.each_with_index do |part, i| @@ -187,10 +212,10 @@ reorder :key, detached_fields end def merge_different_fields(other, options = {}) TmpFile.with_file do |output| - TSV.merge_different_fields(self, other, output, options[:sep] || "\t") + TSV.merge_different_fields(self, other, output, options) tsv = TSV.open output, options tsv.key_field = self.key_field unless self.key_field.nil? tsv.fields = self.fields + other.fields unless self.fields.nil? or other.fields.nil? tsv end