lib/rbbt/util/tsv/attach.rb in rbbt-util-2.1.0 vs lib/rbbt/util/tsv/attach.rb in rbbt-util-3.0.2
- old
+ new
@@ -1,7 +1,184 @@
class TSV
# Merges two TSV inputs by key and writes one line per distinct key to
# +output+.
#
# Each of +file1+ and +file2+ may be:
# * a short single-line String naming an existing file: the file is run
#   through `sort` on its first column and lines starting with the separator
#   are dropped with `grep -v`;
# * any other String, or a StringIO: its content is piped through the same
#   sort/grep command;
# * anything else: used as is. It must respond to +gets+ (returning nil at
#   end of input) and the merge below assumes it is already sorted by key.
#
# Lines containing '#' that precede the first data row of an input are
# skipped. The number of value columns of each input is taken from its first
# data row. When a key appears on several rows, the values of each column are
# joined with "|" (empty values are not joined). A key missing from one input
# gets empty strings in that input's columns.
#
# An input without any data row contributes zero columns instead of raising.
#
# +output+ is either a String path (opened for writing) or an object that
# responds to +puts+ and +close+. It is always closed before returning, even
# when an error interrupts the merge.
def self.paste_merge(file1, file2, output, sep = "\t")
  case
  when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exist?(file1))
    file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true)
  when (String === file1 or StringIO === file1)
    file1 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1, :pipe => true)
  end

  case
  when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exist?(file2))
    file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } | grep -v '^#{sep}' ", :pipe => true)
  when (String === file2 or StringIO === file2)
    file2 = CMD.cmd("sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2, :pipe => true)
  end

  output = File.open(output, 'w') if String === output

  begin
    # Reads the next data row as [key, values]; returns nil once the input is
    # exhausted. Lines that yield no key (blank lines) are skipped, and so are
    # lines containing '#' while skip_headers is set.
    read_row = lambda do |io, skip_headers|
      row = nil
      while row.nil? and (line = io.gets)
        next if skip_headers and line =~ /#/
        key, *values = line.sub("\n", '').split(sep, -1)
        row = [key, values] unless key.nil?
      end
      row
    end

    # Folds the values of one row into the merged row, starting at offset.
    merge_into = lambda do |merged, values, offset|
      values.each_with_index do |value, i|
        current = merged[offset + i]
        merged[offset + i] = (current.nil? or current.empty?) ? value : "#{current}|#{value}"
      end
    end

    # A nil key marks an exhausted input.
    key1, parts1 = read_row.call(file1, true)
    key2, parts2 = read_row.call(file2, true)

    cols1 = parts1.nil? ? 0 : parts1.length
    cols2 = parts2.nil? ? 0 : parts2.length

    until key1.nil? and key2.nil?
      # Smallest pending key across both inputs
      key = [key1, key2].compact.min
      merged = Array.new(cols1 + cols2) { "" }

      while key1 == key
        merge_into.call(merged, parts1, 0)
        key1, parts1 = read_row.call(file1, false)
      end

      while key2 == key
        merge_into.call(merged, parts2, cols1)
        key2, parts2 = read_row.call(file2, false)
      end

      output.puts [key, merged].flatten * sep
    end
  ensure
    output.close
  end
end
+
# Pastes two TSV inputs side by side, matching rows by key, and writes the
# result to +output+.
#
# Each of +file1+ and +file2+ may be:
# * a short single-line String naming an existing file: the file is run
#   through `sort` on its first column;
# * any other String: its content is piped through the same sort command;
# * anything else: used as is. It must respond to +gets+ (returning nil at
#   end of input) and the merge below assumes it is already sorted by key.
#
# Lines containing '#' that precede the first data row of an input are
# skipped. The number of value columns of each input is taken from its first
# data row. Every output line is the key, then the values from +file1+, then
# the values from +file2+; a key absent from one input gets empty strings in
# that input's columns.
#
# Rows are split with a -1 limit and only the line terminator is removed, so
# trailing empty fields are preserved and columns stay aligned (the same
# splitting paste_merge uses). Rows shorter than the column count are padded
# with empty strings. Blank lines are skipped, and an input without any data
# row contributes zero columns instead of raising.
#
# +output+ is either a String path (opened for writing) or an object that
# responds to +puts+ and +close+. It is always closed before returning, even
# when an error interrupts the paste.
def self.paste(file1, file2, output, sep = "\t")
  case
  when (String === file1 and not file1.index("\n") and file1.length < 250 and File.exist?(file1))
    file1 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file1 } ", :pipe => true)
  when String === file1
    file1 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file1, :pipe => true)
  end

  case
  when (String === file2 and not file2.index("\n") and file2.length < 250 and File.exist?(file2))
    file2 = CMD.cmd("sort -k1,1 -t'#{sep}' #{ file2 } ", :pipe => true)
  when String === file2
    file2 = CMD.cmd("sort -k1,1 -t'#{sep}'", :in => file2, :pipe => true)
  end

  output = File.open(output, 'w') if String === output

  begin
    # Reads the next data row as [key, values]; returns nil once the input is
    # exhausted. Blank lines are skipped, and so are lines containing '#'
    # while skip_headers is set.
    read_row = lambda do |io, skip_headers|
      row = nil
      while row.nil? and (line = io.gets)
        next if skip_headers and line =~ /#/
        line = line.chomp
        next if line.empty?
        key, *values = line.split(sep, -1)
        row = [key, values]
      end
      row
    end

    # Right-pads a list of values with empty strings up to width.
    pad = lambda do |values, width|
      values.length < width ? values + [""] * (width - values.length) : values
    end

    # A nil key marks an exhausted input.
    key1, parts1 = read_row.call(file1, true)
    key2, parts2 = read_row.call(file2, true)

    cols1 = parts1.nil? ? 0 : parts1.length
    cols2 = parts2.nil? ? 0 : parts2.length

    # Writes one output line. Fields are concatenated explicitly because
    # joining a nested empty array would emit a stray separator.
    emit = lambda do |key, left, right|
      output.puts(([key] + pad.call(left, cols1) + pad.call(right, cols2)) * sep)
    end

    until key1.nil? and key2.nil?
      order = if key1.nil?
                1
              elsif key2.nil?
                -1
              else
                key1 <=> key2
              end

      if order < 0
        emit.call(key1, parts1, [])
        key1, parts1 = read_row.call(file1, false)
      elsif order > 0
        emit.call(key2, [], parts2)
        key2, parts2 = read_row.call(file2, false)
      else
        emit.call(key1, parts1, parts2)
        key1, parts1 = read_row.call(file1, false)
        key2, parts2 = read_row.call(file2, false)
      end
    end
  ensure
    output.close
  end
end
#{{{ Attach Methods
def attach_same_key(other, fields = nil)
fields = other.fields - [key_field].concat(self.fields) if fields.nil?
@@ -24,11 +201,11 @@
end
def attach_source_key(other, source, fields = nil)
fields = other.fields - [key_field].concat(self.fields) if fields.nil?
- other = other.tsv unless TSV === other
+ other = other.tsv(:persistence => :no_create) unless TSV === other
field_positions = fields.collect{|field| other.identify_field field}
field_names = field_positions.collect{|pos| pos == :key ? other.key_field : other.fields[pos] }
through do |key, values|
source_keys = values[source]
@@ -85,11 +262,15 @@
all_new_values = []
source_keys.each do |source_key|
next unless other.include? source_key
new_values = field_positions.collect do |pos|
if pos == :key
- source_key
+ if other.type == :double
+ [source_key]
+ else
+ source_key
+ end
else
other[source_key][pos]
end
end
new_values.collect!{|v| [v]} if type == :double and not other.type == :double
@@ -117,22 +298,22 @@
end
#{{{ Attach Helper
# May make an extra index!
- def self.find_path(files, in_namespace = false)
+ def self.find_path(files, options = {})
+ options = Misc.add_defaults options, :in_namespace => false
+ in_namespace = options[:in_namespace]
+
if in_namespace
ids = [files.first.all_namespace_fields(in_namespace)]
ids += files[1..-1].collect{|f| f.all_fields}
else
ids = files.collect{|f| f.all_fields}
end
id_list = []
- ids.flatten.each do |field|
- end
-
ids.each_with_index do |list, i|
break if i == ids.length - 1
match = list.select{|field|
ids[i + 1].select{|f| field == f}.any?
}
@@ -146,25 +327,28 @@
else
id_list.zip(files[0..-1])
end
end
- def self.build_traverse_index(files, in_namespace = false)
- path = find_path(files, in_namespace)
+ def self.build_traverse_index(files, options = {})
+ options = Misc.add_defaults options, :in_namespace => false, :persist_input => false
+ in_namespace = options[:in_namespace]
+ persist_input = options[:persist_input]
+ path = find_path(files, options)
+
return nil if path.nil?
traversal_ids = path.collect{|p| p.first}
Log.medium "Found Traversal: #{traversal_ids * " => "}"
- current_key = files.first.all_fields.first
- target = files.last.all_fields.first
- target = nil
current_id, current_file = path.shift
- index = current_file.index :target => current_id, :fields => current_key, :persistence => false
+ current_key = current_file.all_fields.first
+ index = current_file.index :target => current_id, :fields => current_key, :persistence => persist_input
+
while not path.empty?
current_id, current_file = path.shift
current_index = current_file.index :target => current_id, :fields => index.fields.first, :persistence => true
index.process 0 do |value|
current_index.values_at(*value).flatten.uniq
@@ -173,11 +357,14 @@
end
index
end
- def self.find_traversal(tsv1, tsv2, in_namespace = false)
+ def self.find_traversal(tsv1, tsv2, options = {})
+ options = Misc.add_defaults options, :in_namespace => false
+ in_namespace = options[:in_namespace]
+
identifiers1 = tsv1.identifier_files || []
identifiers2 = tsv2.identifier_files || []
identifiers1.unshift tsv1
identifiers2.unshift tsv2
@@ -186,21 +373,21 @@
files2 = []
while identifiers1.any?
files1.push identifiers1.shift
identifiers2.each_with_index do |e,i|
files2 = identifiers2[(0..i)]
- index = build_traverse_index(files1 + files2.reverse, in_namespace)
+ index = build_traverse_index(files1 + files2.reverse, options)
return index if not index.nil?
end
end
return nil
end
def attach(other, fields = nil, options = {})
- options = Misc.add_defaults options, :in_namespace => true
- in_namespace = Misc.process_options options, :in_namespace
+ options = Misc.add_defaults options, :in_namespace => false
+ in_namespace = options[:in_namespace]
fields = other.fields - [key_field].concat(self.fields) if fields == :all
fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil?
Log.high("Attaching fields:#{fields.inspect} from #{other.filename.inspect}.")
case
@@ -209,11 +396,11 @@
when (not in_namespace and self.fields.include?(other.key_field))
attach_source_key other, other.key_field, fields
when (in_namespace and self.fields_in_namespace.include?(other.key_field))
attach_source_key other, other.key_field, fields
else
- index = TSV.find_traversal(self, other, in_namespace)
+ index = TSV.find_traversal(self, other, options)
raise "Cannot traverse identifiers" if index.nil?
attach_index other, index, fields
end
Log.medium("Attachment of fields:#{fields.inspect} from #{other.filename.inspect} finished.")
end
@@ -221,8 +408,24 @@
# Reorders this TSV around the fields it shares with +file+.
#
# Fields are matched by their +fullname+. The positions (within this TSV's
# fields) of every field whose fullname also appears among +file+'s fields
# are collected in order and handed to +reorder+ together with :key.
# Returns whatever +reorder+ returns.
def detach(file)
  shared_names = file.fields.collect { |field| field.fullname }

  positions = []
  self.fields.each_with_index do |field, index|
    positions << index if shared_names.include?(field.fullname)
  end

  reorder :key, positions
end
+
# Builds a new TSV by pasting +other+ next to this one.
#
# The text form (+to_s+) of both objects is handed to TSV.paste, which
# writes the combined rows to a temporary file; that file is then loaded
# with TSV.new using +options+. The result takes this TSV's key field (when
# set) and, when both objects have fields, the concatenation of both field
# lists.
#
# The temporary file is removed before returning, and also when pasting or
# loading raises.
def paste(other, options = {})
  tmpfile = TmpFile.tmp_file

  begin
    TSV.paste(self.to_s, other.to_s, tmpfile)

    pasted = TSV.new(tmpfile, options)

    pasted.key_field = self.key_field unless self.key_field.nil?
    if self.fields and other.fields
      pasted.fields = self.fields + other.fields
    end

    pasted
  ensure
    FileUtils.rm tmpfile if File.exist? tmpfile
  end
end
end