lib/rbbt/util/tsv.rb in rbbt-util-1.0.1 vs lib/rbbt/util/tsv.rb in rbbt-util-1.1.0

- old
+ new

@@ -1,9 +1,10 @@ require 'rbbt/util/misc' require 'rbbt/util/open' require 'rbbt/util/tc_hash' require 'rbbt/util/tmpfile' +require 'rbbt/util/log' require 'digest' require 'fileutils' def add_defaults(options, defaults = {}) new_options = options.dup @@ -14,10 +15,17 @@ end class TSV class FieldNotFoundError < StandardError;end + module Field + def ==(string) + return false unless String === string + self.sub(/#.*/,'').casecmp(string.sub(/#.*/,'')) == 0 + end + end + #{{{ Persistence PersistenceHash = TCHash CACHEDIR="/tmp/tsv_persistent_cache" @@ -34,19 +42,12 @@ def self.get_persistence_file(file, prefix, options = {}) File.join(CACHEDIR, prefix.gsub(/\s/,'_').gsub(/\//,'>') + Digest::MD5.hexdigest([file, options].inspect)) end - @debug = ENV['TSV_DEBUG'] == "true" - def self.log(message) - STDERR.puts message if @debug == true - end + #{{{ Headers and Field Stuff - def self.debug=(value) - @debug = value - end - def self.headers(file, options = {}) if file =~ /(.*)#(.*)/ and File.exists? $1 options.merge! Misc.string2hash $2 file = $1 end @@ -61,365 +62,12 @@ else nil end end - #{{{ Accesor Methods - - def keys - @data.keys - end - - def values - @data.values - end - - def size - @data.size - end - - # Write - - def []=(key, value) - key = key.downcase if @case_insensitive - @data[key] = value - end - - - def merge!(new_data) - new_data.each do |key, value| - self[key] = value - end - end - - # Read - - def follow(value) - if String === value && value =~ /__Ref:(.*)/ - return self[$1] - else - value = NamedArray.name value, fields if Array === value and fields - value - end - end - def [](key) - if Array === key - return @data[key] if @data[key] != nil - key.each{|k| v = self[k]; return v unless v.nil?} - return nil - end - - key = key.downcase if @case_insensitive - follow @data[key] - end - - def values_at(*keys) - keys.collect{|k| - self[k] - } - end - - def each(&block) - @data.each do |key, value| - block.call(key, follow(value)) - end - end - - def collect - if block_given? - @data.collect do |key, value| - value = follow(value) - key, values = yield key, value - end - else - @data.collect do |key, value| - [key, follow(value)] - end - end - end - - def sort(&block) - collect.sort(&block).collect{|p| - key, value = p - value = NamedArray.name value, fields if fields - [key, value] - } - end - - def sort_by(&block) - collect.sort_by &block - end - - #{{{ Parsing - - def self.parse_fields(io, delimiter = "\t") - return [] if io.nil? - fields = io.split(delimiter, -1) - fields - end - - def self.zip_fields(list, fields = nil) - return [] if list.nil? || list.empty? - fields ||= list.fields if list.respond_to? :fields - zipped = list[0].zip(*list[1..-1]) - zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields - zipped - end - - def self.parse(data, file, options = {}) - - # Prepare options - options = add_defaults options, - :sep => "\t", - :sep2 => "|", - :native => 0, - :extra => nil, - :fix => nil, - :exclude => nil, - :select => nil, - :grep => nil, - :single => false, - :unique => false, - :flatten => false, - :overwrite => false, - :keep_empty => true, - :case_insensitive => false, - :header_hash => '#' , - :persistence_file => nil - - options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra]) - options[:flatten] = true if options[:single] - - - - #{{{ Process first line - - line = file.gets - raise "Empty content" if line.nil? - line.chomp! - - if line =~ /^#{options[:header_hash]}/ - header_fields = parse_fields(line, options[:sep]) - header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character - line = file.gets - else - header_fields = nil - end - - id_pos = Misc.field_position(header_fields, options[:native]) - - if options[:extra].nil? - extra_pos = nil - max_cols = 0 - else - extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) } - end - - #{{{ Process rest - while line do - line.chomp! - - line = options[:fix].call line if options[:fix] - - # Select and fix lines - if (options[:exclude] and options[:exclude].call(line)) or - (options[:select] and not options[:select].call(line)) - line = file.gets - next - end - - ### Process line - - # Chunk fields - parts = parse_fields(line, options[:sep]) - - # Get next line - line = file.gets - - # Get id field - next if parts[id_pos].nil? || parts[id_pos].empty? - ids = parse_fields(parts[id_pos], options[:sep2]) - ids.collect!{|id| id.downcase } if options[:case_insensitive] - - # Get extra fields - - if options[:extra].nil? and not (options[:flatten] or options[:single]) - extra = parts - extra.delete_at(id_pos) - max_cols = extra.size if extra.size > (max_cols || 0) - else - if extra_pos.nil? - extra = parts - extra.delete_at id_pos - else - extra = parts.values_at(*extra_pos) - end - end - - extra.collect!{|value| parse_fields(value, options[:sep2])} - extra.collect!{|values| values.first} if options[:unique] - extra.flatten! if options[:flatten] - extra = extra.first if options[:single] - - if options[:overwrite] - main_entry = ids.shift - ids.each do |id| - data[id] = "__Ref:#{main_entry}" - end - - data[main_entry] = extra - else - main_entry = ids.shift - ids.each do |id| - data[id] = "__Ref:#{main_entry}" - end - - case - when (options[:single] or options[:unique]) - data[main_entry] ||= extra - when options[:flatten] - if PersistenceHash === data - data[main_entry] = (data[main_entry] || []).concat extra - else - data[main_entry] ||= [] - data[main_entry].concat extra - end - else - entry = data[main_entry] || [] - while entry =~ /__Ref:(.*)/ do - entry = data[$1] - end - - extra.each_with_index do |fields, i| - if fields.empty? - next unless options[:keep_empty] - fields = [""] - end - entry[i] ||= [] - entry[i] = entry[i].concat fields - end - - data[main_entry] = entry - end - end - end - - if options[:keep_empty] and not max_cols.nil? - data.each do |key,values| - new_values = values - max_cols.times do |i| - new_values[i] ||= [""] - end - data[key] = new_values - end - end - - - # Save header information - key_field = nil - fields = nil - if header_fields && header_fields.any? - key_field = header_fields[id_pos] - if extra_pos.nil? - fields = header_fields - fields.delete_at(id_pos) - else - fields = header_fields.values_at(*extra_pos) - end - end - - data.read if PersistenceHash === data - - [key_field, fields] - end - - attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename - def initialize(file = {}, options = {}) - @case_insensitive = options[:case_insensitive] == true - @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true) - - case - when TSV === file - @filename = file.filename - @data = file.data - @key_field = file.key_field - @fields = file.fields - @case_insensitive = file.case_insensitive - @list = file.is_list - return self - when (Hash === file or PersistenceHash === file) - @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect) - @data = file - return self - when File === file - @filename = File.expand_path file.path - when String === file && File.exists?(file) - @filename = File.expand_path file - file = Open.open(file) - when StringIO - else - raise "File #{file} not found" - end - - if options[:persistence] - options.delete :persistence - persistence_file = TSV.get_persistence_file @filename, "file:#{ @filename }:", options - - if File.exists? persistence_file - TSV.log "Loading Persistence for #{ @filename } in #{persistence_file}" - @data = PersistenceHash.get(persistence_file, false) - @key_field = @data.key_field - @fields = @data.fields - else - @data = PersistenceHash.get(persistence_file, true) - file = Open.grep(file, options[:grep]) if options[:grep] - - TSV.log "Persistent Parsing for #{ @filename } in #{persistence_file}" - @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file)) - @data.key_field = @key_field - @data.fields = @fields - @data.read - end - else - TSV.log "Non-persistent parsing for #{ @filename }" - @data = {} - file = Open.grep(file, options[:grep]) if options[:grep] - @key_field, @fields = TSV.parse(@data, file, options) - end - - file.close - @case_insensitive = options[:case_insensitive] == true - end - - - def to_s - str = "" - - if fields - str << "#" << key_field << "\t" << fields * "\t" << "\n" - end - - each do |key, values| - case - when values.nil? - str << key.dup << "\n" - when (not Array === values) - str << key.dup << "\t" << values.to_s << "\n" - when Array === values.first - str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n" - else - str << key.dup << "\t" << values * "\t" << "\n" - end - end - - str - end - - #{{{ New - def self.fields_include(key_field, fields, field) - return true if field == key_field or fields.include? field + return true if key_field == field or fields.include? field return false end def self.field_positions(key_field, fields, *selected) selected.collect do |sel| @@ -447,22 +95,28 @@ return nil if fields.nil? return nil if positions.nil? or positions == [nil] (fields + [key_field]).values_at(*positions) end + #{{{ Iteration, Merging, etc def through(new_key_field = nil, new_fields = nil, &block) new_key_position = (field_positions(new_key_field) || [-1]).first + new_fields = [new_fields] if String === new_fields if new_key_position == -1 if new_fields.nil? or new_fields == fields each &block return [key_field, fields] else new_field_positions = field_positions(*new_fields) each do |key, values| - yield key, values.values_at(*new_field_positions) + if values.nil? + yield key, nil + else + yield key, values.values_at(*new_field_positions) + end end return [key_field, fields_at(*new_field_positions)] end else @@ -541,10 +195,55 @@ def slice(new_fields, options = {}) reorder(:main, new_fields) end + def add_field(name = nil) + each do |key, values| + self[key] = values << yield(key, values) + end + + fields << name if list + if PersistenceHash === @data + @data.fields = fields + end + end + + def select(method) + new = TSV.new({}) + new.key_field = key_field + new.fields = fields.dup + + case + when Array === method + through do |key, values| + new[key] = values if ([key,values].flatten & method).any? + end + when Regexp === method + through do |key, values| + new[key] = values if [key,values].flatten.select{|v| v =~ method}.any? + end + when Hash === method + key = method.keys.first + method = method.values.first + case + when (Array === method and (:main == key or key_field == key)) + method.each{|item| if values = self[item]; then new[item] = values; end} + when Array === method + through :main, key do |key, values| + new[key] = values if (values.flatten & method).any? + end + when Regexp === method + through :main, key do |key, values| + new[key] = values if values.flatten.select{|v| v =~ method}.any? + end + end + end + + new + end + def index(options = {}) options = Misc.add_defaults options, :order => false if options[:persistence] and ! options[:persistence_file] options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options) @@ -766,11 +465,11 @@ end self.fields = self.fields + new_fields unless nofieldinfo end - #{{{ Helpers + #{{{ Helpers def self.index(file, options = {}) opt_data = options.dup opt_index = options.dup opt_data.delete :field @@ -780,14 +479,14 @@ opt_data[:persistence] = true if options[:data_persistence] opt_index.merge! :persistence_file => get_persistence_file(file, "index:#{ file }_#{options[:field]}:", opt_index) if options[:persistence] if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file]) - TSV.log "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}" + Log.low "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}" TSV.new(PersistenceHash.get(opt_index[:persistence_file], false), opt_index) else - TSV.log "Creating index for #{ file }: #{opt_index[:persistence_file]}" + Log.low "Creating index for #{ file }: #{opt_index[:persistence_file]}" data = TSV.new(file, opt_data) data.index(opt_index) end end @@ -799,6 +498,416 @@ end TSV.new(file, options) end + #{{{ Accesor Methods + + def keys + @data.keys + end + + def values + @data.values + end + + def size + @data.size + end + + # Write + + def []=(key, value) + key = key.downcase if @case_insensitive + @data[key] = value + end + + + def merge!(new_data) + new_data.each do |key, value| + self[key] = value + end + end + + # Read + + def follow(value) + if String === value && value =~ /__Ref:(.*)/ + return self[$1] + else + value = NamedArray.name value, fields if Array === value and fields + value + end + end + + def [](key) + if Array === key + return @data[key] if @data[key] != nil + key.each{|k| v = self[k]; return v unless v.nil?} + return nil + end + + key = key.downcase if @case_insensitive + follow @data[key] + end + + def values_at(*keys) + keys.collect{|k| + self[k] + } + end + + def each(&block) + @data.each do |key, value| + block.call(key, follow(value)) + end + end + + def collect + if block_given? + @data.collect do |key, value| + value = follow(value) + key, values = yield key, value + end + else + @data.collect do |key, value| + [key, follow(value)] + end + end + end + + def sort(&block) + collect.sort(&block).collect{|p| + key, value = p + value = NamedArray.name value, fields if fields + [key, value] + } + end + + def sort_by(&block) + collect.sort_by &block + end + + def to_s + str = "" + + if fields + str << "#" << key_field << "\t" << fields * "\t" << "\n" + end + + each do |key, values| + case + when values.nil? + str << key.dup << "\n" + when (not Array === values) + str << key.dup << "\t" << values.to_s << "\n" + when Array === values.first + str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n" + else + str << key.dup << "\t" << values * "\t" << "\n" + end + end + + str + end + + #{{{ Parsing + + def self.parse_fields(io, delimiter = "\t") + return [] if io.nil? + fields = io.split(delimiter, -1) + fields + end + + def self.zip_fields(list, fields = nil) + return [] if list.nil? || list.empty? + fields ||= list.fields if list.respond_to? :fields + zipped = list[0].zip(*list[1..-1]) + zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields + zipped + end + + def self.parse(data, file, options = {}) + + # Prepare options + options = add_defaults options, + :sep => "\t", + :sep2 => "|", + :native => 0, + :extra => nil, + :fix => nil, + :exclude => nil, + :select => nil, + :grep => nil, + :single => false, + :unique => false, + :flatten => false, + :overwrite => false, + :keep_empty => true, + :case_insensitive => false, + :header_hash => '#' , + :persistence_file => nil + + options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra]) + options[:flatten] = true if options[:single] + + + + #{{{ Process first line + + line = file.gets + raise "Empty content" if line.nil? + line.chomp! + + if line =~ /^#{options[:header_hash]}/ + header_fields = parse_fields(line, options[:sep]) + header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character + line = file.gets + else + header_fields = nil + end + + id_pos = Misc.field_position(header_fields, options[:native]) + + if options[:extra].nil? + extra_pos = nil + max_cols = 0 + else + extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) } + end + + #{{{ Process rest + while line do + line.chomp! + + line = options[:fix].call line if options[:fix] + + # Select and fix lines + if (options[:exclude] and options[:exclude].call(line)) or + (options[:select] and not options[:select].call(line)) + line = file.gets + next + end + + ### Process line + + # Chunk fields + parts = parse_fields(line, options[:sep]) + + # Get next line + line = file.gets + + # Get id field + next if parts[id_pos].nil? || parts[id_pos].empty? + ids = parse_fields(parts[id_pos], options[:sep2]) + ids.collect!{|id| id.downcase } if options[:case_insensitive] + + # Get extra fields + + if options[:extra].nil? and not (options[:flatten] or options[:single]) + extra = parts + extra.delete_at(id_pos) + max_cols = extra.size if extra.size > (max_cols || 0) + else + if extra_pos.nil? + extra = parts + extra.delete_at id_pos + else + extra = parts.values_at(*extra_pos) + end + end + + extra.collect!{|value| parse_fields(value, options[:sep2])} + extra.collect!{|values| values.first} if options[:unique] + extra.flatten! if options[:flatten] + extra = extra.first if options[:single] + + if options[:overwrite] + main_entry = ids.shift + ids.each do |id| + data[id] = "__Ref:#{main_entry}" + end + + data[main_entry] = extra + else + main_entry = ids.shift + ids.each do |id| + data[id] = "__Ref:#{main_entry}" + end + + case + when (options[:single] or options[:unique]) + data[main_entry] ||= extra + when options[:flatten] + if PersistenceHash === data + data[main_entry] = (data[main_entry] || []).concat extra + else + data[main_entry] ||= [] + data[main_entry].concat extra + end + else + entry = data[main_entry] || [] + while entry =~ /__Ref:(.*)/ do + entry = data[$1] + end + + extra.each_with_index do |fields, i| + if fields.empty? + next unless options[:keep_empty] + fields = [""] + end + entry[i] ||= [] + entry[i] = entry[i].concat fields + end + + data[main_entry] = entry + end + end + end + + if options[:keep_empty] and not max_cols.nil? + data.each do |key,values| + new_values = values + max_cols.times do |i| + new_values[i] ||= [""] + end + data[key] = new_values + end + end + + + # Save header information + key_field = nil + fields = nil + if header_fields && header_fields.any? + key_field = header_fields[id_pos] + if extra_pos.nil? + fields = header_fields + fields.delete_at(id_pos) + else + fields = header_fields.values_at(*extra_pos) + end + end + + data.read if PersistenceHash === data + + [key_field, fields] + end + + attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename + def fields + fields = @fields + fields.each do |f| f.extend Field end if Array === fields + fields + end + + def initialize(file = {}, options = {}) + options = Misc.add_defaults options + options[:persistence] = true if options[:persistence_file] + + if String === file && file =~ /(.*?)#(.*)/ + file, file_options = $1, $2 + options = Misc.add_defaults file_options, options + end + + @case_insensitive = options[:case_insensitive] == true + @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true) + + case + when TSV === file + Log.low "Copying TSV" + @filename = file.filename + + if options[:persistence] and not PersistenceHash === file.data + persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options) + Log.low "Making persistance #{ persistence_file }" + @data = TCHash.get(persistence_file) + @data.merge! file + @data.key_field = file.key_field + @data.fields = file.fields + else + @data = file.data + end + + @key_field = file.key_field + @fields = file.fields + @case_insensitive = file.case_insensitive + @list = file.list + return self + when Hash === file + Log.low "Encapsulating Hash" + @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect) + if options[:persistence] + persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options) + Log.low "Making persistance #{ persistence_file }" + @data = TCHash.get(persistence_file) + @data.merge! file + else + @data = file + end + return self + when PersistenceHash === file + Log.low "Encapsulating PersistenceHash" + @filename = "PersistenceHash:" + Digest::MD5.hexdigest(file.inspect) + @data = file + @key_field = file.key_field + @fields = file.fields + return self + when File === file + @filename = File.expand_path file.path + when String === file && File.exists?(file) + @filename = File.expand_path file + file = Open.open(file) + when StringIO + else + raise "File #{file} not found" + end + + if options[:persistence] + options.delete :persistence + persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options) + + if File.exists? persistence_file + Log.low "Loading Persistence for #{ @filename } in #{persistence_file}" + @data = PersistenceHash.get(persistence_file, false) + @key_field = @data.key_field + @fields = @data.fields + else + @data = PersistenceHash.get(persistence_file, true) + file = Open.grep(file, options[:grep]) if options[:grep] + + Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}" + @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file)) + @data.key_field = @key_field + @data.fields = @fields + @data.read + end + else + Log.low "Non-persistent parsing for #{ @filename }" + @data = {} + file = Open.grep(file, options[:grep]) if options[:grep] + @key_field, @fields = TSV.parse(@data, file, options) + end + + file.close + @case_insensitive = options[:case_insensitive] == true + end + +end + +#{{{ CacheHelper +require 'rbbt/util/cachehelper' +module CacheHelper + def self.tsv_cache(name, key = []) + cache_file = CacheHelper.build_filename name, key + + if File.exists? cache_file + Log.debug "TSV cache file '#{cache_file}' found" + hash = TCHash.get(cache_file) + TSV.new(hash) + else + Log.debug "Producing TSV cache file '#{cache_file}'" + data = yield + TSV.new(data, :persistence_file => cache_file) + end + end end