lib/rbbt/util/tsv.rb in rbbt-util-1.2.1 vs lib/rbbt/util/tsv.rb in rbbt-util-2.0.1
- old
+ new
@@ -1,1342 +1,167 @@
require 'rbbt/util/misc'
require 'rbbt/util/open'
+require 'rbbt/util/path'
require 'rbbt/util/tc_hash'
require 'rbbt/util/tmpfile'
require 'rbbt/util/log'
require 'rbbt/util/persistence'
require 'digest'
require 'fileutils'
+require 'rbbt/util/tsv/parse'
+require 'rbbt/util/tsv/accessor'
+require 'rbbt/util/tsv/manipulate'
+require 'rbbt/util/tsv/index'
+require 'rbbt/util/tsv/attach'
class TSV
- class FieldNotFoundError < StandardError;end
- module Field
- def ==(string)
- return false unless String === string
- self.sub(/#.*/,'').casecmp(string.sub(/#.*/,'')) == 0
- end
- end
-
- #{{{ Persistence
-
- CACHEDIR="/tmp/tsv_persistent_cache"
- FileUtils.mkdir CACHEDIR unless File.exist? CACHEDIR
-
- def self.cachedir=(cachedir)
- CACHEDIR.replace cachedir
- FileUtils.mkdir_p CACHEDIR unless File.exist? CACHEDIR
- end
-
- def self.cachedir
- CACHEDIR
- end
-
-
- #{{{ Headers and Field Stuff
-
def self.headers(file, options = {})
- if file =~ /(.*)#(.*)/ and File.exists? $1
- options.merge! Misc.string2hash $2
+
+ ## Remove options from filename
+ if String === file and file =~/(.*?)#(.*)/ and File.exists? $1
+ options = Misc.add_defaults options, Misc.string2hash($2)
file = $1
end
- options = Misc.add_defaults options, :sep => "\t", :header_hash => "#"
- io = Open.open(file)
- line = io.gets
- io.close
+ fields = case
+ when Open.can_open?(file)
+ Open.open(file, :grep => options[:grep]) do |f| TSV.parse_header(f, options[:sep], options[:header_hash]).values_at(0, 1).flatten end
+ when File === file
+ file = Open.grep(file, options[:grep]) if options[:grep]
+ TSV.parse_header(file, options[:sep], options[:header_hash]).values_at(0, 1).flatten
+ else
+ raise "File #{file.inspect} not found"
+ end
- if line =~ /^#{options[:header_hash]}/
- line.chomp.sub(/^#{options[:header_hash]}/,'').split(options[:sep])
- else
+ if fields.compact.empty?
nil
- end
- end
-
- def self.fields_include(key_field, fields, field)
- return true if key_field == field or fields.include? field
- return false
- end
-
- def self.field_positions(key_field, fields, *selected)
- selected.collect do |sel|
- case
- when (sel.nil? or sel == :main or sel == key_field)
- -1
- when Integer === sel
- sel
- else
- Misc.field_position fields, sel
- end
- end
- end
-
- def fields_include(field)
- return TSV.fields_include key_field, fields, field
- end
-
- def field_positions(*selected)
- return nil if selected.nil? or selected == [nil]
- TSV.field_positions(key_field, fields, *selected)
- end
-
- def fields_at(*positions)
- return nil if fields.nil?
- return nil if positions.nil? or positions == [nil]
- (fields + [key_field]).values_at(*positions)
- end
-
- #{{{ Iteration, Merging, etc
- def through(new_key_field = nil, new_fields = nil, &block)
- new_key_position = (field_positions(new_key_field) || [-1]).first
- new_fields = [new_fields] if String === new_fields
-
- if new_key_position == -1
-
- if new_fields.nil? or new_fields == fields
- each &block
- return [key_field, fields]
- else
- new_field_positions = field_positions(*new_fields)
- each do |key, values|
- if values.nil?
- yield key, nil
- else
- yield key, values.values_at(*new_field_positions)
- end
- end
- return [key_field, fields_at(*new_field_positions)]
- end
-
else
- new_field_positions = field_positions(*new_fields)
-
- new_field_names = fields_at(*new_field_positions)
- if new_field_names.nil? and fields
- new_field_names = fields.dup
- new_field_names.delete_at new_key_position
- new_field_names.unshift key_field
- end
-
- each do |key, values|
- if type == :double
- tmp_values = values + [[key]]
- else
- tmp_values = values + [key]
- end
-
- if new_field_positions.nil?
- new_values = values.dup
- new_values.delete_at new_key_position
- new_values.unshift [key]
- else
- new_values = tmp_values.values_at(*new_field_positions)
- end
-
- if not Array === tmp_values[new_key_position]
- yield tmp_values[new_key_position], NamedArray.name(new_values, new_field_names)
- else
- tmp_values[new_key_position].each do |new_key|
- if new_field_names
- yield new_key, NamedArray.name(new_values, new_field_names)
- else
- yield new_key, new_values
- end
- end
- end
- end
- return [(fields_at(new_key_position) || [nil]).first, new_field_names]
+ fields
end
end
-
- def process(field)
- through do |key, values|
- values[field].replace yield(values[field], key, values) unless values[field].nil?
- end
- end
-
- def reorder(new_key_field, new_fields = nil, options = {})
- options = Misc.add_defaults options
- return TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
-
- new = {}
- new_key_field, new_fields = through new_key_field, new_fields do |key, values|
- if new[key].nil?
- new[key] = values
- else
- new[key] = new[key].zip(values)
- end
- end
-
- new.each do |key,values|
- values.each{|list| list.flatten! if Array === list}
- end
-
- if options[:persistence_file]
- reordered = TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
- reordered.merge! new
- else
- reordered = TSV.new(new, :case_insensitive => case_insensitive)
- end
-
- reordered.key_field = new_key_field
- reordered.fields = new_fields
-
- reordered
+ def self.encapsulate_persistence(file, options)
end
- def slice(new_fields, options = {})
- reorder(:main, new_fields)
- end
-
- def add_field(name = nil)
- each do |key, values|
- self[key] = values + [yield(key, values)]
- end
-
- if fields != nil
- new_fields = fields + [name]
- self.fields = new_fields
- end
- end
-
- def select(method)
- new = TSV.new({})
- new.key_field = key_field
- new.fields = fields.dup
- new.type = type
- new.filename = filename + "#Select: #{method.inspect}"
- new.case_insensitive = case_insensitive
+ def initialize(file = {}, type = nil, options = {})
+ # Process Options
- case
- when Array === method
- through do |key, values|
- new[key] = values if ([key,values].flatten & method).any?
- end
- when Regexp === method
- through do |key, values|
- new[key] = values if [key,values].flatten.select{|v| v =~ method}.any?
- end
- when String === method
- through do |key, values|
- new[key] = values if [key,values].flatten.select{|v| v == method}.any?
- end
- when Hash === method
- key = method.keys.first
- method = method.values.first
- case
- when (Array === method and (:main == key or key_field == key))
- method.each{|item| if values = self[item]; then new[item] = values; end}
- when Array === method
- through :main, key do |key, values|
- new[key] = self[key] if (values.flatten & method).any?
- end
- when Regexp === method
- through :main, key do |key, values|
- new[key] = self[key] if values.flatten.select{|v| v =~ method}.any?
- end
- when String === method
- through :main, key do |key, values|
- new[key] = self[key] if values.flatten.select{|v| v == method}.any?
- end
- end
+ if Hash === type
+ options = type
+ type = nil
end
-
- new
- end
-
- def index(options = {})
- options = Misc.add_defaults options, :order => false, :persistence => false
-
- new, extra = Persistence.persist(filename, :Index, :tsv, options) do |filename, options|
- new = {}
- if options[:order]
- new_key_field, new_fields = through options[:target], options[:others] do |key, values|
-
- values.each_with_index do |list, i|
- next if list.nil? or list.empty?
-
- list = [list] unless Array === list
-
- list.each do |value|
- next if value.nil? or value.empty?
- value = value.downcase if options[:case_insensitive]
- new[value] ||= []
- new[value][i + 1] ||= []
- new[value][i + 1] << key
- end
- new[key] ||= []
- new[key][0] = key
- end
-
- end
-
- new.each do |key, values|
- values.flatten!
- values.compact!
- end
-
- else
- new_key_field, new_fields = through options[:target], options[:others] do |key, values|
- new[key] ||= []
- new[key] << key
- values.each do |list|
- next if list.nil?
- if Array === list
- list.each do |value|
- value = value.downcase if options[:case_insensitive]
- new[value] ||= []
- new[value] << key
- end
- else
- next if list.empty?
- value = list
- value = value.downcase if options[:case_insensitive]
- new[value] ||= []
- new[value] << key
- end
- end
- end
- end
-
- [new, {:key_field => new_key_field, :fields => new_fields, :type => :double, :case_insensitive => options[:case_insensitive]}]
+ ## Remove options from filename
+ if String === file and file =~/(.*?)#(.*)/ and File.exists? $1
+ options = Misc.add_defaults options, Misc.string2hash($2)
+ file = $1
end
- new = TSV.new(new)
- new.filename = "Index: " + filename + options.inspect
- new.fields = extra[:fields]
- new.key_field = extra[:key_field]
- new.case_insensitive = extra[:case_insensitive]
- new.type = extra[:type]
- new
- end
+ options = Misc.add_defaults options, :persistence => false, :type => type
- def smart_merge(other, match = nil, new_fields = nil)
+ # Extract Filename
- new_fields = [new_fields] if String === new_fields
- if self.fields and other.fields
- common_fields = ([self.key_field] + self.fields) & ([other.key_field] + other.fields)
- new_fields ||= ([other.key_field] + other.fields) - ([self.key_field] + self.fields)
+ file, extra = file if Array === file and file.length == 2 and Hash === file.last
- common_fields.delete match if String === match
- common_fields.delete_at match if Integer === match
+ @filename = Misc.process_options options, :filename
+ @filename ||= case
+ when Path === file
+ file
+ when (String === file and File.exists? file)
+ File.expand_path file
+ when String === file
+ file
+ when File === file
+ File.expand_path file.path
+ when TSV === file
+ File.expand_path file.filename
+ when (Persistence::TSV === file and file.filename)
+ File.expand_path file.filename
+ else
+ file.class.to_s
+ end
- this_common_field_positions = self.field_positions *common_fields
- other_common_field_positions = other.field_positions *common_fields
- other_new_field_positions = other.field_positions *new_fields
- else
- nofieldinfo = true
- end
-
+ # Process With Persistence
+ # Use filename to identify the persistence
+ # Several inputs supported
+ # Filename or File: Parsed
+ # Hash: Encapsulated, empty info
+ # TSV: Duplicate
case
- when TSV === match
- match_index = match
- matching_code_position = nil
-
- when Array === match
- match_index = match.first
- matching_code_position = field_positions(match.last).first
-
- when match =~ /^through:(.*)/
- through = $1
- if through =~ /(.*)#using:(.*)/
- through = $1
- matching_code_position = field_positions($2).first
- else
- matching_code_position = nil
- end
- index_fields = TSV.headers(through)
- target_field = index_fields.select{|field| other.fields_include field}.first
- Log.debug "Target Field: #{ target_field }"
- match_index = TSV.open_file(through).index(:field => target_field)
-
- when field_positions(match).first
- matching_code_position = field_positions(match).first
- match_index = nil
- end
-
- if matching_code_position.nil? and match_index.fields
- match_index.fields.each do |field|
- if matching_code_position = field_positions(field).first
- break
- end
- end
- end
-
- if match_index and match_index.key_field == other.key_field
- other_index = nil
+ when block_given?
+ @data, extra = Persistence.persist(@filename, :TSV, :tsv_extra, options.merge(:force_array => true)) do |filename, options| yield filename, options end
+ extra.each do |key, values|
+ self.send("#{ key }=".to_sym, values) if self.respond_to? "#{ key }=".to_sym
+ end if not extra.nil?
+
else
- other_index = (match === String and other.fields_include(match)) ?
- other.index(:other => match, :order => true) : other.index(:order => true)
- end
-
- each do |key,values|
- Log.debug "Key: #{ key }. Values: #{values * ", "}"
- if matching_code_position.nil? or matching_code_position == -1
- matching_codes = [key]
- else
- matching_codes = values[matching_code_position]
- matching_codes = [matching_codes] unless matching_codes.nil? or Array === matching_codes
- end
- Log.debug "Matching codes: #{matching_codes}"
-
- next if matching_codes.nil?
-
- matching_codes.each do |matching_code|
- if match_index
- if match_index[matching_code]
- matching_code_fix = match_index[matching_code].first
- else
- matching_code_fix = nil
- end
- else
- matching_code_fix = matching_code
- end
-
- Log.debug "Matching code (fix): #{matching_code_fix}"
- next if matching_code_fix.nil?
-
- if other_index
- Log.debug "Using other_index"
- other_codes = other_index[matching_code_fix]
- else
- other_codes = matching_code_fix
- end
- Log.debug "Other codes: #{other_codes}"
-
- next if other_codes.nil? or other_codes.empty?
- other_code = other_codes.first
-
- if nofieldinfo
- next if other[other_code].nil?
- if type == :double
- other_values = [[other_code]] + other[other_code]
- else
- other_values = [other_code] + other[other_code]
- end
- other_values.delete_if do |list|
- list = [list] unless Array === list
- list.collect{|e| case_insensitive ? e.downcase : e }.
- select{|e| case_insensitive ? e == matching_code.downcase : e == matching_code }.any?
- end
-
- new_values = values + other_values
- else
- if other[other_code].nil?
- if type == :double
- other_values = [[]] * other.fields.length
- else
- other_values = [] * other.fields.length
- end
- else
- if type == :double
- other_values = other[other_code] + [[other_code]]
- else
- other_values = other[other_code] + [other_code]
- end
- end
-
-
- new_values = values.dup
-
- if type == :double
- this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
- new_values_tops = new_values[tpos]
-
- if other.type == :double
- new_values_tops += other_values[opos]
- else
- new_values_tops += [other_values[opos]]
- end
-
- new_values[tpos] = new_values_tops.uniq
- end
- end
-
- new_values.concat other_values.values_at *other_new_field_positions
- end
-
- self[key] = new_values
- end
- end
-
- self.fields = self.fields + new_fields unless nofieldinfo
- end
-
-
- def self.field_matches(tsv, values)
- if values.flatten.sort[0..9].compact.collect{|n| n.to_i} == (1..10).to_a
- return {}
- end
-
- key_field = tsv.key_field
- fields = tsv.fields
-
- field_values = {}
- fields.each{|field|
- field_values[field] = []
- }
-
- tsv.through do |key,entry_values|
- fields.zip(entry_values).each do |field,entry_field_values|
- field_values[field].concat entry_field_values
- end
- end
-
- field_values.each do |field,field_value_list|
- field_value_list.replace(values & field_value_list.flatten.uniq)
- end
-
- field_values[key_field] = values & tsv.keys
-
- field_values
- end
-
- def field_matches(values)
- TSV.field_matches(self, values)
- end
-
-
-
- #{{{ Helpers
-
- def self.index(file, options = {})
- options = Misc.add_defaults options, :data_persistence => true, :persistence => true
- persistence, persistence_file = Misc.process_options options, :persistence, :persistence_file
- options[:persistence], options[:persistence_file] = options.values_at :data_persistence, :data_persistence_file
- options.delete :data_persistence
- options.delete :data_persistence_file
-
- index, extra = Persistence.persist(file, :Index, :tsv, options) do |file, options, filename|
- TSV.new(file, :double, options).index
- end
- index
- end
-
- def self.index2(file, options = {})
- opt_data = options.dup
- opt_index = options.dup
- opt_data.delete :field
- opt_data.delete :persistence
- opt_index.delete :persistence
-
- opt_data[:persistence] = true if options[:data_persistence]
-
- opt_index.merge! :persistence_file => get_persistence_file(file, "index:#{ file }_#{options[:field]}:", opt_index) if options[:persistence]
-
- if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file])
- Log.low "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}"
- TSV.new(Persistence::TSV.get(opt_index[:persistence_file], false), opt_index)
- else
- Log.low "Creating index for #{ file }: #{opt_index[:persistence_file]}"
- data = TSV.new(file, opt_data)
- data.index(opt_index)
- end
- end
-
- def self.open_file(file)
- if file =~ /(.*?)#(.*)/
- file, options = $1, Misc.string2hash($2.to_s)
- else
- options = {}
- end
-
- TSV.new(file, options)
- end
-
- #{{{ Accesor Methods
- attr_accessor :filename, :type, :case_insensitive, :key_field, :fields, :data
-
- def fields
- return nil if @fields.nil?
- fields = @fields
- fields.each do |f| f.extend Field end if Array === fields
- fields
- end
-
- def fields=(new_fields)
- @fields = new_fields
- if Persistence::TSV === @data
- @data.fields = new_fields
- end
- end
-
-
-
- def keys
- @data.keys
- end
-
- def values
- @data.values
- end
-
- def size
- @data.size
- end
-
- # Write
-
- def []=(key, value)
- key = key.downcase if @case_insensitive
- @data[key] = value
- end
-
-
- def merge!(new_data)
- new_data.each do |key, value|
- self[key] = value
- end
- end
-
- # Read
-
- def follow(value)
- return nil if value.nil?
- if String === value && value =~ /__Ref:(.*)/
- return self[$1]
- else
- value = NamedArray.name value, fields if Array === value and fields
- value
- end
- end
-
- def [](key)
- if Array === key
- return @data[key] if @data[key] != nil
- key.each{|k| v = self[k]; return v unless v.nil?}
- return nil
- end
-
- key = key.downcase if @case_insensitive and key !~ /^__Ref:/
- follow @data[key]
- end
-
- def values_at(*keys)
- keys.collect{|k|
- self[k]
- }
- end
-
- def each(&block)
- @data.each do |key, value|
- block.call(key, follow(value))
- end
- end
-
- def collect
- if block_given?
- @data.collect do |key, value|
- value = follow(value)
- key, values = yield key, value
- end
- else
- @data.collect do |key, value|
- [key, follow(value)]
- end
- end
- end
-
- def sort(&block)
- collect.sort(&block).collect{|p|
- key, value = p
- value = NamedArray.name value, fields if fields
- [key, value]
- }
- end
-
- def sort_by(&block)
- collect.sort_by &block
- end
-
- def values_to_s(values)
case
- when (values.nil? and fields.nil?)
- "\n"
- when (values.nil? and not fields.nil?)
- "\t" << ([""] * fields.length) * "\t" << "\n"
- when (not Array === values)
- "\t" << values.to_s << "\n"
- when Array === values.first
- "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
- else
- "\t" << values * "\t" << "\n"
- end
- end
-
- def to_s(keys = nil)
- str = ""
-
- if fields
- str << "#" << key_field << "\t" << fields * "\t" << "\n"
- end
-
- if keys.nil?
- each do |key, values|
- key = key.to_s if Symbol === key
- str << key.dup << values_to_s(values)
- end
- else
- keys.zip(values_at(*keys)).each do |key, values|
- key = key.to_s if Symbol === key
- str << key.dup << values_to_s(values)
- end
- end
-
- str
- end
-
- #{{{ Parsing
-
- def self.parse_fields(io, delimiter = "\t")
- return [] if io.nil?
- fields = io.split(delimiter, -1)
- fields
- end
-
- def self.zip_fields(list, fields = nil)
- return [] if list.nil? || list.empty?
- fields ||= list.fields if list.respond_to? :fields
- zipped = list[0].zip(*list[1..-1])
- zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
- zipped
- end
-
- def self.key_order(file, options = {})
- # Prepare options
- options = add_defaults options,
- :sep => "\t",
- :sep2 => "|",
- :native => 0,
- :fix => nil,
- :exclude => nil,
- :select => nil,
- :grep => nil,
- :case_insensitive => false,
- :header_hash => '#'
-
- options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
-
- if String === file and File.exists? file
- file = File.open(file)
- end
-
- #{{{ Process first line
-
- line = file.gets
- raise "Empty content" if line.nil?
- line.chomp!
-
- if line =~ /^#{options[:header_hash]}/
- header_fields = parse_fields(line, options[:sep])
- header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
- line = file.gets
- else
- header_fields = nil
- end
-
- id_pos = Misc.field_position(header_fields, options[:native])
-
- if options[:extra].nil?
- extra_pos = nil
- max_cols = 0
- else
- extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
- end
-
- ids = []
- #{{{ Process rest
- while line do
- line.chomp!
-
- line = options[:fix].call line if options[:fix]
- break if not line
-
- # Select and fix lines
- if line.empty? or
- (options[:exclude] and options[:exclude].call(line)) or
- (options[:select] and not options[:select].call(line))
-
- line = file.gets
- next
- end
-
- ### Process line
-
- # Chunk fields
- parts = parse_fields(line, options[:sep])
-
- # Get next line
- line = file.gets
-
- # Get id field
- next if parts[id_pos].nil? || parts[id_pos].empty?
- ids << parts[id_pos]
- end
-
- ids
- end
-
- def self.parse_header(stream, sep, header_hash)
- fields, key_field = nil
- options = {}
-
- line = stream.gets
-
- if line and line =~ /^#{header_hash}: (.*)/
- options = Misc.string2hash $1
- line = stream.gets
- end
-
- sep = options[:sep] if options[:sep]
-
- if line and line =~ /^#{header_hash}/
- line.chomp!
- fields = parse_fields(line, sep)
- key_field = fields.shift
- key_field = key_field[(0 + header_hash.length)..-1] # Remove initial hash character
- line = stream.gets
- end
-
- raise "Empty content" if line.nil?
- return key_field, fields, options, line
- end
-
- def self.parse(stream, options = {})
- # Prepare options
- options = Misc.add_defaults options,
- :case_insensitive => false,
- :type => :double,
-
- :merge => false,
- :keep_empty => true,
- :cast => nil,
-
- :sep => "\t",
- :sep2 => "|",
- :header_hash => '#',
-
- :key => 0,
- :fields => nil,
-
- :fix => nil,
- :exclude => nil,
- :select => nil,
- :grep => nil
-
-
- sep, header_hash =
- Misc.process_options options, :sep, :header_hash
-
- key_field, other_fields, more_options, line = TSV.parse_header(stream, sep, header_hash)
-
- sep = more_options[:sep] if more_options[:sep]
- options = Misc.add_defaults options, more_options
- sep2 = Misc.process_options options, :sep2
-
- key, others =
- Misc.process_options options, :key, :others
-
- if key_field.nil?
- key_pos = key
- key_field, fields = nil
- else
- all_fields = [key_field].concat other_fields
-
- key_pos = Misc.field_position(all_fields, key)
-
- if String === others or Symbol === others
- others = [others]
- end
-
- if others.nil?
- other_pos = (0..(all_fields.length - 1)).to_a
- other_pos.delete key_pos
- else
- other_pos = Misc.field_position(all_fields, *others)
- end
-
- key_field = all_fields[key_pos]
- fields = all_fields.values_at *other_pos
- end
-
- case_insensitive, type, merge, keep_empty, cast =
- Misc.process_options options, :case_insensitive, :type, :merge, :keep_empty, :cast
- fix, exclude, select, grep =
- Misc.process_options options, :fix, :exclude, :select, :grep
-
- #{{{ Process rest
- data = {}
- single = type.to_sym != :double
- max_cols = 0
- while line do
- line.chomp!
-
- line = fix.call line if fix
- break if not line
-
- if header_hash and line =~ /^#{header_hash}/
- line = stream.gets
- next
- end
-
- if line.empty? or
- (exclude and exclude.call(line)) or
- (select and not select.call(line))
-
- line = stream.gets
- next
- end
-
- # Chunk fields
- parts = parse_fields(line, sep)
-
- # Get next line
- line = stream.gets
-
- # Get id field
- next if parts[key_pos].nil? || parts[key_pos].empty?
-
- if single
- ids = parse_fields(parts[key_pos], sep2)
- ids.collect!{|id| id.downcase} if case_insensitive
-
- id = ids.shift
- ids.each do |id2| data[id2] = "__Ref:#{id}" end
-
- if key_field.nil?
- other_pos = (0..(parts.length - 1)).to_a
- other_pos.delete key_pos
- end
-
- extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2).first}
- extra.collect! do |elem|
- case
- when String === cast
- elem.send(cast)
- when Proc === cast
- cast.call elem
+ when Hash === file
+ @data = file
+ when TSV === file
+ @data = file.data
+ when Persistence::TSV === file
+ @data = file
+ %w(case_insensitive namespace datadir fields key_field type filename cast).each do |key|
+ if @data.respond_to?(key.to_sym) and self.respond_to?("#{key}=".to_sym)
+ self.send "#{key}=".to_sym, @data.send(key.to_sym)
end
- end if cast
-
- max_cols = extra.size if extra.size > (max_cols || 0)
- case type
- when :list
- data[id] = extra unless data.include? id
- when :flat
- data[id] = extra.flatten unless data.include? id
- when :single
- data[id] = extra.flatten.first unless data.include? id
end
-
else
- ids = parse_fields(parts[key_pos], sep2)
- ids.collect!{|id| id.downcase} if case_insensitive
+ @data, extra = Persistence.persist(@filename, :TSV, :tsv_extra, options) do |file, options, filename|
+ data, extra = nil
- id = ids.shift
- ids.each do |id2| data[id2] = "__Ref:#{id}" end
-
- if key_field.nil?
- other_pos = (0..(parts.length - 1)).to_a
- other_pos.delete key_pos
- end
-
- extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2)}
- extra.collect! do |list|
case
- when String === cast
- list.collect{|elem| elem.send(cast)}
- when Proc === cast
- list.collect{|elem| cast.call elem}
- end
- end if cast
-
- max_cols = extra.size if extra.size > (max_cols || 0)
- if merge
- data[id] = extra unless data.include? id
- else
- if not data.include? id
- data[id] = extra
- else
- entry = data[id]
- while entry =~ /__Ref:(.*)/ do entry = data[$1] end
- extra.each_with_index do |f, i|
- if f.empty?
- next unless keep_empty
- f= [""]
- end
- entry[i] ||= []
- entry[i] = entry[i].concat f
+ ## Parse source
+ when (String === file and file.respond_to? :open)
+ data, extra = TSV.parse(file.open(:grep => options[:grep]) , options)
+ extra[:namespace] ||= file.namespace
+ extra[:datadir] ||= file.datadir
+ when StringIO === file
+ data, extra = TSV.parse(file, options)
+ when Open.can_open?(file)
+ Open.open(file, :grep => options[:grep]) do |f|
+ data, extra = TSV.parse(f, options)
end
- data[id] = entry
- end
- end
- end
- end
-
- if keep_empty and max_cols > 0
- data.each do |key, values|
- next if values =~ /__Ref:/
- new_values = values
- max_cols.times do |i|
- if type == :double
- new_values[i] = [""] if new_values[i].nil? or new_values[i].empty?
+ #extra[:namespace] = File.basename(File.dirname(filename))
+ #extra.delete :namespace if extra[:namespace].empty? or extra[:namespace] == "."
+ when File === file
+ file = Open.grep(file, options[:grep]) if options[:grep]
+ data, extra = TSV.parse(file, options)
+ extra[:namespace] = File.basename(File.dirname(file.filename))
+ extra.delete :namespace if extra[:namespace].empty? or extra[:namespace] == "."
+ ## Encapsulate Hash or TSV
+ when block_given?
+ data
else
- new_values[i] = "" if new_values[i].nil?
+ raise "Unknown input in TSV.new #{file.inspect}"
end
- end
- data[key] = new_values
- end
- end
- [data, {:key_field => key_field, :fields => fields, :type => type, :case_insensitive => case_insensitive}]
- end
-
- def self.parse2(data, file, options = {})
+ extra[:filename] = filename
- # Prepare options
- options = Misc.add_defaults options,
- :sep => "\t",
- :sep2 => "|",
- :native => 0,
- :extra => nil,
- :fix => nil,
- :exclude => nil,
- :select => nil,
- :grep => nil,
- :single => false,
- :unique => false,
- :merge => false,
- :flatten => false,
- :keep_empty => true,
- :case_insensitive => false,
- :header_hash => '#' ,
- :cast => nil,
- :persistence_file => nil
-
-
- options[:unique] = options[:uniq] if options[:unique].nil?
- options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
- options[:flatten] = true if options[:single]
-
- #{{{ Process first line
-
- line = file.gets
- raise "Empty content" if line.nil?
- line.chomp!
-
- if line =~ /^#{options[:header_hash]}/
- header_fields = parse_fields(line, options[:sep])
- header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
- line = file.gets
- else
- header_fields = nil
- end
-
- id_pos = Misc.field_position(header_fields, options[:native])
-
- if options[:extra].nil?
- extra_pos = nil
- max_cols = 0
- else
- extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
- end
-
- #{{{ Process rest
- while line do
- line.chomp!
-
- line = options[:fix].call line if options[:fix]
- break if not line
-
- if options[:header_hash] && line =~ /^#{options[:header_hash]}/
- line = file.gets
- next
- end
-
- # Select and fix lines
- if line.empty? or
- (options[:exclude] and options[:exclude].call(line)) or
- (options[:select] and not options[:select].call(line))
-
- line = file.gets
- next
- end
-
- ### Process line
-
- # Chunk fields
- parts = parse_fields(line, options[:sep])
-
- # Get next line
- line = file.gets
-
- # Get id field
- next if parts[id_pos].nil? || parts[id_pos].empty?
- ids = parse_fields(parts[id_pos], options[:sep2])
- ids.collect!{|id| id.downcase } if options[:case_insensitive]
-
- # Get extra fields
-
- if options[:extra].nil? and not (options[:flatten] or options[:single])
- extra = parts
- extra.delete_at(id_pos)
- max_cols = extra.size if extra.size > (max_cols || 0)
- else
- if extra_pos.nil?
- extra = parts
- extra.delete_at id_pos
- else
- extra = parts.values_at(*extra_pos)
+ [data, extra]
end
end
-
- extra.collect!{|value| parse_fields(value, options[:sep2])}
- extra.collect!{|values| values.first} if options[:unique]
- extra.flatten! if options[:flatten]
- extra = extra.first if options[:single]
-
- if options[:cast]
- if Array === extra[0]
- e = extra
- else
- e = [extra]
- end
-
- e.each do |list|
- case
- when String === options[:cast]
- list.collect!{|elem| elem.send(options[:cast])}
- when Proc === options[:cast]
- list.collect!{|elem| options[:cast].call elem}
- end
- end
- end
-
- main_entry = ids.shift
- ids.each do |id| data[id] = "__Ref:#{main_entry}" end
-
- case
- when (options[:single] or options[:unique] or not options[:merge])
- data[main_entry] = extra unless data.include? main_entry
- when options[:flatten]
- entry = data[main_entry]
-
- if entry.nil?
- data[main_entry] = extra
- else
- while entry =~ /__Ref:(.*)/ do entry = data[$1] end
- if Persistence::TSV === data
- data[main_entry] = entry.concat extra
- else
- data[main_entry].concat extra
- end
- end
- else
- entry = data[main_entry]
- if entry.nil?
- data[main_entry] = extra
- else
- while entry =~ /__Ref:(.*)/ do entry = data[$1] end
- extra.each_with_index do |fields, i|
- if fields.empty?
- next unless options[:keep_empty]
- fields = [""]
- end
- entry[i] ||= []
- entry[i] = entry[i].concat fields
- end
- data[main_entry] = entry
- end
- end
end
- if options[:keep_empty] and not max_cols.nil?
- data.each do |key,values|
- new_values = values
- max_cols.times do |i|
- new_values[i] ||= [""]
- end
- data[key] = new_values
- end
- end
-
- # Save header information
- key_field = nil
- fields = nil
- if header_fields && header_fields.any?
- key_field = header_fields[id_pos]
- if extra_pos.nil?
- fields = header_fields
- fields.delete_at(id_pos)
- else
- fields = header_fields.values_at(*extra_pos)
- end
- end
-
- data.read if Persistence::TSV === data
-
- [key_field, fields]
- end
- def initialize(file = {}, type = :double, options = {})
- if Hash === type
- options = type
- type = :double
- end
-
- if String === file and file =~/(.*?)#(.*)/ and File.exists? $1
- options = Misc.add_defaults options, Misc.string2hash($2)
- file = $1
- end
-
- options = Misc.add_defaults options, :persistence => false, :case_insensitive => false, :type => type
-
- @filename = Misc.process_options options, :filename
- @filename ||= case
- when (String === file and File.exists? file)
- File.expand_path file
- when File === file
- File.expand_path file.path
- else
- Digest::MD5.hexdigest(file.inspect)
- end
-
- if block_given?
- @data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options| yield filename, options end
- else
- @data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options|
- data, extra = nil
- case
- when String === file
- File.open(file) do |f|
- data, extra = TSV.parse(f, options)
+ if not extra.nil?
+ %w(case_insensitive namespace datadir fields key_field type filename cast).each do |key|
+ if extra.include? key.to_sym
+ self.send("#{key}=".to_sym, extra[key.to_sym])
+ if @data.respond_to? "#{key}=".to_sym
+ @data.send("#{key}=".to_sym, extra[key.to_sym])
end
- when File === file
- data, extra = TSV.parse(file, options)
- when Hash === file
- data = file
- extra = {:case_insensitive => options[:case_insensitive], :type => type}
end
-
- [data, extra]
- end
+ end
end
-
- @type = extra[:type]
- @key_field = extra[:key_field]
- @fields = extra[:fields]
- @case_insensitive = extra[:case_insensitive]
end
- def initialize2(file = {}, options = {})
- options = Misc.add_defaults options
- options[:persistence] = true if options[:persistence_file]
-
- if String === file && file =~ /(.*?)#(.*)/
- file, file_options = $1, $2
- options = Misc.add_defaults file_options, options
- end
-
- @case_insensitive = options[:case_insensitive] == true
- @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true)
-
- case
- when TSV === file
- Log.low "Copying TSV"
- @filename = file.filename
-
- if options[:persistence] and not Persistence::TSV === file.data
- persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
- Log.low "Making persistance #{ persistence_file }"
- @data = TCHash.get(persistence_file)
- @data.merge! file
- @data.key_field = file.key_field
- @data.fields = file.fields
- else
- @data = file.data
- end
-
- @key_field = file.key_field
- @fields = file.fields
- @case_insensitive = file.case_insensitive
- @list = file.list
- return self
- when Hash === file
- Log.low "Encapsulating Hash in TSV object"
- @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
- if options[:persistence]
- persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
- Log.low "Making persistance #{ persistence_file }"
- @data = TCHash.get(persistence_file)
- @data.merge! file
- else
- @data = file
- end
- return self
- when Persistence::TSV === file
- Log.low "Encapsulating Persistence::TSV"
- @filename = "Persistence::TSV:" + Digest::MD5.hexdigest(file.inspect)
- @data = file
- @key_field = file.key_field
- @fields = file.fields
- return self
- when File === file
- @filename = File.expand_path file.path
- when String === file && File.exists?(file)
- @filename = File.expand_path file
- file = Open.open(file)
- when StringIO
- else
- raise "File #{file} not found"
- end
-
- if options[:persistence]
- options.delete :persistence
- persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
-
- if File.exists? persistence_file
- Log.low "Loading Persistence for #{ @filename } in #{persistence_file}"
- @data = Persistence::TSV.get(persistence_file, false)
- @key_field = @data.key_field
- @fields = @data.fields
- else
- @data = Persistence::TSV.get(persistence_file, true)
- file = Open.grep(file, options[:grep]) if options[:grep]
-
- Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}"
- @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file))
- @data.key_field = @key_field
- @data.fields = @fields
- @data.read
- end
- else
- Log.low "Non-persistent parsing for #{ @filename }"
- @data = {}
- file = Open.grep(file, options[:grep]) if options[:grep]
- @key_field, @fields = TSV.parse(@data, file, options)
- end
-
- file.close
- @case_insensitive = options[:case_insensitive] == true
+ def write
+ @data.write if @data.respond_to? :write
end
-end
-
-#{{{ CacheHelper
-require 'rbbt/util/cachehelper'
-module CacheHelper
- def self.tsv_cache(name, key = [])
- cache_file = CacheHelper.build_filename name, key
-
- if File.exists? cache_file
- Log.debug "TSV cache file '#{cache_file}' found"
- hash = TCHash.get(cache_file)
- TSV.new(hash)
- else
- Log.debug "Producing TSV cache file '#{cache_file}'"
- data = yield
- TSV.new(data, :persistence_file => cache_file)
- end
+ def read
+ @data.read if @data.respond_to? :read
end
+
end