lib/ctioga2/data/backends/backends/text.rb in ctioga2-0.0 vs lib/ctioga2/data/backends/backends/text.rb in ctioga2-0.1

- old
+ new

@@ -23,11 +23,11 @@
 # For separated sets
 require 'stringio'
 module CTioga2
-  Version::register_svn_info('$Revision: 17 $', '$Date: 2009-04-28 22:22:22 +0200 (Tue, 28 Apr 2009) $')
+  Version::register_svn_info('$Revision: 191 $', '$Date: 2010-11-07 15:53:08 +0100 (Sun, 07 Nov 2010) $')
   module Data
     # A module for easy use of NaN in operations
@@ -47,10 +47,11 @@
 UNCOMPRESSORS = {
   ".gz" => "gunzip -c %s",
   ".bz2" => "bunzip2 -c %s",
   ".lzma" => "unlzma -c %s",
   ".lz" => "unlzma -c %s",
+  ".xz" => "unxz -c %s",
 }
 include Dobjects
 describe 'text', 'Text format', <<EOD
@@ -74,13 +75,18 @@
 param_accessor :separator, 'separator', "Data columns separator",
   'regexp',
   "The columns separator. Defaults to /\s+/"
+param_accessor :param_regex, 'parameters', "Parameters parsing",
+  'regexp',
+  "Regular expression for extracting parameters from a file. Defaults to nil (ie nothing)"
-# param_accessor :select, 'select', "Select lines", {:type => :string},
-# "Skips line where the code returns false"
+param_accessor :header_line_regex, 'header-line',
+  'Header line regular expression',
+  'regexp',
+  "Regular expression indicating the header line (containing column names) (default /^##/"
 def initialize
   @dummy = nil
   @current = nil
   # Current is the name of the last file used. Necessary for '' specs.
@@ -93,15 +99,24 @@
   @separator = /\s+/
   # We don't split data by default.
   @split = false
+  @param_regex = nil
+
+  @header_line_regex = /^\#\#\s*/
+
   super()
   # Override Backend's cache - for now.
   @cache = {} # A cache file_name -> data
+  @param_cache = {} # Same thing as cache, but for parameters
+
+  @headers_cache = {} # Same thing as cache, but for header
+  # lines.
+
 end
 def extend(mod)
   super
   @included_modules << mod
@@ -145,24 +160,24 @@
     return IO.popen($1)
   elsif not File.readable?(file)
     # Try to find a compressed version
     for ext,method in UNCOMPRESSORS
       if File.readable? "#{file}#{ext}"
-        info "Using compressed file #{file}#{ext} in stead of #{file}"
+        info { "Using compressed file #{file}#{ext} in stead of #{file}" }
         return IO.popen(method % "#{file}#{ext}")
       end
     end
   else
     for ext, method in UNCOMPRESSORS
       if file =~ /#{ext}$/
-        info "Taking file #{file} as a compressed file"
+        info { "Taking file #{file} as a compressed file" }
         return IO.popen(method % file)
       end
     end
     return File::open(file)
   end
-  error "Could not open #{file}"
+  error { "Could not open #{file}" }
   return nil
 end
 # A line is invalid if it is blank or starts
 # neither with a digit nor +, - or .
@@ -180,15 +195,15 @@
   str = ""
   line_number = 0
   while line = io.gets
     line_number += 1
     if line =~ InvalidLineRE
-      debug "Found invalid line at #{line_number}"
+      debug { "Found invalid line at #{line_number}" }
       if ! last_line_is_invalid
         # We begin a new set.
         cur_set += 1
-        debug "Found set #{cur_set} at line #{line_number}"
+        debug { "Found set #{cur_set} at line #{line_number}" }
         if(cur_set > set)
           return str
         end
       end
       last_line_is_invalid = true
@@ -212,36 +227,103 @@
     if $2
       set = $2.to_i
     else
       set = 1
     end
-    debug "Trying to get set #{set} from file '#{filename}'"
+    debug { "Trying to get set #{set} from file '#{filename}'" }
     str = get_set_string(get_io_object(filename), set)
     return StringIO.new(str)
   end
 end
+undef :param_regex=
+# A proper writer for @param_regex
+def param_regex=(val)
+  if val.is_a? Regexp
+    @param_regex = val
+  elsif val =~ /([^\\]|^)\(/ # Has capturing groups
+    @param_regex = /#{val}/
+  else # Treat as separator
+    @param_regex = /(\S+)\s*#{val}\s*(\S+)/
+  end
+end
+
+# Turns an array of comments into a hash[param] -> value
+def parse_parameters(comments)
+  ret = {}
+  for line in comments
+    if line =~ @param_regex
+      ret[$1] = $2.to_f
+    end
+  end
+  return ret
+end
+
+# Turns an array of comments into a hash column name -> column
+# number (1-based)
+def parse_header_line(comments)
+  for line in comments
+    if line =~ @header_line_regex
+      colnames = line.gsub(@header_line_regex,'').split(@separator)
+      i = 1
+      ret = {}
+      for n in colnames
+        ret[n] = i
+        i += 1
+      end
+      return ret
+    end
+  end
+  return {}
+end
+
 # Reads data from a file. If needed, extract the file from the
 # columns specification.
 #
-# TODO: the cache really should include things such as time of
+# \todo the cache really should include things such as time of
 # last modification and various parameters that influence the
-# reading of the file.
+# reading of the file, and the parameters read from the file
+# using #parse_parameters
+#
+# \todo There should be a real global handling of meta-data
+# extracted from files, so that they could be included for
+# instance in the automatic labels ? (and we could have fun
+# improving this one ?)
+#
+# \warning This needs Tioga r561
 def read_file(file)
   if file =~ /(.*)@.*/
     file = $1
   end
   name = file # As file will be modified.
   if ! @cache.key?(file) # Read the file if it is not cached.
+    comments = []
     fancy_read_options = {'index_col' => true,
       'skip_first' => @skip,
-      'sep' => @separator
+      'sep' => @separator,
+      'comment_out' => comments
     }
     io_set = get_io_set(file)
-    debug "Fancy read '#{file}', options #{fancy_read_options.inspect}"
+    debug { "Fancy read '#{file}', options #{fancy_read_options.inspect}" }
     @cache[name] = Dvector.fancy_read(io_set, nil, fancy_read_options)
+    if @param_regex
+      # Now parsing params
+      @param_cache[name] = parse_parameters(comments)
+      info { "Read #{@param_cache[name].size} parameters from #{name}" }
+      debug { "Parameters read: #{@param_cache[name].inspect}" }
+    end
+    if @header_line_regex
+      @headers_cache[name] = parse_header_line(comments)
+      info { "Read #{@headers_cache[name].size} column names from #{name}" }
+      debug { "Got: #{@headers_cache[name].inspect}" }
+    end
   end
+  ## @todo These are not very satisfying; ideally, the data
+  ## information should be embedded into @cache[name] rather
+  ## than as external variables. Well...
+  @current_parameters = @param_cache[name]
+  @current_header = @headers_cache[name]
   return @cache[name]
 end
 # This is called by the architecture to get the data. It
@@ -266,20 +348,33 @@
   else
     compute_formulas = false
   end
   return Dataset.dataset_from_spec(set, col_spec) do |col|
-    get_data_column(col, compute_formulas)
+    get_data_column(col, compute_formulas,
+      @current_parameters, @current_header)
   end
 end
 # Gets the data corresponding to the given column. If
 # _compute_formulas_ is true, the column specification is
 # taken to be a formula (in the spirit of gnuplot's)
-def get_data_column(column, compute_formulas = false)
+def get_data_column(column, compute_formulas = false,
+  parameters = nil, header = nil)
   if compute_formulas
-    formula = column.gsub(/\$(\d+)/, 'column[\1]')
-    debug "Using formula #{formula} for column spec: #{column}"
+    formula = column
+    if parameters
+      for k,v in parameters
+        formula.gsub!(/\b#{k}\b/, v.to_s)
+      end
+    end
+    formula.gsub!(/\$(\d+)/, 'column[\1]')
+    if header
+      for k,v in header
+        formula.gsub!("$#{k}$", "column[#{v}]")
+      end
+    end
+    debug { "Using formula #{formula} for column spec: #{column}" }
     return Dvector.compute_formula(formula, @current_data, @included_modules)
   else
     return @current_data[column.to_i].dup