lib/ctioga2/data/backends/backends/text.rb in ctioga2-0.0 vs lib/ctioga2/data/backends/backends/text.rb in ctioga2-0.1
- old
+ new
@@ -23,11 +23,11 @@
# For separated sets
require 'stringio'
module CTioga2
- Version::register_svn_info('$Revision: 17 $', '$Date: 2009-04-28 22:22:22 +0200 (Tue, 28 Apr 2009) $')
+ Version::register_svn_info('$Revision: 191 $', '$Date: 2010-11-07 15:53:08 +0100 (Sun, 07 Nov 2010) $')
module Data
# A module for easy use of NaN in operations
@@ -47,10 +47,11 @@
UNCOMPRESSORS = { # Maps a file name extension to a command template; the template is filled with the file name through % and the result is run with IO.popen
".gz" => "gunzip -c %s",
".bz2" => "bunzip2 -c %s",
".lzma" => "unlzma -c %s",
".lz" => "unlzma -c %s",
+ ".xz" => "unxz -c %s",
}
include Dobjects
describe 'text', 'Text format', <<EOD
@@ -74,13 +75,18 @@
param_accessor :separator, 'separator', "Data columns separator",
'regexp',
"The columns separator. Defaults to /\s+/"
+ param_accessor :param_regex, 'parameters', "Parameters parsing",
+ 'regexp',
+ "Regular expression for extracting parameters from a file. Defaults to nil (ie nothing)"
- # param_accessor :select, 'select', "Select lines", {:type => :string},
- # "Skips line where the code returns false"
+ param_accessor :header_line_regex, 'header-line',
+ 'Header line regular expression',
+ 'regexp',
+ "Regular expression indicating the header line (containing column names) (default /^##/"
def initialize
@dummy = nil
@current = nil
# Current is the name of the last file used. Necessary for '' specs.
@@ -93,15 +99,24 @@
@separator = /\s+/
# We don't split data by default.
@split = false
+ @param_regex = nil
+
+ @header_line_regex = /^\#\#\s*/
+
super()
# Override Backend's cache - for now.
@cache = {} # A cache file_name -> data
+ @param_cache = {} # Same thing as cache, but for parameters
+
+ @headers_cache = {} # Same thing as cache, but for header
+ # lines.
+
end
def extend(mod)
super
@included_modules << mod
@@ -145,24 +160,24 @@
return IO.popen($1)
elsif not File.readable?(file)
# Try to find a compressed version
for ext,method in UNCOMPRESSORS
if File.readable? "#{file}#{ext}"
- info "Using compressed file #{file}#{ext} in stead of #{file}"
+ info { "Using compressed file #{file}#{ext} in stead of #{file}" }
return IO.popen(method % "#{file}#{ext}")
end
end
else
for ext, method in UNCOMPRESSORS
if file =~ /#{ext}$/
- info "Taking file #{file} as a compressed file"
+ info { "Taking file #{file} as a compressed file" }
return IO.popen(method % file)
end
end
return File::open(file)
end
- error "Could not open #{file}"
+ error { "Could not open #{file}" }
return nil
end
# A line is invalid if it is blank or starts
# neither with a digit nor +, - or .
@@ -180,15 +195,15 @@
str = ""
line_number = 0
while line = io.gets
line_number += 1
if line =~ InvalidLineRE
- debug "Found invalid line at #{line_number}"
+ debug { "Found invalid line at #{line_number}" }
if ! last_line_is_invalid
# We begin a new set.
cur_set += 1
- debug "Found set #{cur_set} at line #{line_number}"
+ debug { "Found set #{cur_set} at line #{line_number}" }
if(cur_set > set)
return str
end
end
last_line_is_invalid = true
@@ -212,36 +227,103 @@
if $2
set = $2.to_i
else
set = 1
end
- debug "Trying to get set #{set} from file '#{filename}'"
+ debug { "Trying to get set #{set} from file '#{filename}'" }
str = get_set_string(get_io_object(filename), set)
return StringIO.new(str)
end
end
+ undef :param_regex= # Remove the existing writer (presumably the one generated by param_accessor -- confirm) before defining the custom one below
+ # Custom writer for @param_regex: accepts a Regexp, a pattern string bringing its own groups, or a plain name/value separator string.
+ def param_regex=(val)
+ if val.is_a? Regexp # Already a Regexp: stored unchanged
+ @param_regex = val
+ elsif val =~ /([^\\]|^)\(/ # A "(" not directly preceded by a backslash: val is taken to define its own capturing groups
+ @param_regex = /#{val}/ # val is interpolated as regexp source, not matched literally
+ else # Otherwise val is treated as the separator between a parameter name and its value
+ @param_regex = /(\S+)\s*#{val}\s*(\S+)/ # Group 1 captures the name, group 2 the value; val is interpolated unescaped
+ end
+ end
+
+ # Turns an array of comment lines into a hash parameter name -> value (a Float), using the first two capture groups of @param_regex.
+ def parse_parameters(comments)
+ ret = {}
+ for line in comments
+ if line =~ @param_regex # Lines that do not match are ignored
+ ret[$1] = $2.to_f # A name found several times keeps the value of its last occurrence
+ end
+ end
+ return ret
+ end
+
+ # Turns an array of comment lines into a hash column name -> column number
+ # (1-based), read from the first line matching @header_line_regex; returns an empty hash if no line matches.
+ def parse_header_line(comments)
+ for line in comments
+ if line =~ @header_line_regex # Only the first matching line is used (see the return below)
+ colnames = line.gsub(@header_line_regex,'').split(@separator) # Remove the header marker, then split with the data column separator
+ i = 1 # Column numbers start at 1
+ ret = {}
+ for n in colnames
+ ret[n] = i
+ i += 1
+ end
+ return ret # Stop at the first header line found
+ end
+ end
+ return {} # No header line among the comments
+ end
+
# Reads data from a file. If needed, extract the file from the
# columns specification.
#
- # TODO: the cache really should include things such as time of
+ # \todo the cache really should include things such as time of
# last modification and various parameters that influence the
- # reading of the file.
+ # reading of the file, and the parameters read from the file
+ # using #parse_parameters
+ #
+ # \todo There should be a real global handling of meta-data
+ # extracted from files, so that they could be included for
+ # instance in the automatic labels ? (and we could have fun
+ # improving this one ?)
+ #
+ # \warning This needs Tioga r561
def read_file(file)
if file =~ /(.*)@.*/ # Keep only what precedes the last '@' of the specification
file = $1
end
name = file # As file will be modified.
if ! @cache.key?(file) # Read the file, and parse its parameters and header line, only if it is not cached yet.
+ comments = [] # Handed to fancy_read as 'comment_out'; presumably filled with the comment lines of the file -- confirm
fancy_read_options = {'index_col' => true,
'skip_first' => @skip,
- 'sep' => @separator
+ 'sep' => @separator,
+ 'comment_out' => comments
}
io_set = get_io_set(file)
- debug "Fancy read '#{file}', options #{fancy_read_options.inspect}"
+ debug { "Fancy read '#{file}', options #{fancy_read_options.inspect}" }
@cache[name] = Dvector.fancy_read(io_set, nil, fancy_read_options)
+ if @param_regex # Parameters are parsed only when a parameter regexp is set
+ # Now parsing params
+ @param_cache[name] = parse_parameters(comments)
+ info { "Read #{@param_cache[name].size} parameters from #{name}" }
+ debug { "Parameters read: #{@param_cache[name].inspect}" }
+ end
+ if @header_line_regex # Column names are looked up only when a header line regexp is set
+ @headers_cache[name] = parse_header_line(comments)
+ info { "Read #{@headers_cache[name].size} column names from #{name}" }
+ debug { "Got: #{@headers_cache[name].inspect}" }
+ end
end
+ ## @todo These are not very satisfying; ideally, the data
+ ## information should be embedded into @cache[name] rather
+ ## than as external variables. Well...
+ @current_parameters = @param_cache[name] # Whatever was stored for this file when it was first read; not recomputed on a cache hit
+ @current_header = @headers_cache[name] # Same as above, for the column names
return @cache[name]
end
# This is called by the architecture to get the data. It
@@ -266,20 +348,33 @@
else
compute_formulas = false
end
return Dataset.dataset_from_spec(set, col_spec) do |col|
- get_data_column(col, compute_formulas)
+ get_data_column(col, compute_formulas,
+ @current_parameters, @current_header)
end
end
# Gets the data corresponding to the given column. If
# _compute_formulas_ is true, the column specification is
# taken to be a formula (in the spirit of gnuplot's)
- def get_data_column(column, compute_formulas = false)
+ def get_data_column(column, compute_formulas = false,
+ parameters = nil, header = nil)
if compute_formulas
- formula = column.gsub(/\$(\d+)/, 'column[\1]')
- debug "Using formula #{formula} for column spec: #{column}"
+ formula = column
+ if parameters
+ for k,v in parameters
+ formula.gsub!(/\b#{k}\b/, v.to_s)
+ end
+ end
+ formula.gsub!(/\$(\d+)/, 'column[\1]')
+ if header
+ for k,v in header
+ formula.gsub!("$#{k}$", "column[#{v}]")
+ end
+ end
+ debug { "Using formula #{formula} for column spec: #{column}" }
return Dvector.compute_formula(formula,
@current_data,
@included_modules)
else
return @current_data[column.to_i].dup