text.rb in ctioga2-0.1

- old
+ new

@@ -23,11 +23,11 @@
 # For separated sets
 require 'stringio'
 
 module CTioga2
 
-  Version::register_svn_info('$Revision: 17 $', '$Date: 2009-04-28 22:22:22 +0200 (Tue, 28 Apr 2009) $')
+  Version::register_svn_info('$Revision: 191 $', '$Date: 2010-11-07 15:53:08 +0100 (Sun, 07 Nov 2010) $')
 
 
   module Data
 
     # A module for easy use of NaN in operations
@@ -47,10 +47,11 @@
         UNCOMPRESSORS = {
           ".gz" => "gunzip -c %s",
           ".bz2" => "bunzip2 -c %s",
           ".lzma" => "unlzma -c %s",
           ".lz" => "unlzma -c %s",
+          ".xz" => "unxz -c %s",
         }
 
         include Dobjects
 
         describe 'text', 'Text format', <<EOD
@@ -74,13 +75,18 @@
 
         param_accessor :separator, 'separator', "Data columns separator", 
         'regexp', 
         "The columns separator. Defaults to /\s+/"
 
+        param_accessor :param_regex, 'parameters', "Parameters parsing", 
+        'regexp', 
+        "Regular expression for extracting parameters from a file. Defaults to nil (ie nothing)"
 
-        #     param_accessor :select, 'select', "Select lines", {:type => :string},
-        #     "Skips line where the code returns false"
+        param_accessor :header_line_regex, 'header-line', 
+        'Header line regular expression', 
+        'regexp', 
+        "Regular expression indicating the header line (containing column names) (default /^##/"
         
         def initialize
           @dummy = nil
           @current = nil   
           # Current is the name of the last file used. Necessary for '' specs.
@@ -93,15 +99,24 @@
           @separator = /\s+/
 
           # We don't split data by default.
           @split = false
 
+          @param_regex = nil
+
+          @header_line_regex = /^\#\#\s*/
+
           super()
 
           # Override Backend's cache - for now.
           @cache = {}               # A cache file_name -> data
 
+          @param_cache = {}     # Same thing as cache, but for parameters
+
+          @headers_cache = {}   # Same thing as cache, but for header
+                                # lines.
+
         end
 
         def extend(mod)
           super
           @included_modules << mod
@@ -145,24 +160,24 @@
             return IO.popen($1)
           elsif not File.readable?(file)
             # Try to find a compressed version
             for ext,method in UNCOMPRESSORS
               if File.readable? "#{file}#{ext}"
-                info "Using compressed file #{file}#{ext} in stead of #{file}"
+                info { "Using compressed file #{file}#{ext} in stead of #{file}" }
                 return IO.popen(method % "#{file}#{ext}")
               end
             end
           else 
             for ext, method in UNCOMPRESSORS
               if file =~ /#{ext}$/ 
-                info "Taking file #{file} as a compressed file"
+                info { "Taking file #{file} as a compressed file" }
                 return IO.popen(method % file)
               end
             end
             return File::open(file)
           end
-          error "Could not open #{file}"
+          error { "Could not open #{file}" }
           return nil
         end
 
         # A line is invalid if it is blank or starts
         # neither with a digit nor +, - or .
@@ -180,15 +195,15 @@
           str = ""
           line_number = 0
           while line = io.gets
             line_number += 1
             if line =~ InvalidLineRE
-              debug "Found invalid line at #{line_number}"
+              debug { "Found invalid line at #{line_number}" }
               if ! last_line_is_invalid
                 # We begin a new set.
                 cur_set += 1
-                debug "Found set #{cur_set} at line #{line_number}"
+                debug { "Found set #{cur_set} at line #{line_number}" }
                 if(cur_set > set)
                   return str
                 end
               end
               last_line_is_invalid = true
@@ -212,36 +227,103 @@
             if $2
               set = $2.to_i
             else
               set = 1
             end
-            debug "Trying to get set #{set} from file '#{filename}'"
+            debug { "Trying to get set #{set} from file '#{filename}'" }
             str = get_set_string(get_io_object(filename), set)
             return StringIO.new(str)
           end
         end
 
+        undef :param_regex=
+        # A proper writer for @param_regex
+        def param_regex=(val)
+          if val.is_a? Regexp
+            @param_regex = val
+          elsif val =~ /([^\\]|^)\(/     # Has capturing groups
+            @param_regex = /#{val}/
+          else                  # Treat as separator
+            @param_regex = /(\S+)\s*#{val}\s*(\S+)/
+          end
+        end
+
+        # Turns an array of comments into a hash[param] -> value
+        def parse_parameters(comments)
+          ret = {}
+          for line in comments
+            if line =~ @param_regex
+              ret[$1] = $2.to_f
+            end
+          end
+          return ret
+        end
+
+        # Turns an array of comments into a hash column name -> column
+        # number (1-based)
+        def parse_header_line(comments)
+          for line in comments
+            if line =~ @header_line_regex
+              colnames = line.gsub(@header_line_regex,'').split(@separator)
+              i = 1
+              ret = {}
+              for n in colnames
+                ret[n] = i
+                i += 1
+              end
+              return ret
+            end
+          end
+          return {}
+        end
+
         # Reads data from a file. If needed, extract the file from the
         # columns specification.
+        #
+        # Returns the (cached) result of Dvector.fancy_read for the
+        # file. As a side effect, @current_parameters and
+        # @current_header are set to the entries of the parameter and
+        # header caches for this file. These caches are only filled
+        # when the file is actually read, so an entry is +nil+ if the
+        # corresponding regular expression was not set at that time.
         #
-        # TODO: the cache really should include things such as time of
+        # \todo the cache really should include things such as time of
         # last modification and various parameters that influence the
-        # reading of the file.
+        # reading of the file, and the parameters read from the file
+        # using #parse_parameters
+        #
+        # \todo There should be a real global handling of meta-data
+        # extracted from files, so that they could be included for
+        # instance in the automatic labels ? (and we could have fun
+        # improving this one ?)
+        #
+        # \warning This needs Tioga r561
         def read_file(file)
+          # Only keep what comes before the last '@' of the name.
           if file =~ /(.*)@.*/
             file = $1
           end
           name = file               # As file will be modified.
           if ! @cache.key?(file)    # Read the file if it is not cached.
+            # Handed to fancy_read as 'comment_out' and parsed below;
+            # presumably filled with the comment lines of the file --
+            # to be confirmed against the Tioga documentation.
+            comments = []
             fancy_read_options = {'index_col' => true,
               'skip_first' => @skip,
-              'sep' => @separator
+              'sep' => @separator,
+              'comment_out' => comments
             }
             io_set = get_io_set(file)
-            debug "Fancy read '#{file}', options #{fancy_read_options.inspect}"
+            debug { "Fancy read '#{file}', options #{fancy_read_options.inspect}" }
             @cache[name] = Dvector.fancy_read(io_set, nil, fancy_read_options)
+            if @param_regex
+              # Now parsing params
+              @param_cache[name] = parse_parameters(comments)
+              info { "Read #{@param_cache[name].size} parameters from #{name}" }
+              debug { "Parameters read: #{@param_cache[name].inspect}" }
+            end
+            if @header_line_regex
+              # Column names, from the first matching comment line
+              @headers_cache[name] = parse_header_line(comments)
+              info { "Read #{@headers_cache[name].size} column names from #{name}" }
+              debug { "Got: #{@headers_cache[name].inspect}" }
+            end
           end
+          ## @todo These are not very satisfying; ideally, the data
+          ## information should be embedded into @cache[name] rather
+          ## than as external variables. Well...
+          @current_parameters = @param_cache[name]
+          @current_header = @headers_cache[name]
           return @cache[name]
         end
 
 
         # This is called by the architecture to get the data. It
@@ -266,20 +348,33 @@
           else
             compute_formulas = false
           end
           
           return Dataset.dataset_from_spec(set, col_spec) do |col|
-            get_data_column(col, compute_formulas)
+            get_data_column(col, compute_formulas, 
+                            @current_parameters, @current_header)
           end
         end
 
         # Gets the data corresponding to the given column. If
         # _compute_formulas_ is true, the column specification is
         # taken to be a formula (in the spirit of gnuplot's)
-        def get_data_column(column, compute_formulas = false)
+        def get_data_column(column, compute_formulas = false, 
+                            parameters = nil, header = nil)
           if compute_formulas
-            formula = column.gsub(/\$(\d+)/, 'column[\1]')
-            debug "Using formula #{formula} for column spec: #{column}"
+            formula = column
+            if parameters
+              for k,v in parameters
+                formula.gsub!(/\b#{k}\b/, v.to_s)
+              end
+            end
+            formula.gsub!(/\$(\d+)/, 'column[\1]')
+            if header
+              for k,v in header
+                formula.gsub!("$#{k}$", "column[#{v}]")
+              end
+            end
+            debug { "Using formula #{formula} for column spec: #{column}" }
             return Dvector.compute_formula(formula, 
                                            @current_data,
                                            @included_modules)
           else
             return @current_data[column.to_i].dup