dvector.c in tioga-1.7

- old
+ new

@@ -308,10 +308,27 @@
    /* we set dirty to 0 */
    d->dirty = 0;
    return ary;
 }
 
+/* Makes a Dvector of class klass holding a copy of the len doubles at data. */
+PRIVATE VALUE make_dvector_from_data(VALUE klass, long len, double * data) {
+  VALUE ary = dvector_alloc(klass);
+  Dvector *d = Get_Dvector(ary);
+  if (len < 0) {
+    rb_raise(rb_eArgError, "negative dvector size (or size too big)");
+  }
+  d->len = len;
+  /* no spare capacity, but an empty vector still gets one allocated slot */
+  d->capa = (len == 0) ? 1 : len;
+  d->ptr = ALLOC_N(double, d->capa);
+  /* copy exactly the len values supplied; never read data when len is 0 */
+  if (len > 0) MEMCPY(d->ptr, data, double, len);
+  d->dirty = 0;
+  return ary;
+}
+
 PRIVATE VALUE dvector_new2(long len, long capa) {
    return make_new_dvector(cDvector, len, capa);
 }
 
 PRIVATE VALUE dvector_new() {
@@ -1428,11 +1445,11 @@
       StringValue(sep);
       len += RSTRING_LEN(sep) * (d->len - 1); /* So it works for ruby 1.9 */
    }
    result = rb_str_buf_new(len);
    for (i=0; i < d->len; i++) {
-      sprintf(buff, "%g", d->ptr[i]);
+      snprintf(buff,sizeof(buff), "%g", d->ptr[i]);
       tmp = rb_str_new2(buff);
       if (i > 0 && !NIL_P(sep)) rb_str_buf_append(result, sep);
       rb_str_buf_append(result, tmp);
    }
    if (taint) OBJ_TAINT(result);
@@ -2095,23 +2112,16 @@
  *     a                               -> Dvector[ -1, -2, -3 ]
  */ 
 VALUE dvector_replace(VALUE dest, VALUE orig) {
    VALUE shared;
    Dvector *org, *d;
-   dvector_modify(dest);
+   dvector_modify(dest); // take care of any sharing issues.
    orig = dvector_to_dvector(orig); /* it might be some kind of Array rather than a Dvector */
    if (dest == orig) return dest;
    org = Get_Dvector(orig);
    d = Get_Dvector(dest);
-   if (d->ptr) {
-      if (0 && d->capa >= org->len && d->shared == Qnil) {
-         d->len = org->len;
-         MEMCPY(d->ptr, org->ptr, double, d->len);
-         return dest;
-      }
-      free(d->ptr);
-   }
+   if (d->ptr) free(d->ptr); // we know it isn't shared because we did dvector_modify above
    shared = dvector_make_shared(orig);
    org = Get_Dvector(shared);
    d->ptr = org->ptr;
    d->len = org->len;
    d->shared = shared;
@@ -4319,20 +4329,63 @@
  */ 
 VALUE dvector_div_bang(VALUE ary, VALUE arg) {
    return dvector_apply_math_op2_bang(ary, arg, do_div);
 }
 
+/* Reads the next line of file into the heap buffer *buff_ptr, which holds
+ * *len_ptr bytes.  When the line does not fit, the buffer is enlarged with
+ * realloc, the file is moved back to the start of the line and the read is
+ * retried (at most max_tries attempts).  *buff_ptr and *len_ptr are updated
+ * whenever the buffer grows; the caller keeps ownership and must free it.
+ * Returns the buffer, or NULL at end of file, when ftell, fseek or realloc
+ * fails, or when the line still does not fit after the last attempt. */
+static char *fill_read_buffer(char **buff_ptr, int *len_ptr, FILE *file) {
+   char *buff = *buff_ptr, *new_buff;
+   int len, new_len, i, max_tries = 10;
+   long filepos = ftell(file); /* start of the line, needed to re-read it */
+   if (filepos == -1) {
+      printf("ftell failed\n");
+      return NULL;
+   }
+   for (i = 0; i < max_tries; i++) {
+      len = *len_ptr;
+      /* sentinel: fgets stores '\0' in the last byte only when the data it
+       * read fills the whole buffer */
+      buff[len-1] = '1';
+      if (fgets(buff, len, file) == NULL) return NULL; /* end of file or read error */
+      /* sentinel untouched: the whole line (or the tail of the file) fit */
+      if (buff[len-1] != '\0') return buff;
+      /* ran out of room -- make buffer larger and try again */
+      /* NOTE(review): 10*len + 100 overflows int once len gets near INT_MAX/10;
+       * realloc presumably fails before that -- confirm */
+      new_len = 10*len + 100;
+      new_buff = (char *)realloc(buff, new_len);
+      /* on failure the old block is still valid and still len bytes long, so
+       * *buff_ptr and *len_ptr are left describing it for the caller's free */
+      if (new_buff == NULL) break;
+      buff = new_buff;
+      *buff_ptr = buff;
+      /* publish the new size only now that the buffer really has it */
+      *len_ptr = new_len;
+      if (fseek(file, filepos, SEEK_SET) != 0) {
+         printf("fseek failed\n");
+         return NULL;
+      }
+   }
+   return NULL;
+}
+
 PRIVATE
 /*======================================================================*/ 
 VALUE Read_Dvectors(char *filename, VALUE destinations, int first_row_of_file, int number_of_rows) {
    FILE *file = NULL;
    VALUE col_obj, cols_obj, *cols_ptr = NULL;
    Dvector *d;
    double v;
    int last_row_of_file;
-   const int buff_len = 10000;
-   char buff[buff_len], *num_str, *pend, c, *cptr;
+   int buff_len = 100;
+   char *buff, *num_str, *pend, c, *cptr;
    int num_cols = 0, i, row, col, buff_loc, skip = first_row_of_file - 1;
    last_row_of_file = (number_of_rows == -1)? -1 : first_row_of_file + number_of_rows - 1;
    if ((last_row_of_file != -1 && last_row_of_file < first_row_of_file) || filename == NULL) return false;
    if (destinations != Qnil) {
       cols_obj = rb_Array(destinations);
@@ -4353,19 +4406,25 @@
       }
    }
    if ((file=fopen(filename,"r")) == NULL) {
       rb_raise(rb_eArgError, "ERROR: read cannot open %s", filename);
    }
+   buff = (char *)malloc(buff_len);
+   if (buff == NULL) {
+      fclose(file);
+      rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed");
+   }
    for (i = 0; i < skip; i++) { /* skip over initial lines */
-      if (fgets(buff, buff_len, file)==NULL) {
+      if (fill_read_buffer(&buff, &buff_len, file)==NULL) {
          fclose(file);
+         free(buff);
          rb_raise(rb_eArgError, "ERROR: read reached end of file before reaching line %i in %s",
             first_row_of_file, filename);
       }
    }
    for (row = 0, i = first_row_of_file; last_row_of_file == -1 || i <= last_row_of_file; row++, i++) {
-      if (fgets(buff, buff_len, file)==NULL) break; /* have reached end of file */
+      if (fill_read_buffer(&buff, &buff_len, file)==NULL) break; /* have reached end of file */
       if (destinations == Qnil) { /* create destinations */
          buff_loc = 0;
          while (true) {
             while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */
             if (buff[buff_loc] == '\0') break;
@@ -4384,16 +4443,18 @@
       buff_loc = 0;
       for (col = 0; col < num_cols; col++) {
          while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */
          if (buff[buff_loc] == '\0') {
             fclose(file);
+            free(buff);
             rb_raise(rb_eArgError, "read reached end of line looking for column %i in line %i of %s", col+1, i, filename);
          }
          num_str = buff+buff_loc;
          while (isgraph(buff[buff_loc])) buff_loc++; /* include non-blanks */
          if (buff[buff_loc] == '\0') {
             fclose(file);
+            free(buff);
             rb_raise(rb_eArgError, "ERROR: read reached end of line looking for column %i in line %i of %s", col+1, i, filename);
          }
          col_obj = cols_ptr[col];
          if (col_obj == Qnil) continue;
          Data_Get_Struct(col_obj, Dvector, d);
@@ -4408,18 +4469,20 @@
                 cptr = pend+5; c = *cptr; pend[5] = ' '; pend[4] = pend[3]; pend[3] = pend[2]; 
                 pend[2] = pend[1]; pend[1] = pend[0]; pend[0] = 'E';
                 v = strtod(num_str,&pend); *cptr = c; buff_loc = pend - buff;
             } else {
                 fclose(file);
+                free(buff);
                 pend[0] = 0;
                 rb_raise(rb_eArgError, "ERROR: unreadable value in file %s in line %i: %s", filename, i , buff+buff_loc);
             }
          }
          }
          
          if (!is_okay_number(v)) {
             fclose(file);
+            free(buff);
             rb_raise(rb_eArgError, "ERROR: bad value %g in line %i of %s -- %s", v, i, filename, num_str);
          }
          if (row >= d->capa) 
             Dvector_Store_Double(col_obj, row, v);
          else {
@@ -4428,10 +4491,11 @@
             d->ptr[row] = v;
          }
       }
    }
    fclose(file);
+   free(buff);
    return destinations;
 }
 
 PRIVATE 
 /*
@@ -4465,12 +4529,12 @@
 VALUE Read_Rows_of_Dvectors(char *filename, VALUE destinations, int first_row_of_file) {
    FILE *file = NULL;
    VALUE row_obj, rows_obj, *rows_ptr = NULL;
    Dvector *d;
    double v, *row_data;
-   const int buff_len = 10000;
-   char buff[buff_len], *num_str, *pend, c, *cptr;
+   int buff_len = 1000;
+   char *buff, *num_str, *pend, c, *cptr;
    int num_rows = 0, i, row, col, buff_loc, c_loc, skip = first_row_of_file - 1;
    rows_obj = rb_Array(destinations);
    num_rows = RARRAY(rows_obj)->len;
    rows_ptr = RARRAY(rows_obj)->ptr;
    for (i = 0; i < num_rows; i++) { /* first pass to check validity */
@@ -4487,20 +4551,27 @@
       d->len = 0;
    }
    if ((file=fopen(filename,"r")) == NULL) {
       rb_raise(rb_eArgError, "ERROR: read_rows cannot open %s", filename);
    }
+   buff = (char *)malloc(buff_len);
+   if (buff == NULL) {
+      fclose(file);
+      rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed");
+   }
    for (i = 0; i < skip; i++) { /* skip over initial lines */
-      if (fgets(buff, buff_len, file)==NULL) {
+      if (fill_read_buffer(&buff, &buff_len, file)==NULL) {
          fclose(file);
+         free(buff);
          rb_raise(rb_eArgError, "ERROR: read_rows reached end of file before reaching line %i in %s",
             first_row_of_file, filename);
       }
    }
    for (row = 0, i = first_row_of_file; row < num_rows; row++, i++) {
-      if (fgets(buff, buff_len, file)==NULL) {
+      if (fill_read_buffer(&buff, &buff_len, file)==NULL) {
          fclose(file);
+         free(buff);
          rb_raise(rb_eArgError, "ERROR: read_rows reached end of file at line %i in %s", i, filename);
       }
       row_obj = rows_ptr[row];
       if (row_obj == Qnil) continue;
       d = Get_Dvector(row_obj);
@@ -4527,10 +4598,11 @@
             }
          }
 
          if (!is_okay_number(v)) {
             fclose(file);
+            free(buff);
             rb_raise(rb_eArgError, "ERROR: bad value %g in line i% of file %s", v, i, filename);
          }
          if (col < d->capa) { row_data[col] = v; d->len = col+1; }
          else {
             Dvector_Store_Double(row_obj, col, v);
@@ -4542,10 +4614,11 @@
          REALLOC_N(d->ptr, double, col);
          d->capa = col;
       }
    }
    fclose(file);
+   free(buff);
    return destinations;
 }
 
 
 PRIVATE
@@ -4567,34 +4640,42 @@
       rb_raise(rb_eArgError, "wrong # of arguments(%d) for read_rows", argc);
    if (argc > 2) arg3 = NUM2INT(argv[2]);
    return Read_Rows_of_Dvectors(StringValueCStr(argv[0]),argv[1],arg3);
    klass = Qnil;
 }
+
 PRIVATE 
 VALUE Read_Row(char *filename, int row, VALUE row_ary) {
    FILE *file = NULL;
-   const int buff_len = 10000;
-   char buff[buff_len], *num_str, *pend, c, *cptr;
+   int buff_len = 1000;
+   char *buff, *num_str, *pend, c, *cptr;
    int i, col, buff_loc;
    double v;
    if (row <= 0) {
       rb_raise(rb_eArgError, "ERROR: read_row line must be positive (not %i) for file %s", row, filename);
    }
    if (filename == NULL || (file=fopen(filename,"r")) == NULL) {
       rb_raise(rb_eArgError, "ERROR: read_row cannot open %s", filename);
    }
+   buff = (char *)malloc(buff_len);
+   if (buff == NULL) {
+      fclose(file);
+      rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed");
+   }
    for (i = 0; i < row; i++) { /* read lines until reach desired row */
-      if (fgets(buff, buff_len, file)==NULL) {
+      if (fill_read_buffer(&buff, &buff_len, file)==NULL) {
          fclose(file);
+         free(buff);
          rb_raise(rb_eArgError, "ERROR: read_row reached end of file before reaching line %i in %s",
             row, filename);
       }
    }
    if (row_ary == Qnil) row_ary = dvector_new();
    else if (is_a_dvector(row_ary)) dvector_clear(row_ary);
    else {
       fclose(file);
+      free(buff);
       rb_raise(rb_eArgError, "ERROR: destination for read_row must be a Dvector");
    }
    buff_loc = 0;
    for (col = 0; ; col++) {
       while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */
@@ -4612,23 +4693,26 @@
                 cptr = pend+5; c = *cptr; pend[5] = ' '; pend[4] = pend[3]; pend[3] = pend[2]; 
                 pend[2] = pend[1]; pend[1] = pend[0]; pend[0] = 'E';
                 v = strtod(num_str,&pend); *cptr = c; buff_loc = pend - buff;
             } else {
                 fclose(file);
+                free(buff);
                 pend[0] = 0;
                 rb_raise(rb_eArgError, "ERROR: unreadable value in file %s in line %i: %s", filename, i , buff+buff_loc);
             }
          }
       }
 
       if (!is_okay_number(v)) {
          fclose(file);
+         free(buff);
          rb_raise(rb_eArgError, "ERROR: bad value %g in line %i of file %s", v, i, filename);
       }
       Dvector_Store_Double(row_ary, col, v);
    }
    fclose(file);
+   free(buff);
    return row_ary;
 }
 
 PRIVATE
 /*
@@ -5159,10 +5243,162 @@
 	}
     }
   return retval;
 }
 
+
+/*
+  :call-seq:
+  Dvector.fast_fancy_read(stream, options) => Array_of_Dvectors
+
+  Reads data from an IO stream and separates it into columns of data
+  according to the _options_, a hash holding the following elements
+  (compulsory, but you can use FANCY_READ_DEFAULTS):
+  * 'sep': a regular expression that separates the entries
+  * 'comments': any line matching this will be skipped
+  * 'skip_first': skips that many lines before reading anything
+  * 'index_col': if true, the first column returned should contain the
+    number of the line read. *This option is read but currently ignored !*
+  * 'remove_space': whether to remove spaces at the beginning of a line
+    before the line is matched against 'comments' and split with 'sep'
+  * 'default':  what to put when nothing was found but a number must be used
+
+  Blank lines are skipped. Rows with fewer entries than the widest row seen
+  so far are padded with 'default', and a column that first shows up on a
+  later row is back-filled with 'default' for the earlier rows.
+
+  The values are accumulated in plain C arrays that are doubled in size
+  whenever they fill up, and are copied into Dvectors only once the whole
+  stream has been read; this is intended to avoid the repeated reallocation
+  and copying that growing Dvectors directly would cost on big files.
+*/
+static VALUE dvector_fast_fancy_read(VALUE self, VALUE stream, VALUE options)
+{
+  /* First, we read the options (every key is looked up unconditionally): */
+  double def = rb_num2dbl(rb_hash_aref(options,
+                                       rb_str_new2("default")));
+  int remove_space = RTEST(rb_hash_aref(options,
+                                        rb_str_new2("remove_space")));
+  int index_col = RTEST(rb_hash_aref(options, /* read, but not used below */
+                                     rb_str_new2("index_col")));
+  long skip_first = FIX2LONG(rb_hash_aref(options, /* NOTE(review): no type check */
+                                          rb_str_new2("skip_first")));
+  VALUE sep = rb_hash_aref(options, rb_str_new2("sep"));
+  VALUE comments = rb_hash_aref(options, rb_str_new2("comments"));
+
+  /* Then, some various variables: */
+  VALUE line;
+
+  ID chomp_id = rb_intern("chomp!");
+  ID gets_id = rb_intern("gets");
+  long line_number = 0;         /* 1-based number of the line just read */
+
+  /*
+     Now comes the fun part - rudimentary vectors management
+   */
+  int nb_vectors = 0;           /* The number of vectors currently created */
+  int current_size = 10;        /* The number of slots available */
+  double ** vectors = ALLOC_N(double *, current_size);
+  long index = 0;               /* The current index in the vectors */
+  int allocated_size = 5004;    /* The size available in the vectors */
+  /* NOTE(review): vectors and its columns are released only by the free()
+     calls at the end; if a Ruby call below raises they leak -- confirm */
+  int i;
+
+  /* The return value */
+  VALUE ary;
+
+  /* We use a real gets so we can also rely on StringIO, for instance */
+  while(RTEST(line = rb_funcall(stream, gets_id, 0))) {
+    VALUE pre, post, match;
+    const char * line_ptr;
+    int col = 0;
+    line_number++;
+    /* Whether we should skip the line... */
+    if(skip_first >= line_number)
+      continue;
+
+    /* We check for a blank line using isspace: */
+    line_ptr = StringValueCStr(line);
+    while(line_ptr && *line_ptr) {
+      if(! isspace((unsigned char) *line_ptr)) /* plain char may be negative */
+        break;
+      line_ptr++;
+    }
+    if(! *line_ptr)
+      continue;                 /* We found a blank line  */
+    if(remove_space)            /* We replace the contents of the line  */
+      line = rb_str_new2(line_ptr);
+
+    /* ... or a comment line */
+    if(RTEST(comments) && RTEST(rb_reg_match(comments, line)))
+      continue;
+
+    /* Then, we remove the newline: */
+    post = line;
+    rb_funcall(post, chomp_id, 0);
+
+    /* We iterate over the different portions between matches.
+       NOTE(review): if sep can match an empty string at the start of
+       post, post would never get shorter -- confirm sep cannot do that */
+    while(RTEST(post)) {
+      const char * a;
+      char * b;
+      if(RTEST(rb_reg_match(sep, post))) {
+        match = rb_gv_get("$~");
+        pre = rb_reg_match_pre(match);
+        post = rb_reg_match_post(match);
+      }
+      else {
+        pre = post;             /* last entry of the line */
+        post = Qnil;
+      }
+      a = StringValueCStr(pre);
+      double c = strtod(a, &b);
+      if(b == a)                /* nothing could be parsed as a number */
+        c = def;
+      if(col >= nb_vectors) {
+        nb_vectors++;
+        /* We need to create a new vector */
+        if(col >= current_size) { /* Increase the available size */
+          current_size += 5;
+          REALLOC_N(vectors, double * , current_size);
+        }
+
+        double * vals = vectors[col] = ALLOC_N(double, allocated_size);
+        /* Filling the rows already read with the default value */
+        for(i = 0; i < index; i++) {
+          vals[i] = def;
+        }
+      }
+      vectors[col][index] = c;
+      col++;
+    }
+    /* Now, we finish the line: missing trailing entries get the default */
+    for(; col < nb_vectors; col++)
+      vectors[col][index] = def;
+    index++;
+    /* Now, we reallocate memory if necessary */
+    if(index >= allocated_size) {
+      allocated_size *= 2;      /* We double the size */
+      for(col = 0; col < nb_vectors; col++)
+        REALLOC_N(vectors[col], double, allocated_size);
+    }
+  }
+  /* Now, we make up the array */
+  ary = rb_ary_new();
+  for(i = 0; i < nb_vectors; i++) {
+    /* We create a vector holding the index values read for this column */
+    rb_ary_store(ary, i, make_dvector_from_data(cDvector, index, vectors[i]));
+    /* And free the memory */
+    free(vectors[i]);
+  }
+  free(vectors);
+  return ary;
+}
+
+
 /* 
  * Document-class: Dobjects::Dvector
  *
  * Dvectors are a specialized implementation of one-dimensional arrays of double precision floating point numbers. 
  * They are intended for use in applications needing efficient processing of large vectors of numeric data.
@@ -5234,11 +5470,12 @@
    rb_define_singleton_method(cDvector, "max_of_many", dvector_max_of_many, 1);
 
    rb_define_singleton_method(cDvector, "is_a_dvector", dvector_is_a_dvector, 1);
 
    
-   rb_define_method(cDvector, "make_bezier_control_points_for_cubic_in_x", dvector_make_bezier_control_points_for_cubic_in_x, 6);
+   rb_define_method(cDvector, "make_bezier_control_points_for_cubic_in_x", 
+      dvector_make_bezier_control_points_for_cubic_in_x, 6);
    
    rb_define_method(cDvector, "initialize", dvector_initialize, -1);
    rb_define_method(cDvector, "initialize_copy", dvector_replace, 1);
    
    rb_define_method(cDvector, "tridag", dvector_tridag, 4);
@@ -5472,9 +5709,14 @@
    rb_define_method(cDvector, "_dump", dvector_dump, 1);
    rb_define_singleton_method(cDvector, "_load", dvector_load, 1);
 
    /* simple convolution */
    rb_define_method(cDvector, "convolve", dvector_convolve, 2);
+
+   /* Fast fancy read: */
+   rb_define_singleton_method(cDvector, "fast_fancy_read", 
+			      dvector_fast_fancy_read, 2);
+
 
    dvector_output_fs = Qnil;
    rb_global_variable(&dvector_output_fs);
    dvector_output_fs = rb_str_new2(" ");