split/Dvector/dvector.c in tioga-1.6 vs split/Dvector/dvector.c in tioga-1.7

- old
+ new

@@ -308,10 +308,27 @@ /* we set dirty to 0 */ d->dirty = 0; return ary; } +/* Makes a Dvector with the given data. No additional capacity. */ +PRIVATE VALUE make_dvector_from_data(VALUE klass, long len, double * data) { + VALUE ary = dvector_alloc(klass); + Dvector *d = Get_Dvector(ary); + if (len < 0) { + rb_raise(rb_eArgError, "negative dvector size (or size too big)"); + } + d->len = len; + if (len == 0) len++; + d->ptr = ALLOC_N(double, len); + MEMCPY(d->ptr, data, double, len); + d->capa = len; + /* we set dirty to 0 */ + d->dirty = 0; + return ary; +} + PRIVATE VALUE dvector_new2(long len, long capa) { return make_new_dvector(cDvector, len, capa); } PRIVATE VALUE dvector_new() { @@ -1428,11 +1445,11 @@ StringValue(sep); len += RSTRING_LEN(sep) * (d->len - 1); /* So it works for ruby 1.9 */ } result = rb_str_buf_new(len); for (i=0; i < d->len; i++) { - sprintf(buff, "%g", d->ptr[i]); + snprintf(buff,sizeof(buff), "%g", d->ptr[i]); tmp = rb_str_new2(buff); if (i > 0 && !NIL_P(sep)) rb_str_buf_append(result, sep); rb_str_buf_append(result, tmp); } if (taint) OBJ_TAINT(result); @@ -2095,23 +2112,16 @@ * a -> Dvector[ -1, -2, -3 ] */ VALUE dvector_replace(VALUE dest, VALUE orig) { VALUE shared; Dvector *org, *d; - dvector_modify(dest); + dvector_modify(dest); // take care of any sharing issues. orig = dvector_to_dvector(orig); /* it might be some kind of Array rather than a Dvector */ if (dest == orig) return dest; org = Get_Dvector(orig); d = Get_Dvector(dest); - if (d->ptr) { - if (0 && d->capa >= org->len && d->shared == Qnil) { - d->len = org->len; - MEMCPY(d->ptr, org->ptr, double, d->len); - return dest; - } - free(d->ptr); - } + if (d->ptr) free(d->ptr); // we know it isn't shared because we did dvector_modify above shared = dvector_make_shared(orig); org = Get_Dvector(shared); d->ptr = org->ptr; d->len = org->len; d->shared = shared; @@ -4319,20 +4329,63 @@ */ VALUE dvector_div_bang(VALUE ary, VALUE arg) { return dvector_apply_math_op2_bang(ary, arg, do_div); } +static char *fill_read_buffer(char **buff_ptr, int *len_ptr, FILE *file) { + char *buff, *new_buff; + buff = *buff_ptr; + int len, i, max_tries, line_len; + long filepos = ftell(file); + if (filepos == -1) { + printf("ftell failed\n"); + return NULL; + } + max_tries = 10; + for (i = 0; i < max_tries; i++) { + len = *len_ptr; + buff[len-1] = '1'; // mark the last character position + buff = fgets(buff, len, file); + if (buff == NULL) return NULL; // end of file + if (buff[len-1] != '\0') { + if (0) { + line_len = strlen(buff); + printf("len %i line_len %i\n", len, line_len); + if (line_len < 80) { + printf("line buff contains: %s\n", buff); + } else { + printf("line buff ends with: %s\n", buff+line_len-80); + } + } + return buff; + } + // ran out of room -- make buffer larger and try again + len = 10*len + 100; + //printf("fill_read_buffer ran out of room -- increase buffer len to %i and try again\n", len); + *len_ptr = len; + new_buff = (char *)realloc(buff, len); + if (new_buff == NULL) break; + buff = new_buff; + *buff_ptr = buff; + if (fseek(file, filepos, SEEK_SET) != 0) { + printf("fseek failed\n"); + return NULL; + } + } + return NULL; +} + PRIVATE /*======================================================================*/ VALUE Read_Dvectors(char *filename, VALUE destinations, int first_row_of_file, int number_of_rows) { FILE *file = NULL; VALUE col_obj, cols_obj, *cols_ptr = NULL; Dvector *d; double v; int last_row_of_file; - const int buff_len = 10000; - char buff[buff_len], *num_str, *pend, c, *cptr; + int buff_len = 100; + char *buff, *num_str, *pend, c, *cptr; int num_cols = 0, i, row, col, buff_loc, skip = first_row_of_file - 1; last_row_of_file = (number_of_rows == -1)? -1 : first_row_of_file + number_of_rows - 1; if ((last_row_of_file != -1 && last_row_of_file < first_row_of_file) || filename == NULL) return false; if (destinations != Qnil) { cols_obj = rb_Array(destinations); @@ -4353,19 +4406,25 @@ } } if ((file=fopen(filename,"r")) == NULL) { rb_raise(rb_eArgError, "ERROR: read cannot open %s", filename); } + buff = (char *)malloc(buff_len); + if (buff == NULL) { + fclose(file); + rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed"); + } for (i = 0; i < skip; i++) { /* skip over initial lines */ - if (fgets(buff, buff_len, file)==NULL) { + if (fill_read_buffer(&buff, &buff_len, file)==NULL) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: read reached end of file before reaching line %i in %s", first_row_of_file, filename); } } for (row = 0, i = first_row_of_file; last_row_of_file == -1 || i <= last_row_of_file; row++, i++) { - if (fgets(buff, buff_len, file)==NULL) break; /* have reached end of file */ + if (fill_read_buffer(&buff, &buff_len, file)==NULL) break; /* have reached end of file */ if (destinations == Qnil) { /* create destinations */ buff_loc = 0; while (true) { while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */ if (buff[buff_loc] == '\0') break; @@ -4384,16 +4443,18 @@ buff_loc = 0; for (col = 0; col < num_cols; col++) { while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */ if (buff[buff_loc] == '\0') { fclose(file); + free(buff); rb_raise(rb_eArgError, "read reached end of line looking for column %i in line %i of %s", col+1, i, filename); } num_str = buff+buff_loc; while (isgraph(buff[buff_loc])) buff_loc++; /* include non-blanks */ if (buff[buff_loc] == '\0') { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: read reached end of line looking for column %i in line %i of %s", col+1, i, filename); } col_obj = cols_ptr[col]; if (col_obj == Qnil) continue; Data_Get_Struct(col_obj, Dvector, d); @@ -4408,18 +4469,20 @@ cptr = pend+5; c = *cptr; pend[5] = ' '; pend[4] = pend[3]; pend[3] = pend[2]; pend[2] = pend[1]; pend[1] = pend[0]; pend[0] = 'E'; v = strtod(num_str,&pend); *cptr = c; buff_loc = pend - buff; } else { fclose(file); + free(buff); pend[0] = 0; rb_raise(rb_eArgError, "ERROR: unreadable value in file %s in line %i: %s", filename, i , buff+buff_loc); } } } if (!is_okay_number(v)) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: bad value %g in line %i of %s -- %s", v, i, filename, num_str); } if (row >= d->capa) Dvector_Store_Double(col_obj, row, v); else { @@ -4428,10 +4491,11 @@ d->ptr[row] = v; } } } fclose(file); + free(buff); return destinations; } PRIVATE /* @@ -4465,12 +4529,12 @@ VALUE Read_Rows_of_Dvectors(char *filename, VALUE destinations, int first_row_of_file) { FILE *file = NULL; VALUE row_obj, rows_obj, *rows_ptr = NULL; Dvector *d; double v, *row_data; - const int buff_len = 10000; - char buff[buff_len], *num_str, *pend, c, *cptr; + int buff_len = 1000; + char *buff, *num_str, *pend, c, *cptr; int num_rows = 0, i, row, col, buff_loc, c_loc, skip = first_row_of_file - 1; rows_obj = rb_Array(destinations); num_rows = RARRAY(rows_obj)->len; rows_ptr = RARRAY(rows_obj)->ptr; for (i = 0; i < num_rows; i++) { /* first pass to check validity */ @@ -4487,20 +4551,27 @@ d->len = 0; } if ((file=fopen(filename,"r")) == NULL) { rb_raise(rb_eArgError, "ERROR: read_rows cannot open %s", filename); } + buff = (char *)malloc(buff_len); + if (buff == NULL) { + fclose(file); + rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed"); + } for (i = 0; i < skip; i++) { /* skip over initial lines */ - if (fgets(buff, buff_len, file)==NULL) { + if (fill_read_buffer(&buff, &buff_len, file)==NULL) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: read_rows reached end of file before reaching line %i in %s", first_row_of_file, filename); } } for (row = 0, i = first_row_of_file; row < num_rows; row++, i++) { - if (fgets(buff, buff_len, file)==NULL) { + if (fill_read_buffer(&buff, &buff_len, file)==NULL) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: read_rows reached end of file at line %i in %s", i, filename); } row_obj = rows_ptr[row]; if (row_obj == Qnil) continue; d = Get_Dvector(row_obj); @@ -4527,10 +4598,11 @@ } } if (!is_okay_number(v)) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: bad value %g in line i% of file %s", v, i, filename); } if (col < d->capa) { row_data[col] = v; d->len = col+1; } else { Dvector_Store_Double(row_obj, col, v); @@ -4542,10 +4614,11 @@ REALLOC_N(d->ptr, double, col); d->capa = col; } } fclose(file); + free(buff); return destinations; } PRIVATE @@ -4567,34 +4640,42 @@ rb_raise(rb_eArgError, "wrong # of arguments(%d) for read_rows", argc); if (argc > 2) arg3 = NUM2INT(argv[2]); return Read_Rows_of_Dvectors(StringValueCStr(argv[0]),argv[1],arg3); klass = Qnil; } + PRIVATE VALUE Read_Row(char *filename, int row, VALUE row_ary) { FILE *file = NULL; - const int buff_len = 10000; - char buff[buff_len], *num_str, *pend, c, *cptr; + int buff_len = 1000; + char *buff, *num_str, *pend, c, *cptr; int i, col, buff_loc; double v; if (row <= 0) { rb_raise(rb_eArgError, "ERROR: read_row line must be positive (not %i) for file %s", row, filename); } if (filename == NULL || (file=fopen(filename,"r")) == NULL) { rb_raise(rb_eArgError, "ERROR: read_row cannot open %s", filename); } + buff = (char *)malloc(buff_len); + if (buff == NULL) { + fclose(file); + rb_raise(rb_eArgError, "ERROR: allocation of read buffer failed"); + } for (i = 0; i < row; i++) { /* read lines until reach desired row */ - if (fgets(buff, buff_len, file)==NULL) { + if (fill_read_buffer(&buff, &buff_len, file)==NULL) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: read_row reached end of file before reaching line %i in %s", row, filename); } } if (row_ary == Qnil) row_ary = dvector_new(); else if (is_a_dvector(row_ary)) dvector_clear(row_ary); else { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: destination for read_row must be a Dvector"); } buff_loc = 0; for (col = 0; ; col++) { while (isspace(buff[buff_loc])) buff_loc++; /* skip leading blanks */ @@ -4612,23 +4693,26 @@ cptr = pend+5; c = *cptr; pend[5] = ' '; pend[4] = pend[3]; pend[3] = pend[2]; pend[2] = pend[1]; pend[1] = pend[0]; pend[0] = 'E'; v = strtod(num_str,&pend); *cptr = c; buff_loc = pend - buff; } else { fclose(file); + free(buff); pend[0] = 0; rb_raise(rb_eArgError, "ERROR: unreadable value in file %s in line %i: %s", filename, i , buff+buff_loc); } } } if (!is_okay_number(v)) { fclose(file); + free(buff); rb_raise(rb_eArgError, "ERROR: bad value %g in line %i of file %s", v, i, filename); } Dvector_Store_Double(row_ary, col, v); } fclose(file); + free(buff); return row_ary; } PRIVATE /* @@ -5159,10 +5243,162 @@ } } return retval; } + +/* + :call-seq: + Dvector.fast_fancy_read(stream, options) => Array_of_Dvectors + + Reads data from an IO stream and separate it into columns of data + according to the _options_, a hash holding the following elements + (compulsory, but you can use FANCY_READ_DEFAULTS): + * 'sep': a regular expression that separate the entries + * 'comments': any line matching this will be skipped + * 'skip_first': skips that many lines before reading anything + * 'index_col': if true, the first column returned contains the + number of the line read + * 'remove_space': whether to remove spaces at the beginning of a line. *This + option is currently not implemented !* + * 'default': what to put when nothing was found but a number must be used + + As a side note, the read time is highly non-linear, which suggests that + the read is memory-allocation/copying-limited, at least for big files. + Well, the read time is non-linear for + + + An internal memory allocation with aggressive policy should solve that, + that is, not using directly Dvectors (and it would be way faster to store + anyway). +*/ +static VALUE dvector_fast_fancy_read(VALUE self, VALUE stream, VALUE options) +{ + /* First, we read up options: */ + double def = rb_num2dbl(rb_hash_aref(options, + rb_str_new2("default"))); + int remove_space = RTEST(rb_hash_aref(options, + rb_str_new2("remove_space"))); + int index_col = RTEST(rb_hash_aref(options, + rb_str_new2("index_col"))); + long skip_first = FIX2LONG(rb_hash_aref(options, + rb_str_new2("skip_first"))); + VALUE sep = rb_hash_aref(options, rb_str_new2("sep")); + VALUE comments = rb_hash_aref(options, rb_str_new2("comments")); + + /* Then, some various variables: */ + VALUE line; + + ID chomp_id = rb_intern("chomp!"); + ID gets_id = rb_intern("gets"); + long line_number = 0; + + /* + Now come the fun part - rudimentary vectors management + */ + int nb_vectors = 0; /* The number of vectors currently created */ + int current_size = 10; /* The number of slots available */ + double ** vectors = ALLOC_N(double *, current_size); + long index = 0; /* The current index in the vectors */ + int allocated_size = 5004; /* The size available in the vectors */ + + + int i; + + /* The return value */ + VALUE ary; + + /* We use a real gets so we can also rely on StringIO, for instance */ + while(RTEST(line = rb_funcall(stream, gets_id, 0))) { + VALUE pre, post, match; + const char * line_ptr; + int col = 0; + line_number++; + /* Whether we should skip the line... */ + if(skip_first >= line_number) + continue; + + /* We check for a blank line using isspace: */ + line_ptr = StringValueCStr(line); + while(line_ptr && *line_ptr) { + if(! isspace(*line_ptr)) + break; + line_ptr++; + } + if(! *line_ptr) + continue; /* We found a blank line */ + if(remove_space) /* We replace the contents of the line */ + line = rb_str_new2(line_ptr); + + /* ... or a comment line */ + if(RTEST(comments) && RTEST(rb_reg_match(comments, line))) + continue; + + /* Then, we remove the newline: */ + post = line; + rb_funcall(post, chomp_id, 0); + + /* We iterate over the different portions between + matches + */ + while(RTEST(post)) { + const char * a; + char * b; + if(RTEST(rb_reg_match(sep, post))) { + match = rb_gv_get("$~"); + pre = rb_reg_match_pre(match); + post = rb_reg_match_post(match); + } + else { + pre = post; + post = Qnil; + } + a = StringValueCStr(pre); + double c = strtod(a, &b); + if(b == a) + c = def; + if(col >= nb_vectors) { + nb_vectors++; + /* We need to create a new vector */ + if(col >= current_size) { /* Increase the available size */ + current_size += 5; + REALLOC_N(vectors, double * , current_size); + } + + double * vals = vectors[col] = ALLOC_N(double, allocated_size); + /* Filling it with the default value */ + for(i = 0; i < index; i++) { + vals[i] = def; + } + } + vectors[col][index] = c; + col++; + } + /* Now, we finish the line */ + for(; col < nb_vectors; col++) + vectors[col][index] = def; + index++; + /* Now, we reallocate memory if necessary */ + if(index >= allocated_size) { + allocated_size *= 2; /* We double the size */ + for(col = 0; col < nb_vectors; col++) + REALLOC_N(vectors[col], double, allocated_size); + } + } + /* Now, we make up the array */ + ary = rb_ary_new(); + for(i = 0; i < nb_vectors; i++) { + /* We create a vector */ + rb_ary_store(ary, i, make_dvector_from_data(cDvector, index, vectors[i])); + /* And free the memory */ + free(vectors[i]); + } + free(vectors); + return ary; +} + + /* * Document-class: Dobjects::Dvector * * Dvectors are a specialized implementation of one-dimensional arrays of double precision floating point numbers. * They are intended for use in applications needing efficient processing of large vectors of numeric data. @@ -5234,11 +5470,12 @@ rb_define_singleton_method(cDvector, "max_of_many", dvector_max_of_many, 1); rb_define_singleton_method(cDvector, "is_a_dvector", dvector_is_a_dvector, 1); - rb_define_method(cDvector, "make_bezier_control_points_for_cubic_in_x", dvector_make_bezier_control_points_for_cubic_in_x, 6); + rb_define_method(cDvector, "make_bezier_control_points_for_cubic_in_x", + dvector_make_bezier_control_points_for_cubic_in_x, 6); rb_define_method(cDvector, "initialize", dvector_initialize, -1); rb_define_method(cDvector, "initialize_copy", dvector_replace, 1); rb_define_method(cDvector, "tridag", dvector_tridag, 4); @@ -5472,9 +5709,14 @@ rb_define_method(cDvector, "_dump", dvector_dump, 1); rb_define_singleton_method(cDvector, "_load", dvector_load, 1); /* simple convolution */ rb_define_method(cDvector, "convolve", dvector_convolve, 2); + + /* Fast fancy read: */ + rb_define_singleton_method(cDvector, "fast_fancy_read", + dvector_fast_fancy_read, 2); + dvector_output_fs = Qnil; rb_global_variable(&dvector_output_fs); dvector_output_fs = rb_str_new2(" ");