#include #include // CSV specifications. // http://tools.ietf.org/html/rfc4180 // http://w3c.github.io/csvw/syntax/#ebnf // CSV implementation. // https://github.com/ruby/ruby/blob/master/lib/csv.rb // Ruby C extensions help. // https://github.com/ruby/ruby/blob/trunk/README.EXT // http://rxr.whitequark.org/mri/source // Ragel help. // https://www.mail-archive.com/ragel-users@complang.org/ #define ENCODE \ if (enc2 != NULL) { \ field = rb_str_encode(field, rb_enc_from_encoding(enc), 0, Qnil); \ } static VALUE mModule, rb_eParseError; static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string, s_encoding; %%{ machine fastcsv; action new_line { curline++; } action open_quote { unclosed_line = curline; } action close_quote { unclosed_line = 0; } action read_unquoted { if (p == ts) { // Unquoted empty fields are nil, not "", in Ruby. field = Qnil; } else if (p > ts) { field = rb_enc_str_new(ts, p - ts, encoding); ENCODE; } } action read_quoted { if (p == ts) { field = rb_enc_str_new("", 0, encoding); ENCODE; } // @note If we add an action on '""', we can skip some steps if no '""' is found. else if (p > ts) { // Operating on ts in-place produces odd behavior, FYI. char *copy = ALLOC_N(char, p - ts); memcpy(copy, ts, p - ts); char *reader = ts, *writer = copy; int escaped = 0; while (p > reader) { if (*reader == quote_char && !escaped) { // Skip the escaping character. escaped = 1; } else { escaped = 0; *writer++ = *reader; } reader++; } field = rb_enc_str_new(copy, writer - copy, enc); ENCODE; if (copy != NULL) { free(copy); } } } action new_field { rb_ary_push(row, field); field = Qnil; } action new_row { if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field rb_ary_push(row, field); field = Qnil; } rb_yield(row); row = rb_ary_new(); } action last_row { if (!NIL_P(field) || RARRAY_LEN(row)) { rb_ary_push(row, field); } if (RARRAY_LEN(row)) { rb_yield(row); } } EOF = 0 >last_row; quote_char = '"'; col_sep = ',' >new_field; row_sep = ('\r' '\n'? | '\n') @new_line; unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted; quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote; field = unquoted | quoted; # @see Ragel Guide: 6.3 Scanners # An unquoted field can be zero-length. main := |* field col_sep EOF?; field row_sep >new_row EOF?; field EOF; *|; }%% %% write data; // 16 kB #define BUFSIZE 16384 // @see http://rxr.whitequark.org/mri/source/io.c#4845 static void rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode) { int default_ext = 0; if (ext == NULL) { ext = rb_default_external_encoding(); default_ext = 1; } if (ext == rb_ascii8bit_encoding()) { /* If external is ASCII-8BIT, no transcoding */ intern = NULL; } else if (intern == NULL) { intern = rb_default_internal_encoding(); } if (intern == NULL || intern == (rb_encoding *)Qnil || intern == ext) { /* No internal encoding => use external + no transcoding */ *enc = (default_ext && intern != ext) ? NULL : ext; *enc2 = NULL; } else { *enc = intern; *enc2 = ext; } } VALUE fastcsv(int argc, VALUE *argv, VALUE self) { int cs, act, have = 0, curline = 1, io = 0; char *ts = 0, *te = 0, *buf = 0, *eof = 0; VALUE port, opts; VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil; int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0; rb_encoding *enc = NULL, *enc2 = NULL, *encoding = NULL; VALUE r_encoding; VALUE option; char quote_char = '"'; rb_scan_args(argc, argv, "11", &port, &opts); taint = OBJ_TAINTED(port); io = rb_respond_to(port, s_read); if (!io) { if (rb_respond_to(port, s_to_str)) { port = rb_funcall(port, s_to_str, 0); StringValue(port); } else { rb_raise(rb_eArgError, "data has to respond to #read or #to_str"); } } if (NIL_P(opts)) { opts = rb_hash_new(); } else if (TYPE(opts) != T_HASH) { rb_raise(rb_eArgError, "options has to be a Hash or nil"); } // @see rb_io_extract_modeenc /* Set to defaults */ rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0); // "enc" (internal) or "enc2:enc" (external:internal) or "enc:-" (external). // We don't support binmode, which would force "ASCII-8BIT", or "BOM|UTF-*". // @see http://ruby-doc.org/core-2.1.1/IO.html#method-c-new-label-Open+Mode option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding"))); if (TYPE(option) == T_STRING) { // parse_mode_enc is not in header file. const char *estr = StringValueCStr(option), *ptr; char encname[ENCODING_MAXNAMELEN+1]; int idx, idx2; rb_encoding *ext_enc, *int_enc; /* parse estr as "enc" or "enc2:enc" or "enc:-" */ ptr = strrchr(estr, ':'); if (ptr) { long len = (ptr++) - estr; if (len == 0 || len > ENCODING_MAXNAMELEN) { idx = -1; } else { memcpy(encname, estr, len); encname[len] = '\0'; estr = encname; idx = rb_enc_find_index(encname); } } else { idx = rb_enc_find_index(estr); } if (idx >= 0) { ext_enc = rb_enc_from_index(idx); } else { if (idx != -2) { // `unsupported_encoding` is not in header file. rb_warn("Unsupported encoding %s ignored", estr); } ext_enc = NULL; } int_enc = NULL; if (ptr) { if (*ptr == '-' && *(ptr+1) == '\0') { /* Special case - "-" => no transcoding */ int_enc = (rb_encoding *)Qnil; } else { idx2 = rb_enc_find_index(ptr); if (idx2 < 0) { // `unsupported_encoding` is not in header file. rb_warn("Unsupported encoding %s ignored", ptr); } else if (idx2 == idx) { int_enc = (rb_encoding *)Qnil; } else { int_enc = rb_enc_from_index(idx2); } } } rb_io_ext_int_to_encs(ext_enc, int_enc, &enc, &enc2, 0); } else if (!NIL_P(option)) { rb_raise(rb_eArgError, ":encoding has to be a String"); } // @see https://github.com/ruby/ruby/blob/70510d026f8d86693dccaba07417488eed09b41d/lib/csv.rb#L1567 // @see https://github.com/ruby/ruby/blob/70510d026f8d86693dccaba07417488eed09b41d/lib/csv.rb#L2300 if (rb_respond_to(port, s_internal_encoding)) { r_encoding = rb_funcall(port, s_internal_encoding, 0); if (NIL_P(r_encoding)) { r_encoding = rb_funcall(port, s_external_encoding, 0); } } else if (rb_respond_to(port, s_string)) { r_encoding = rb_funcall(rb_funcall(port, s_string, 0), s_encoding, 0); } else if (rb_respond_to(port, s_encoding)) { r_encoding = rb_funcall(port, s_encoding, 0); } else { r_encoding = rb_enc_from_encoding(rb_ascii8bit_encoding()); } if (NIL_P(r_encoding)) { r_encoding = rb_enc_from_encoding(rb_default_internal_encoding()); } if (NIL_P(r_encoding)) { r_encoding = rb_enc_from_encoding(rb_default_external_encoding()); } if (enc2 != NULL) { encoding = enc2; } else if (enc != NULL) { encoding = enc; } else if (!NIL_P(r_encoding)) { encoding = rb_enc_get(r_encoding); } buffer_size = BUFSIZE; if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) { bufsize = rb_ivar_get(self, rb_intern("@buffer_size")); if (!NIL_P(bufsize)) { buffer_size = NUM2INT(bufsize); } } if (io) { buf = ALLOC_N(char, buffer_size); } %% write init; while (!done) { VALUE str; char *p, *pe; int len, space = buffer_size - have, tokstart_diff, tokend_diff; if (io) { if (space == 0) { tokstart_diff = ts - buf; tokend_diff = te - buf; buffer_size += BUFSIZE; REALLOC_N(buf, char, buffer_size); space = buffer_size - have; ts = buf + tokstart_diff; te = buf + tokend_diff; } p = buf + have; str = rb_funcall(port, s_read, 1, INT2FIX(space)); if (NIL_P(str)) { // StringIO#read returns nil for empty string. len = 0; } else { len = RSTRING_LEN(str); memcpy(p, StringValuePtr(str), len); } if (len < space) { // EOF actions don't work in scanners, so we add a sentinel value. // @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html // @see https://github.com/leeonix/lua-csv-ragel/blob/master/src/csv.rl p[len++] = 0; done = 1; } } else { p = RSTRING_PTR(port); len = RSTRING_LEN(port); p[len++] = 0; done = 1; } pe = p + len; %% write exec; if (done && cs < fastcsv_first_final) { if (buf != NULL) { free(buf); } if (unclosed_line) { rb_raise(rb_eParseError, "Unclosed quoted field on line %d.", unclosed_line); } // Ruby raises different errors for illegal quoting, depending on whether // a quoted string is followed by a string ("Unclosed quoted field on line // %d.") or by a string ending in a quote ("Missing or stray quote in line // %d"). These precisions are kind of bogus, but we can try using $!. else { rb_raise(rb_eParseError, "Illegal quoting in line %d.", curline); } } if (ts == 0) { have = 0; } else if (io) { have = pe - ts; memmove(buf, ts, have); te = buf + (te - ts); ts = buf; } } if (buf != NULL) { free(buf); } return Qnil; } void Init_fastcsv() { s_read = rb_intern("read"); s_to_str = rb_intern("to_str"); s_internal_encoding = rb_intern("internal_encoding"); s_external_encoding = rb_intern("external_encoding"); s_string = rb_intern("string"); s_encoding = rb_intern("encoding"); mModule = rb_define_module("FastCSV"); rb_define_attr(rb_singleton_class(mModule), "buffer_size", 1, 1); rb_define_singleton_method(mModule, "raw_parse", fastcsv, -1); rb_eParseError = rb_define_class_under(mModule, "ParseError", rb_eStandardError); }