ext/fastcsv/fastcsv.c in fastcsv-0.0.1 vs ext/fastcsv/fastcsv.c in fastcsv-0.0.2
- old
+ new
@@ -14,51 +14,76 @@
// http://rxr.whitequark.org/mri/source
// Ragel help.
// https://www.mail-archive.com/ragel-users@complang.org/
-# define ASSOCIATE_INDEX \
- if (internal_index >= 0) { \
- rb_enc_associate_index(field, internal_index); \
- field = rb_str_encode(field, rb_enc_from_encoding(external_encoding), 0, Qnil); \
- } \
- else { \
- rb_enc_associate_index(field, rb_enc_to_index(external_encoding)); \
- }
+#define ENCODE \
+if (enc2 != NULL) { \
+ field = rb_str_encode(field, rb_enc_from_encoding(enc), 0, Qnil); \
+}
static VALUE mModule, rb_eParseError;
-static ID s_read, s_to_str;
+static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string, s_encoding;
-#line 139 "ext/fastcsv/fastcsv.rl"
+#line 125 "ext/fastcsv/fastcsv.rl"
-#line 37 "ext/fastcsv/fastcsv.c"
+#line 33 "ext/fastcsv/fastcsv.c"
static const int fastcsv_start = 4;
static const int fastcsv_first_final = 4;
static const int fastcsv_error = 0;
static const int fastcsv_en_main = 4;
-#line 142 "ext/fastcsv/fastcsv.rl"
+#line 128 "ext/fastcsv/fastcsv.rl"
+// 16 kB
#define BUFSIZE 16384
+// @see http://rxr.whitequark.org/mri/source/io.c#4845
+static void
+rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode)
+{
+ int default_ext = 0;
+
+ if (ext == NULL) {
+ ext = rb_default_external_encoding();
+ default_ext = 1;
+ }
+ if (ext == rb_ascii8bit_encoding()) {
+ /* If external is ASCII-8BIT, no transcoding */
+ intern = NULL;
+ }
+ else if (intern == NULL) {
+ intern = rb_default_internal_encoding();
+ }
+ if (intern == NULL || intern == (rb_encoding *)Qnil || intern == ext) {
+ /* No internal encoding => use external + no transcoding */
+ *enc = (default_ext && intern != ext) ? NULL : ext;
+ *enc2 = NULL;
+ }
+ else {
+ *enc = intern;
+ *enc2 = ext;
+ }
+}
+
VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
int cs, act, have = 0, curline = 1, io = 0;
char *ts = 0, *te = 0, *buf = 0, *eof = 0;
VALUE port, opts;
VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
- int internal_index = 0, external_index = rb_enc_to_index(rb_default_external_encoding());
- rb_encoding *external_encoding = rb_default_external_encoding();
+ rb_encoding *enc = NULL, *enc2 = NULL, *encoding = NULL;
+ VALUE r_encoding;
VALUE option;
- char quote_char = '"'; //, *col_sep = ",", *row_sep = "\r\n";
+ char quote_char = '"';
rb_scan_args(argc, argv, "11", &port, &opts);
taint = OBJ_TAINTED(port);
io = rb_respond_to(port, s_read);
if (!io) {
@@ -76,80 +101,115 @@
}
else if (TYPE(opts) != T_HASH) {
rb_raise(rb_eArgError, "options has to be a Hash or nil");
}
- // @note Add machines for common CSV dialects, or see if we can use "when"
- // from Chapter 6 to compare the character to the host program's variable.
- // option = rb_hash_aref(opts, ID2SYM(rb_intern("quote_char")));
- // if (TYPE(option) == T_STRING && RSTRING_LEN(option) == 1) {
- // quote_char = *StringValueCStr(option);
- // }
- // else if (!NIL_P(option)) {
- // rb_raise(rb_eArgError, ":quote_char has to be a single character String");
- // }
+ // @see rb_io_extract_modeenc
+ /* Set to defaults */
+ rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0);
- // option = rb_hash_aref(opts, ID2SYM(rb_intern("col_sep")));
- // if (TYPE(option) == T_STRING) {
- // col_sep = StringValueCStr(option);
- // }
- // else if (!NIL_P(option)) {
- // rb_raise(rb_eArgError, ":col_sep has to be a String");
- // }
-
- // option = rb_hash_aref(opts, ID2SYM(rb_intern("row_sep")));
- // if (TYPE(option) == T_STRING) {
- // row_sep = StringValueCStr(option);
- // }
- // else if (!NIL_P(option)) {
- // rb_raise(rb_eArgError, ":row_sep has to be a String");
- // }
-
+ // "enc" (internal) or "enc2:enc" (external:internal) or "enc:-" (external).
+ // We don't support binmode, which would force "ASCII-8BIT", or "BOM|UTF-*".
+ // @see http://ruby-doc.org/core-2.1.1/IO.html#method-c-new-label-Open+Mode
option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
if (TYPE(option) == T_STRING) {
- // @see parse_mode_enc in Ruby's io.c
- const char *string = StringValueCStr(option), *pointer;
- char internal_encoding_name[ENCODING_MAXNAMELEN + 1];
+ // parse_mode_enc is not in header file.
+ const char *estr = StringValueCStr(option), *ptr;
+ char encname[ENCODING_MAXNAMELEN+1];
+ int idx, idx2;
+ rb_encoding *ext_enc, *int_enc;
- pointer = strrchr(string, ':');
- if (pointer) {
- long len = (pointer++) - string;
+ /* parse estr as "enc" or "enc2:enc" or "enc:-" */
+
+ ptr = strrchr(estr, ':');
+ if (ptr) {
+ long len = (ptr++) - estr;
if (len == 0 || len > ENCODING_MAXNAMELEN) {
- internal_index = -1;
+ idx = -1;
}
else {
- memcpy(internal_encoding_name, string, len);
- internal_encoding_name[len] = '\0';
- string = internal_encoding_name;
- internal_index = rb_enc_find_index(internal_encoding_name);
+ memcpy(encname, estr, len);
+ encname[len] = '\0';
+ estr = encname;
+ idx = rb_enc_find_index(encname);
}
}
else {
- internal_index = rb_enc_find_index(string);
+ idx = rb_enc_find_index(estr);
}
- if (internal_index < 0 && internal_index != -2) {
- rb_warn("Unsupported encoding %s ignored", string);
+ if (idx >= 0) {
+ ext_enc = rb_enc_from_index(idx);
}
+ else {
+ if (idx != -2) {
+ // `unsupported_encoding` is not in header file.
+ rb_warn("Unsupported encoding %s ignored", estr);
+ }
+ ext_enc = NULL;
+ }
- if (pointer) {
- external_index = rb_enc_find_index(pointer);
- if (external_index >= 0) {
- external_encoding = rb_enc_from_index(external_index);
+ int_enc = NULL;
+ if (ptr) {
+ if (*ptr == '-' && *(ptr+1) == '\0') {
+ /* Special case - "-" => no transcoding */
+ int_enc = (rb_encoding *)Qnil;
}
else {
- rb_warn("Unsupported encoding %s ignored", string);
+ idx2 = rb_enc_find_index(ptr);
+ if (idx2 < 0) {
+ // `unsupported_encoding` is not in header file.
+ rb_warn("Unsupported encoding %s ignored", ptr);
+ }
+ else if (idx2 == idx) {
+ int_enc = (rb_encoding *)Qnil;
+ }
+ else {
+ int_enc = rb_enc_from_index(idx2);
+ }
}
}
- else if (internal_index >= 0) {
- external_encoding = rb_enc_from_index(internal_index);
- }
+
+ rb_io_ext_int_to_encs(ext_enc, int_enc, &enc, &enc2, 0);
}
else if (!NIL_P(option)) {
rb_raise(rb_eArgError, ":encoding has to be a String");
}
+ // @see https://github.com/ruby/ruby/blob/70510d026f8d86693dccaba07417488eed09b41d/lib/csv.rb#L1567
+ // @see https://github.com/ruby/ruby/blob/70510d026f8d86693dccaba07417488eed09b41d/lib/csv.rb#L2300
+ if (rb_respond_to(port, s_internal_encoding)) {
+ r_encoding = rb_funcall(port, s_internal_encoding, 0);
+ if (NIL_P(r_encoding)) {
+ r_encoding = rb_funcall(port, s_external_encoding, 0);
+ }
+ }
+ else if (rb_respond_to(port, s_string)) {
+ r_encoding = rb_funcall(rb_funcall(port, s_string, 0), s_encoding, 0);
+ }
+ else if (rb_respond_to(port, s_encoding)) {
+ r_encoding = rb_funcall(port, s_encoding, 0);
+ }
+ else {
+ r_encoding = rb_enc_from_encoding(rb_ascii8bit_encoding());
+ }
+ if (NIL_P(r_encoding)) {
+ r_encoding = rb_enc_from_encoding(rb_default_internal_encoding());
+ }
+ if (NIL_P(r_encoding)) {
+ r_encoding = rb_enc_from_encoding(rb_default_external_encoding());
+ }
+ if (enc2 != NULL) {
+ encoding = enc2;
+ }
+ else if (enc != NULL) {
+ encoding = enc;
+ }
+ else if (!NIL_P(r_encoding)) {
+ encoding = rb_enc_get(r_encoding);
+ }
+
buffer_size = BUFSIZE;
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
if (!NIL_P(bufsize)) {
buffer_size = NUM2INT(bufsize);
@@ -159,19 +219,19 @@
if (io) {
buf = ALLOC_N(char, buffer_size);
}
-#line 165 "ext/fastcsv/fastcsv.c"
+#line 225 "ext/fastcsv/fastcsv.c"
{
cs = fastcsv_start;
ts = 0;
te = 0;
act = 0;
}
-#line 261 "ext/fastcsv/fastcsv.rl"
+#line 311 "ext/fastcsv/fastcsv.rl"
while (!done) {
VALUE str;
char *p, *pe;
int len, space = buffer_size - have, tokstart_diff, tokend_diff;
@@ -215,16 +275,12 @@
p[len++] = 0;
done = 1;
}
pe = p + len;
- // if (done) {
- // // This triggers the eof action in the non-scanner version.
- // eof = pe;
- // }
-#line 226 "ext/fastcsv/fastcsv.c"
+#line 282 "ext/fastcsv/fastcsv.c"
{
if ( p == pe )
goto _test_eof;
switch ( cs )
{
@@ -239,58 +295,58 @@
break;
}
}
goto st4;
tr10:
-#line 105 "ext/fastcsv/fastcsv.rl"
+#line 101 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) {
rb_ary_push(row, field);
}
if (RARRAY_LEN(row)) {
rb_yield(row);
}
}
-#line 129 "ext/fastcsv/fastcsv.rl"
+#line 123 "ext/fastcsv/fastcsv.rl"
{te = p+1;}
goto st4;
tr16:
-#line 129 "ext/fastcsv/fastcsv.rl"
+#line 123 "ext/fastcsv/fastcsv.rl"
{te = p;p--;}
goto st4;
tr17:
-#line 128 "ext/fastcsv/fastcsv.rl"
+#line 122 "ext/fastcsv/fastcsv.rl"
{te = p;p--;}
goto st4;
tr18:
-#line 105 "ext/fastcsv/fastcsv.rl"
+#line 101 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) {
rb_ary_push(row, field);
}
if (RARRAY_LEN(row)) {
rb_yield(row);
}
}
-#line 128 "ext/fastcsv/fastcsv.rl"
+#line 122 "ext/fastcsv/fastcsv.rl"
{te = p+1;}
goto st4;
tr20:
-#line 127 "ext/fastcsv/fastcsv.rl"
+#line 121 "ext/fastcsv/fastcsv.rl"
{te = p;p--;}
goto st4;
tr21:
-#line 105 "ext/fastcsv/fastcsv.rl"
+#line 101 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) {
rb_ary_push(row, field);
}
if (RARRAY_LEN(row)) {
rb_yield(row);
}
}
-#line 127 "ext/fastcsv/fastcsv.rl"
+#line 121 "ext/fastcsv/fastcsv.rl"
{te = p+1;}
goto st4;
st4:
#line 1 "NONE"
{ts = 0;}
@@ -299,11 +355,11 @@
if ( ++p == pe )
goto _test_eof4;
case 4:
#line 1 "NONE"
{ts = p;}
-#line 305 "ext/fastcsv/fastcsv.c"
+#line 361 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 0: goto tr14;
case 10: goto tr3;
case 13: goto tr4;
case 34: goto tr15;
@@ -323,244 +379,244 @@
}
goto st1;
tr2:
#line 1 "NONE"
{te = p+1;}
-#line 44 "ext/fastcsv/fastcsv.rl"
+#line 40 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
// Unquoted empty fields are nil, not "", in Ruby.
field = Qnil;
}
else if (p > ts) {
- field = rb_str_new(ts, p - ts);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(ts, p - ts, encoding);
+ ENCODE;
}
}
-#line 105 "ext/fastcsv/fastcsv.rl"
+#line 101 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) {
rb_ary_push(row, field);
}
if (RARRAY_LEN(row)) {
rb_yield(row);
}
}
-#line 129 "ext/fastcsv/fastcsv.rl"
+#line 123 "ext/fastcsv/fastcsv.rl"
{act = 3;}
goto st5;
st5:
if ( ++p == pe )
goto _test_eof5;
case 5:
-#line 356 "ext/fastcsv/fastcsv.c"
+#line 412 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 0: goto tr2;
case 10: goto tr3;
case 13: goto tr4;
case 34: goto tr16;
case 44: goto tr5;
}
goto st1;
tr3:
-#line 44 "ext/fastcsv/fastcsv.rl"
+#line 40 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
// Unquoted empty fields are nil, not "", in Ruby.
field = Qnil;
}
else if (p > ts) {
- field = rb_str_new(ts, p - ts);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(ts, p - ts, encoding);
+ ENCODE;
}
}
-#line 95 "ext/fastcsv/fastcsv.rl"
+#line 91 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
rb_ary_push(row, field);
field = Qnil;
}
rb_yield(row);
row = rb_ary_new();
}
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st6;
tr19:
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st6;
tr11:
-#line 95 "ext/fastcsv/fastcsv.rl"
+#line 91 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
rb_ary_push(row, field);
field = Qnil;
}
rb_yield(row);
row = rb_ary_new();
}
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st6;
st6:
if ( ++p == pe )
goto _test_eof6;
case 6:
-#line 418 "ext/fastcsv/fastcsv.c"
+#line 474 "ext/fastcsv/fastcsv.c"
if ( (*p) == 0 )
goto tr18;
goto tr17;
tr4:
-#line 44 "ext/fastcsv/fastcsv.rl"
+#line 40 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
// Unquoted empty fields are nil, not "", in Ruby.
field = Qnil;
}
else if (p > ts) {
- field = rb_str_new(ts, p - ts);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(ts, p - ts, encoding);
+ ENCODE;
}
}
-#line 95 "ext/fastcsv/fastcsv.rl"
+#line 91 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
rb_ary_push(row, field);
field = Qnil;
}
rb_yield(row);
row = rb_ary_new();
}
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st7;
tr12:
-#line 95 "ext/fastcsv/fastcsv.rl"
+#line 91 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
rb_ary_push(row, field);
field = Qnil;
}
rb_yield(row);
row = rb_ary_new();
}
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st7;
st7:
if ( ++p == pe )
goto _test_eof7;
case 7:
-#line 469 "ext/fastcsv/fastcsv.c"
+#line 525 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 0: goto tr18;
case 10: goto tr19;
}
goto tr17;
tr5:
-#line 44 "ext/fastcsv/fastcsv.rl"
+#line 40 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
// Unquoted empty fields are nil, not "", in Ruby.
field = Qnil;
}
else if (p > ts) {
- field = rb_str_new(ts, p - ts);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(ts, p - ts, encoding);
+ ENCODE;
}
}
-#line 90 "ext/fastcsv/fastcsv.rl"
+#line 86 "ext/fastcsv/fastcsv.rl"
{
rb_ary_push(row, field);
field = Qnil;
}
goto st8;
tr13:
-#line 90 "ext/fastcsv/fastcsv.rl"
+#line 86 "ext/fastcsv/fastcsv.rl"
{
rb_ary_push(row, field);
field = Qnil;
}
goto st8;
st8:
if ( ++p == pe )
goto _test_eof8;
case 8:
-#line 504 "ext/fastcsv/fastcsv.c"
+#line 560 "ext/fastcsv/fastcsv.c"
if ( (*p) == 0 )
goto tr21;
goto tr20;
tr14:
#line 1 "NONE"
{te = p+1;}
-#line 105 "ext/fastcsv/fastcsv.rl"
+#line 101 "ext/fastcsv/fastcsv.rl"
{
if (!NIL_P(field) || RARRAY_LEN(row)) {
rb_ary_push(row, field);
}
if (RARRAY_LEN(row)) {
rb_yield(row);
}
}
-#line 44 "ext/fastcsv/fastcsv.rl"
+#line 40 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
// Unquoted empty fields are nil, not "", in Ruby.
field = Qnil;
}
else if (p > ts) {
- field = rb_str_new(ts, p - ts);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(ts, p - ts, encoding);
+ ENCODE;
}
}
-#line 129 "ext/fastcsv/fastcsv.rl"
+#line 123 "ext/fastcsv/fastcsv.rl"
{act = 3;}
goto st9;
st9:
if ( ++p == pe )
goto _test_eof9;
case 9:
-#line 538 "ext/fastcsv/fastcsv.c"
+#line 594 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 10: goto tr16;
case 13: goto tr16;
case 34: goto tr16;
case 44: goto tr16;
}
goto st1;
tr8:
-#line 32 "ext/fastcsv/fastcsv.rl"
+#line 28 "ext/fastcsv/fastcsv.rl"
{
curline++;
}
goto st2;
tr15:
-#line 36 "ext/fastcsv/fastcsv.rl"
+#line 32 "ext/fastcsv/fastcsv.rl"
{
unclosed_line = curline;
}
goto st2;
st2:
if ( ++p == pe )
goto _test_eof2;
case 2:
-#line 562 "ext/fastcsv/fastcsv.c"
+#line 618 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 0: goto st0;
case 10: goto tr8;
case 13: goto tr8;
case 34: goto tr9;
@@ -568,15 +624,15 @@
goto st2;
st0:
cs = 0;
goto _out;
tr9:
-#line 55 "ext/fastcsv/fastcsv.rl"
+#line 51 "ext/fastcsv/fastcsv.rl"
{
if (p == ts) {
- field = rb_str_new2("");
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new("", 0, encoding);
+ ENCODE;
}
// @note If we add an action on '""', we can skip some steps if no '""' is found.
else if (p > ts) {
// Operating on ts in-place produces odd behavior, FYI.
char *copy = ALLOC_N(char, p - ts);
@@ -595,28 +651,28 @@
*writer++ = *reader;
}
reader++;
}
- field = rb_str_new(copy, writer - copy);
- ASSOCIATE_INDEX;
+ field = rb_enc_str_new(copy, writer - copy, enc);
+ ENCODE;
if (copy != NULL) {
free(copy);
}
}
}
-#line 40 "ext/fastcsv/fastcsv.rl"
+#line 36 "ext/fastcsv/fastcsv.rl"
{
unclosed_line = 0;
}
goto st3;
st3:
if ( ++p == pe )
goto _test_eof3;
case 3:
-#line 618 "ext/fastcsv/fastcsv.c"
+#line 674 "ext/fastcsv/fastcsv.c"
switch( (*p) ) {
case 0: goto tr10;
case 10: goto tr11;
case 13: goto tr12;
case 34: goto st2;
@@ -648,11 +704,11 @@
}
_out: {}
}
-#line 313 "ext/fastcsv/fastcsv.rl"
+#line 359 "ext/fastcsv/fastcsv.rl"
if (done && cs < fastcsv_first_final) {
if (buf != NULL) {
free(buf);
}
@@ -687,9 +743,13 @@
}
void Init_fastcsv() {
s_read = rb_intern("read");
s_to_str = rb_intern("to_str");
+ s_internal_encoding = rb_intern("internal_encoding");
+ s_external_encoding = rb_intern("external_encoding");
+ s_string = rb_intern("string");
+ s_encoding = rb_intern("encoding");
mModule = rb_define_module("FastCSV");
rb_define_attr(rb_singleton_class(mModule), "buffer_size", 1, 1);
rb_define_singleton_method(mModule, "raw_parse", fastcsv, -1);
rb_eParseError = rb_define_class_under(mModule, "ParseError", rb_eStandardError);