/* url-encoded parsing */

#include "nyara.h"
#include <ctype.h>
// bool is used below; included explicitly so this file does not rely on
// another header pulling it in
#include <stdbool.h>

// Convert one hex digit to its numeric value (0..15).
// Returns -1 when c is not a hex digit.
// The return type is int on purpose: plain char may be unsigned, in which
// case a -1 stored in it can never compare below zero.
static int _half_octet(char c) {
  // there's a faster way but not validating the range:
  // #define hex2c(c) ((c | 32) % 39 - 9)
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}

// Decode the "%XX" escape whose '%' sits at s[i].
// Returns the decoded byte (0..255), or -1 when fewer than two characters
// follow the '%' or either of them is not a hex digit.
static int _percent_octet(const char* s, long i, long len) {
  if (i + 2 >= len) {
    return -1;
  }
  int hi = _half_octet(s[i + 1]);
  if (hi < 0) {
    return -1;
  }
  int lo = _half_octet(s[i + 2]);
  if (lo < 0) {
    return -1;
  }
  return (hi << 4) | lo;
}

// Append the literal bytes s[from...to) to output; no-op for an empty range.
static void _cat_run(VALUE output, const char* s, long from, long to) {
  if (to > from) {
    rb_str_cat(output, s + from, to - from);
  }
}

// Append a single raw byte to output.
static void _cat_byte(VALUE output, unsigned char byte) {
  rb_str_cat(output, (const char*)&byte, 1);
}

// Return a pointer to the first occurrence of c within the first len bytes
// of s, or NULL when there is none.
static const char* _strnchr(const char* s, long len, char c) {
  for (long i = 0; i < len; i++) {
    if (s[i] == c) {
      return s + i;
    }
  }
  return NULL;
}

// Decode s[0...len) into output until stop_char or the end of input:
// - a well-formed "%XX" becomes the byte it encodes
// - a malformed '%' is copied literally
// - '+' becomes a space
// Returns the number of input bytes consumed; a stop_char that was hit is
// counted as consumed but is not appended to output.
static long _decode_url_seg(VALUE output, const char* s, long len, char stop_char) {
  long run_start = 0; // index of the first literal byte not yet appended

  for (long i = 0; i < len; i++) {
    if (s[i] == '%') {
      int octet = _percent_octet(s, i, len);
      if (octet < 0) {
        continue; // malformed escape: '%' stays part of the literal run
      }
      _cat_run(output, s, run_start, i);
      _cat_byte(output, (unsigned char)octet);
      i += 2;
      run_start = i + 1;
    } else if (s[i] == stop_char) {
      _cat_run(output, s, run_start, i);
      return i + 1;
    } else if (s[i] == '+') {
      _cat_run(output, s, run_start, i);
      rb_str_cat(output, " ", 1);
      // the next literal run starts after the '+', not on it
      run_start = i + 1;
    }
  }

  _cat_run(output, s, run_start, len);
  return len;
}

// s should contain no space
// return parsed len, s + return == start of query
// NOTE it's similar to _decode_url_seg, but:
// - "+" is not escaped
// - matrix uri params (segments starting with ";") are ignored
long nyara_parse_path(VALUE output, const char* s, long len) {
  long run_start = 0; // index of the first literal byte not yet appended

  for (long i = 0; i < len; i++) {
    if (s[i] == '%') {
      int octet = _percent_octet(s, i, len);
      if (octet < 0) {
        continue; // malformed escape: '%' stays part of the literal run
      }
      _cat_run(output, s, run_start, i);
      _cat_byte(output, (unsigned char)octet);
      i += 2;
      run_start = i + 1;
    } else if (s[i] == ';') {
      // skip matrix uri params: everything up to and including the next '?'
      _cat_run(output, s, run_start, i);
      const char* query_mark = _strnchr(s + i + 1, len - i - 1, '?');
      return query_mark ? (long)(query_mark - s) + 1 : len;
    } else if (s[i] == '?') {
      _cat_run(output, s, run_start, i);
      return i + 1;
    }
  }

  _cat_run(output, s, run_start, len);
  return len;
}

// Ruby binding for nyara_parse_path: appends the decoded path to output and
// returns the number of bytes of input that were consumed.
static VALUE ext_parse_path(VALUE self, VALUE output, VALUE input) {
  Check_Type(output, T_STRING);
  Check_Type(input, T_STRING);
  long parsed = nyara_parse_path(output, RSTRING_PTR(input), RSTRING_LEN(input));
  return LONG2NUM(parsed);
}

// Raise RuntimeError. When s is given, the message quotes the first len bytes
// of s and the offending segment index; otherwise only msg is reported.
static void _error(const char* msg, const char* s, long len, long segment_i) {
  if (s) {
    rb_raise(rb_eRuntimeError,
      "error parsing \"%.*s\": segments[%ld] is %s",
      (int)len, s, segment_i, msg);
  } else {
    rb_raise(rb_eRuntimeError, "%s", msg);
  }
}

// Create the container for the next nesting level: a param hash when the
// next key segment is a hash key, otherwise an array.
static VALUE _new_child(long hash) {
  return hash ? rb_class_new_instance(0, NULL, nyara_param_hash_class) : rb_ary_new();
}

// Tell whether a key segment addresses a hash entry (non-zero) or means
// "push onto an array" (zero). nil and the empty string are array keys;
// anything else is a hash key.
static long _is_hash_key_seg(VALUE seg) {
  if (NIL_P(seg)) {
    return 0;
  }
  if (TYPE(seg) == T_STRING) {
    return RSTRING_LEN(seg) != 0;
  }
  return 1;
}

// a, b, c = keys; h[a][b][c] = value
// the last 2 args are for error report
static void _aset_keys(VALUE output, volatile VALUE keys, VALUE value, const char* kv_s, long kv_len) {
  long len = RARRAY_LEN(keys);
  if (!len) {
    rb_bug("bug: aset 0 length key");
  }

  // first key seg always indexes the top-level hash
  long is_hash_key = 1;

  // middle key segs: walk down, creating containers that do not exist yet
  for (long i = 0; i < len - 1; i++) {
    VALUE seg = rb_ary_entry(keys, i);
    long next_is_hash_key = _is_hash_key_seg(rb_ary_entry(keys, i + 1));

    if (is_hash_key && nyara_rb_hash_has_key(output, seg)) {
      // reuse the existing container, but it must match what the next
      // segment needs
      output = rb_hash_aref(output, seg);
      if (next_is_hash_key) {
        if (TYPE(output) != T_HASH) {
          // note: StringValueCStr requires VALUE* as param, and can raise another error if there's nul in the string
          _error("not array index (expect to be empty)", kv_s, kv_len, i);
        }
      } else {
        if (TYPE(output) != T_ARRAY) {
          _error("not hash key (expect to be non-empty)", kv_s, kv_len, i);
        }
      }
    } else {
      volatile VALUE child = _new_child(next_is_hash_key);
      if (is_hash_key) {
        rb_hash_aset(output, seg, child);
      } else {
        rb_ary_push(output, child);
      }
      output = child;
    }

    is_hash_key = next_is_hash_key;
  }

  // terminate key seg: add value
  if (is_hash_key) {
    rb_hash_aset(output, rb_ary_entry(keys, len - 1), value);
  } else {
    rb_ary_push(output, value);
  }
}

// Create an empty string tagged with u8_encoding.
static inline VALUE _new_blank_str(void) {
  return rb_enc_str_new("", 0, u8_encoding);
}

// Parse one "key=value" segment and store it into output.
// The value is everything after the first '='; a segment without '=' gets an
// empty value. In nested mode a key of the form "a[b][]" is split into the
// key segments a, b, "" and stored through _aset_keys; otherwise the decoded
// key is used as is.
// Raises ArgumentError when the value contains '&' before its end, and
// RuntimeError when a nested key has characters after ']' that do not start
// a new '[' group.
static void _url_encoded_seg(VALUE output, const char* kv_s, long kv_len, int nested_mode) {
  // (note if we _decode_url_seg with '&' first, then there may be multiple '='s in one kv)

  const char* s = kv_s;
  long len = kv_len;
  if (!len) {
    return;
  }

  volatile VALUE value = _new_blank_str();

  // rule out the value part
  // strnstr is not available on linux :(
  const char* eq = _strnchr(s, len, '=');
  if (eq) {
    const char* value_s = eq + 1;
    long value_len = s + len - value_s;
    long parsed = _decode_url_seg(value, value_s, value_len, '&');
    if (parsed != value_len) {
      rb_raise(rb_eArgError, "separator & in param segment");
    }
    len = eq - s; // what remains is the key part

    // starts with '=': the key is empty
    if (len == 0) {
      rb_hash_aset(output, _new_blank_str(), value);
      return;
    }
  }

  volatile VALUE key = _new_blank_str();

  if (!nested_mode) {
    _decode_url_seg(key, s, len, '=');
    rb_hash_aset(output, key, value);
    return;
  }

  // todo fault-tolerant?
  long parsed = _decode_url_seg(key, s, len, '[');
  if (parsed == len) {
    // no nested part after the first key segment
    rb_hash_aset(output, key, value);
    return;
  }
  s += parsed;
  len -= parsed;

  volatile VALUE keys = rb_ary_new3(1, key);
  while (len) {
    key = _new_blank_str();
    parsed = _decode_url_seg(key, s, len, ']');
    rb_ary_push(keys, key);
    s += parsed;
    len -= parsed;
    if (len) {
      if (s[0] != '[') {
        rb_raise(rb_eRuntimeError, "malformed params: remaining chars in key but not starting with '['");
      }
      s++;
      len--;
    }
  }
  _aset_keys(output, keys, value, kv_s, kv_len);
}

// "a[%20][][b]=c" ===> key segments "a", "\x20", "", "b" with value "c"
// (an empty segment means "push onto an array")
static VALUE ext_parse_url_encoded_seg(VALUE self, VALUE output, VALUE kv, VALUE v_nested_mode) {
  Check_Type(output, T_HASH);
  Check_Type(kv, T_STRING);
  _url_encoded_seg(output, RSTRING_PTR(kv), RSTRING_LEN(kv), RTEST(v_nested_mode));
  return output;
}

// Parse a whole parameter string into output, in nested mode.
// Segments are split with /[&;] */ and empty segments are skipped.
void nyara_parse_param(VALUE output, const char* s, long len) {
  long seg_start = 0;
  long i = 0;
  for (; i < len; i++) {
    if (s[i] != '&' && s[i] != ';') {
      continue;
    }
    if (i > seg_start) {
      _url_encoded_seg(output, s + seg_start, i - seg_start, 1);
    }
    // swallow the spaces that follow the separator
    while (i + 1 < len && s[i + 1] == ' ') {
      i++;
    }
    seg_start = i + 1;
  }
  if (i > seg_start) {
    _url_encoded_seg(output, s + seg_start, i - seg_start, 1);
  }
}

// Ruby binding for nyara_parse_param; returns output.
static VALUE ext_parse_param(VALUE self, VALUE output, VALUE s) {
  Check_Type(output, T_HASH);
  Check_Type(s, T_STRING);
  nyara_parse_param(output, RSTRING_PTR(s), RSTRING_LEN(s));
  return output;
}

// Build a string from s[0...len) with trailing spaces removed.
static VALUE _cookie_seg_str_new(const char* s, long len) {
  // trim tailing space
  while (len > 0 && s[len - 1] == ' ') {
    len--;
  }
  return rb_enc_str_new(s, len, u8_encoding);
}

// Parse a cookie header value into output (flat keys, no nesting).
// Segments are split with / *[,;] */. They are applied last-to-first, so when
// a name occurs more than once the earliest occurrence is the one written last.
VALUE ext_parse_cookie(VALUE self, VALUE output, VALUE str) {
  Check_Type(output, T_HASH);
  Check_Type(str, T_STRING);

  volatile VALUE arr = rb_ary_new();
  const char* s = RSTRING_PTR(str);
  long len = RSTRING_LEN(str);

  // split with / *[,;] */
  long seg_start = 0;
  long i = 0;
  for (; i < len; i++) {
    if (s[i] != ',' && s[i] != ';') {
      continue;
    }
    if (i > seg_start) {
      rb_ary_push(arr, _cookie_seg_str_new(s + seg_start, i - seg_start));
    }
    // swallow the spaces that follow the separator
    while (i + 1 < len && s[i + 1] == ' ') {
      i++;
    }
    seg_start = i + 1;
  }
  if (i > seg_start) {
    rb_ary_push(arr, _cookie_seg_str_new(s + seg_start, i - seg_start));
  }

  for (long j = RARRAY_LEN(arr) - 1; j >= 0; j--) {
    VALUE seg = rb_ary_entry(arr, j);
    _url_encoded_seg(output, RSTRING_PTR(seg), RSTRING_LEN(seg), 0);
  }
  return output;
}

// True unless c is an ASCII letter, an ASCII digit, '_', '.' or '-'.
// The byte is classified as unsigned char and only within the ASCII range,
// so bytes >= 0x80 are always escaped regardless of the process locale.
static bool _should_escape(char c) {
  unsigned char byte = (unsigned char)c;
  if (byte < 0x80 && isalnum(byte)) {
    return false;
  }
  return c != '_' && c != '.' && c != '-';
}

// prereq: n always < 16
static char _hex_char(unsigned char n) {
  return (char)(n < 10 ? '0' + n : 'A' + (n - 10));
}

// Append c to s in escaped form.
// Path mode keeps '/' and '+' as they are; component mode turns a space into
// '+'. Every other byte for which _should_escape is true becomes "%XX" with
// upper-case hex digits.
static void _concat_char(VALUE s, char c, bool ispath) {
  if (!ispath && c == ' ') {
    rb_str_cat(s, "+", 1);
    return;
  }

  bool keep = !_should_escape(c) || (ispath && (c == '+' || c == '/'));
  if (keep) {
    rb_str_cat(s, &c, 1);
    return;
  }

  unsigned char byte = (unsigned char)c;
  char escaped[3] = {'%', _hex_char(byte / 16), _hex_char(byte % 16)};
  rb_str_cat(s, escaped, 3);
}

// escape for uri path ('/', '+' are not changed) or component ('/', '+' are changed)
static VALUE ext_escape(VALUE _, VALUE s, VALUE v_ispath) {
  Check_Type(s, T_STRING);
  long len = RSTRING_LEN(s);
  const char* ptr = RSTRING_PTR(s);
  volatile VALUE res = rb_str_buf_new(len);
  bool ispath = RTEST(v_ispath);
  for (long i = 0; i < len; i++) {
    _concat_char(res, ptr[i], ispath);
  }
  rb_enc_associate(res, u8_encoding);
  return res;
}

// nil in keys will be interpreted as array key
// Raises TypeError when output is not a hash or keys is not an array, and
// ArgumentError when keys is empty.
static VALUE ext_param_hash_nested_aset(VALUE _, VALUE output, VALUE keys, VALUE value) {
  Check_Type(output, T_HASH);
  Check_Type(keys, T_ARRAY);
  if (RARRAY_LEN(keys) == 0) {
    // reject here: _aset_keys treats an empty key list as an interpreter bug
    rb_raise(rb_eArgError, "keys should not be empty");
  }
  _aset_keys(output, keys, value, NULL, 0);
  return Qnil;
}

// Register the singleton methods of this file on ext.
void Init_url_encoded(VALUE ext) {
  rb_define_singleton_method(ext, "parse_param", ext_parse_param, 2);
  rb_define_singleton_method(ext, "parse_cookie", ext_parse_cookie, 2);
  rb_define_singleton_method(ext, "escape", ext_escape, 2);
  rb_define_singleton_method(ext, "param_hash_nested_aset", ext_param_hash_nested_aset, 3);
  // for test
  rb_define_singleton_method(ext, "parse_url_encoded_seg", ext_parse_url_encoded_seg, 3);
  rb_define_singleton_method(ext, "parse_path", ext_parse_path, 2);
}