#include #ifdef HAVE_RUBY_ENCODING_H #include static rb_encoding *utf8Encoding; #endif #define IS_HEX(c) (c >= 48 || c <= 57) && (c >= 65 || c <= 70) && (c >= 97 || c <= 102) #define NOT_HEX(c) (c < 48 || c > 57) && (c < 65 || c > 90) && (c < 97 || c > 122) #define UNHEX(c) (c >= '0' && c <= '9' ? c - '0' : c >= 'A' && c <= 'F' ? c - 'A' + 10 : c - 'a' + 10) static size_t escape_html(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0; unsigned char curChar; total = in_len; while (in_len) { curChar = *in++; switch (curChar) { case '<': *out++ = '&'; *out++ = 'l'; *out++ = 't'; *out++ = ';'; total += 3; break; case '>': *out++ = '&'; *out++ = 'g'; *out++ = 't'; *out++ = ';'; total += 3; break; case '&': *out++ = '&'; *out++ = 'a'; *out++ = 'm'; *out++ = 'p'; *out++ = ';'; total += 4; break; case '\'': *out++ = '&'; *out++ = '#'; *out++ = '3'; *out++ = '9'; *out++ = ';'; total += 4; break; case '\"': *out++ = '&'; *out++ = 'q'; *out++ = 'u'; *out++ = 'o'; *out++ = 't'; *out++ = ';'; total += 5; break; case '/': *out++ = '&'; *out++ = '#'; *out++ = '4'; *out++ = '7'; *out++ = ';'; total += 4; break; default: *out++ = curChar; break; } in_len--; } return total; } static size_t unescape_html(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0, len = in_len; unsigned char curChar, *start; start = (unsigned char *)&in[0]; total = in_len; while (len) { curChar = *in++; if (curChar == '&') { if ((in-start)+2 <= in_len && *in == 'l' && *(in+1) == 't' && *(in+2) == ';') { *out++ = '<'; total-=3; in+=3; len-=3; } else if ((in-start)+2 <= in_len && *in == 'g' && *(in+1) == 't' && *(in+2) == ';') { *out++ = '>'; total-=3; in+=3; len-=3; } else if ((in-start)+3 <= in_len && *in == 'a' && *(in+1) == 'm' && *(in+2) == 'p' && *(in+3) == ';') { *out++ = '&'; total-=4; in+=4; len-=4; } else if ((in-start)+3 <= in_len && *in == '#' && *(in+1) == '3' && *(in+2) == '9' && *(in+3) == ';') { *out++ = '\''; total-=4; in+=4; len-=4; } else if ((in-start)+3 <= in_len && *in == '#' && *(in+1) == '4' && *(in+2) == '7' && *(in+3) == ';') { *out++ = '/'; total-=4; in+=4; len-=4; } else if ((in-start)+4 <= in_len && *in == 'q' && *(in+1) == 'u' && *(in+2) == 'o' && *(in+3) == 't' && *(in+4) == ';') { *out++ = '\"'; total-=5; in+=5; len-=5; } } else { *out++ = curChar; } len--; } return total; } static size_t escape_javascript(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0; unsigned char curChar; total = in_len; while (in_len) { curChar = *in++; switch (curChar) { case '\\': *out++ = '\\'; *out++ = '\\'; total++; break; case '<': if (*in == '/') { *out++ = '<'; *out++ = '\\'; *out++ = '/'; in++; in_len--; total++; } break; case '\r': if (*in == '\n') { *out++ = '\\'; *out++ = 'n'; in++; in_len--; } else { *out++ = '\\'; *out++ = 'n'; total++; } break; case '\n': *out++ = '\\'; *out++ = 'n'; total++; break; case '\'': *out++ = '\\'; *out++ = '\''; total++; break; case '\"': *out++ = '\\'; *out++ = '\"'; total++; break; default: *out++ = curChar; break; } in_len--; } return total; } static size_t unescape_javascript(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0; unsigned char curChar; total = in_len; while (in_len) { curChar = *in++; if (curChar == '\\') { if (*in == 'n') { *out++ = '\n'; total--; } else if (*in == '\\') { *out++ = '\\'; total--; } else if (*in == '\'') { *out++ = '\''; total--; } else if (*in == '\"') { *out++ = '\"'; total--; } else if (*in == '/') { *out++ = '/'; total--; } else { *out++ = curChar; } in++; in_len--; } else { *out++ = curChar; } in_len--; } return total; } static size_t escape_url(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0; unsigned char curChar, hex[2]; const unsigned char hexChars[16] = "0123456789ABCDEF"; total = in_len; while (in_len) { curChar = *in++; if (curChar == ' ') { *out++ = '+'; } else if ((curChar != '_' && curChar != '.' && curChar != '-') && NOT_HEX(curChar)) { hex[1] = hexChars[curChar & 0x0f]; hex[0] = hexChars[(curChar >> 4) & 0x0f]; *out++ = '%'; *out++ = hex[0]; *out++ = hex[1]; total += 2; } else { *out++ = curChar; } in_len--; } return total; } static size_t unescape_url(unsigned char *out, const unsigned char *in, size_t in_len) { size_t total = 0, len = in_len; unsigned char curChar, *start; start = (unsigned char *)&in[0]; total = in_len; while (len) { curChar = *in++; if (curChar == '%') { if ((in-start)+2 <= in_len && IS_HEX(*in) && IS_HEX(*(in+1))) { *out++ = (UNHEX(*in) << 4) + UNHEX(*(in+1)); in+=2; total-=2; } } else if (curChar == '+') { *out++ = ' '; } else { *out++ = curChar; } len--; } return total; } static VALUE rb_escape_html(VALUE self, VALUE str) { Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*(len*5)); // perform our escape, returning the new string's length new_len = escape_html(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } static VALUE rb_unescape_html(VALUE self, VALUE str) { Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*len); // perform our escape, returning the new string's length new_len = unescape_html(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } static VALUE rb_escape_javascript(VALUE self, VALUE str) { if (str == Qnil) { return rb_str_new2(""); } Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*(len*2)); // perform our escape, returning the new string's length new_len = escape_javascript(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } static VALUE rb_unescape_javascript(VALUE self, VALUE str) { if (str == Qnil) { return rb_str_new2(""); } Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*len); // perform our escape, returning the new string's length new_len = unescape_javascript(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } static VALUE rb_escape_url(VALUE self, VALUE str) { Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*(len*3)); // perform our escape, returning the new string's length new_len = escape_url(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } static VALUE rb_unescape_url(VALUE self, VALUE str) { Check_Type(str, T_STRING); VALUE rb_output_buf; #ifdef HAVE_RUBY_ENCODING_H rb_encoding *default_internal_enc = rb_default_internal_encoding(); rb_encoding *original_encoding = rb_enc_get(str); #endif unsigned char *inBuf = (unsigned char*)RSTRING_PTR(str); size_t len = RSTRING_LEN(str), new_len = 0; // this is the max size the string could be // TODO: we should try to be more intelligent about this unsigned char *outBuf = (unsigned char *)malloc(sizeof(unsigned char *)*len); // perform our escape, returning the new string's length new_len = unescape_url(outBuf, inBuf, len); // create our new ruby string rb_output_buf = rb_str_new((char *)outBuf, new_len); // free the temporary C string free(outBuf); #ifdef HAVE_RUBY_ENCODING_H rb_enc_associate(rb_output_buf, original_encoding); if (default_internal_enc) { rb_output_buf = rb_str_export_to_enc(rb_output_buf, default_internal_enc); } else { rb_output_buf = rb_str_export_to_enc(rb_output_buf, utf8Encoding); } #endif return rb_output_buf; } /* Ruby Extension initializer */ void Init_escape_utils_ext() { VALUE mEscape = rb_define_module("EscapeUtils"); rb_define_method(mEscape, "escape_html", rb_escape_html, 1); rb_define_module_function(mEscape, "escape_html", rb_escape_html, 1); rb_define_method(mEscape, "unescape_html", rb_unescape_html, 1); rb_define_module_function(mEscape, "unescape_html", rb_unescape_html, 1); rb_define_method(mEscape, "escape_javascript", rb_escape_javascript, 1); rb_define_module_function(mEscape, "escape_javascript", rb_escape_javascript, 1); rb_define_method(mEscape, "unescape_javascript", rb_unescape_javascript, 1); rb_define_module_function(mEscape, "unescape_javascript", rb_unescape_javascript, 1); rb_define_method(mEscape, "escape_url", rb_escape_url, 1); rb_define_module_function(mEscape, "escape_url", rb_escape_url, 1); rb_define_method(mEscape, "unescape_url", rb_unescape_url, 1); rb_define_module_function(mEscape, "unescape_url", rb_unescape_url, 1); #ifdef HAVE_RUBY_ENCODING_H utf8Encoding = rb_utf8_encoding(); #endif }