#include #include #include /** * This function works by maintaining a state machine that tracks which * characters are allowed at any given state, and reacting appropriately. The * state machine looks like: * * ┌ N ┐ ┌ C ┐ * │ v │ v * ┌─────────────┐ ┌─────────────┐ * │ │ │ │ * ──>│ START │──── C ───>│ CHAR │ * │ │ │ │ * └─────────────┘ └─────────────┘ * │ ^ * ┌ N ┐ │ │ * │ v │ │ * ┌─────────────┐ │ | * │ │<─── N ────────┘ | * │ SEP │ | * │ │──── C ──────────────┘ * └─────────────┘ * * Where C represents a character that is allowed in the output and N represents * a character that is not allowed in the output. */ // This is the current state of the state machine being used to compile the // parameterized string. enum state { STATE_START, STATE_CHAR, STATE_SEP }; // This is used to tell if the given character is allowed to be part of the // output string as opposed to being replaced by the separator. static inline int ischar(int character) { return ( (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') || (character >= '0' && character <= '9') || character == '-' || character == '_' ); } // This is a function that preserves the same interface as tolower that can be // swapped in its place in case the preserve_case option was set to true. static int tosame(int character) { return character; } // Makes a parameterized, lowercase form from the expression in the string. static VALUE parameterize(VALUE string, VALUE kwargs) { static ID keywords[2]; CONST_ID(keywords[0], "separator"); CONST_ID(keywords[1], "preserve_case"); VALUE options[2] = { Qundef, Qundef }; rb_get_kwargs(kwargs, keywords, 0, 2, options); // Configure the options based on the inputted keyword arguments const char *separator = options[0] == Qundef ? "-" : StringValueCStr(options[0]); size_t separator_size = strlen(separator); int(*tocase)(int) = options[1] == Qtrue ? tosame : tolower; // Set the initial state and build a buffer for the resulting string enum state state = STATE_START; char result[RSTRING_LEN(string) * separator_size]; size_t size = 0; // Set up a couple of variables necessary for iterating through the string rb_encoding *encoding = rb_enc_from_index(ENCODING_GET(string)); int codepoint_size; int character; // Determine the bounds of the string for our loop char *pointer = RSTRING_PTR(string); char *end = RSTRING_END(string); while (pointer < end) { character = rb_enc_codepoint_len(pointer, end, &codepoint_size, encoding); pointer += codepoint_size; switch (state) { case STATE_START: if (ischar(character)) { result[size++] = tocase(character); state = STATE_CHAR; } break; case STATE_CHAR: if (ischar(character)) { result[size++] = tocase(character); } else { state = STATE_SEP; } break; case STATE_SEP: if (ischar(character)) { strncpy(result + size, separator, separator_size); size += separator_size; result[size++] = tocase(character); state = STATE_CHAR; } break; } } return rb_enc_str_new(result, size, encoding); } // FastParameterize::parameterize static VALUE fast_parameterize(int argc, VALUE* argv, VALUE self) { VALUE string = Qnil; VALUE kwargs = Qnil; rb_scan_args(argc, argv, "10:", &string, &kwargs); return RB_TYPE_P(string, T_STRING) ? parameterize(string, kwargs) : Qnil; } // String#parameterize static VALUE string_parameterize(int argc, VALUE* argv, VALUE self) { VALUE kwargs = Qnil; rb_scan_args(argc, argv, "00:", &kwargs); return parameterize(self, kwargs); } // Hook into Ruby and define the FastParameterize::parameterize and // String#parameterize. void Init_fast_parameterize(void) { VALUE rb_cFastParameterize = rb_define_module("FastParameterize"); rb_define_singleton_method(rb_cFastParameterize, "parameterize", fast_parameterize, -1); rb_define_method(rb_cString, "parameterize", string_parameterize, -1); }