#include #include #include /** * true if the given codepoint is a lowercase ascii character. */ static int character_is_lower(unsigned int character) { return character >= 'a' && character <= 'z'; } /** * true if the given codepoint is a uppercase ascii character. */ static int character_is_upper(unsigned int character) { return character >= 'A' && character <= 'Z'; } /** * true if the given codepoint is an ascii digit. */ static int character_is_digit(unsigned int character) { return character >= '0' && character <= '9'; } /** * Macros for extracting the character out of the `codepoint_t` struct. */ #define codepoint_is_lower(codepoint) character_is_lower(codepoint->character) #define codepoint_is_upper(codepoint) character_is_upper(codepoint->character) #define codepoint_is_digit(codepoint) character_is_digit(codepoint->character) /** * Macros for shortcuts to call into the `builder_result_push_char` function. */ #define builder_result_push(builder, codepoint) \ builder_result_push_char(builder, codepoint->character, codepoint->size, \ codepoint->encoding) #define builder_result_push_literal(builder, character) \ builder_result_push_char(builder, character, 1, NULL) /** * A struct for keeping track of a codepoint from the original string. Maintains * the original encoding, the actual character codepoint, and the size of the * codepoint. */ typedef struct codepoint { // The encoding of the character rb_encoding *encoding; // Representation of a character, regardless of encoding unsigned int character; // The size that this character actually represents int size; } codepoint_t; /** * A struct for tracking the built string as it gets converted. Maintains an * internal DFA for transitioning through various inputs to match certain * patterns that need to be separated with underscores. */ typedef struct builder { // The state of the DFA in which is the builder enum state { STATE_DEFAULT, STATE_COLON, STATE_UPPER_END, STATE_UPPER_START } state; // The current segment of text that we're analyzing char *segment; long segment_size; // The resultant text from the underscore operation char *result; long result_size; // Whether or not the last pushed result character should cause the following // one to be spaced by an underscore int pushNext; } builder_t; /** * Allocate and initialize a `codepoint_t` struct. */ static codepoint_t* codepoint_build(rb_encoding *encoding) { codepoint_t *codepoint; codepoint = (codepoint_t *) malloc(sizeof(codepoint_t)); if (codepoint == NULL) { return NULL; } codepoint->encoding = encoding; return codepoint; } /** * Free a previously allocated `codepoint_t` struct. */ static void codepoint_free(codepoint_t *codepoint) { free(codepoint); } /** * Allocate and initialize a `builder_t` struct. */ static builder_t* builder_build(long str_len) { builder_t *builder; builder = (builder_t *) malloc(sizeof(builder_t)); if (builder == NULL) { return NULL; } builder->state = STATE_DEFAULT; builder->segment = (char *) malloc(str_len * sizeof(unsigned int) * 2); if (builder->segment == NULL) { free(builder); return NULL; } builder->result = (char *) malloc(str_len * sizeof(unsigned int) * 2); if (builder->result == NULL) { free(builder->segment); free(builder); return NULL; } builder->segment_size = 0; builder->result_size = 0; builder->pushNext = 0; return builder; } /** * Push a character onto the resultant string using the given codepoint and * encoding. */ static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) { if (character_is_upper(character)) { if (builder->pushNext == 1) { builder->pushNext = 0; builder_result_push_literal(builder, '_'); } builder->result[builder->result_size++] = (char) character - 'A' + 'a'; return; } builder->pushNext = (character_is_lower(character) || character_is_digit(character)); if (encoding == NULL) { builder->result[builder->result_size++] = (char) character; } else { rb_enc_mbcput(character, &builder->result[builder->result_size], encoding); builder->result_size += size; } } /** * Push the given codepoint onto the builder. */ static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) { builder->segment[builder->segment_size++] = (char) codepoint->character; } /** * Copy the given number of characters out of the segment cache onto the result * string. */ static void builder_segment_copy(builder_t *builder, long size) { long idx; for (idx = 0; idx < size; idx++) { builder_result_push_literal(builder, builder->segment[idx]); } } /** * Restart the `builder_t` back at the default state (because we've hit a * character for which we have no allowed transitions). */ static void builder_restart(builder_t *builder) { builder->state = STATE_DEFAULT; builder->segment_size = 0; } static void builder_next(builder_t *builder, codepoint_t *codepoint); /** * Pull the remaining content out of the cached segment in case we don't end * parsing while not in the default state. */ static void builder_flush(builder_t *builder) { switch (builder->state) { case STATE_DEFAULT: return; case STATE_COLON: builder_result_push_literal(builder, ':'); return; case STATE_UPPER_END: case STATE_UPPER_START: builder_segment_copy(builder, builder->segment_size); return; } } /** * Perform transitions from the STATE_DEFAULT state. */ static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint->character == '-') { builder_result_push_literal(builder, '_'); return; } if (codepoint->character == ':') { builder->state = STATE_COLON; return; } if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) { builder->segment[0] = (char) codepoint->character; builder->segment_size = 1; builder->state = STATE_UPPER_START; return; } builder_result_push(builder, codepoint); } /** * Perform transitions from the STATE_COLON state. */ static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint->character == ':') { builder_result_push_literal(builder, '/'); builder_restart(builder); return; } builder_result_push_literal(builder, ':'); builder_restart(builder); builder_next(builder, codepoint); } /** * Perform transitions from the STATE_UPPER_START state. */ static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint_is_digit(codepoint)) { builder_segment_push(builder, codepoint); return; } if (codepoint_is_upper(codepoint)) { builder_segment_push(builder, codepoint); builder->state = STATE_UPPER_END; return; } builder_segment_copy(builder, builder->segment_size); builder_restart(builder); builder_next(builder, codepoint); } /** * Perform transitions from the STATE_UPPER_END state. */ static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint_is_digit(codepoint)) { builder_segment_push(builder, codepoint); builder->state = STATE_UPPER_START; return; } if (codepoint_is_upper(codepoint)) { builder_segment_push(builder, codepoint); return; } if (codepoint_is_lower(codepoint)) { builder_segment_copy(builder, builder->segment_size - 1); builder_result_push_literal(builder, '_'); builder_result_push_literal(builder, builder->segment[builder->segment_size - 1]); builder_restart(builder); builder_next(builder, codepoint); return; } builder_segment_copy(builder, builder->segment_size); builder_restart(builder); builder_next(builder, codepoint); } /** * Accept the next codepoint, which will move the `builder_t` struct into the * next state. */ static void builder_next(builder_t *builder, codepoint_t *codepoint) { switch (builder->state) { case STATE_DEFAULT: return builder_default_transition(builder, codepoint); case STATE_COLON: return builder_colon_transition(builder, codepoint); case STATE_UPPER_START: return builder_upper_start_transition(builder, codepoint); case STATE_UPPER_END: return builder_upper_end_transition(builder, codepoint); } } /** * Frees a previously allocated `builder_t` struct. */ static void builder_free(builder_t *builder) { free(builder->segment); free(builder->result); free(builder); } /** * Makes an underscored, lowercase form from the expression in the string. * * Changes '::' to '/' to convert namespaces to paths. * * underscore('ActiveModel') # => "active_model" * underscore('ActiveModel::Errors') # => "active_model/errors" * * As a rule of thumb you can think of +underscore+ as the inverse of * #camelize, though there are cases where that does not hold: * * camelize(underscore('SSLError')) # => "SslError" */ static VALUE str_underscore(VALUE rb_string) { VALUE resultant; rb_encoding *encoding; char *string; char *end; builder_t *builder; codepoint_t *codepoint; encoding = rb_enc_from_index(ENCODING_GET(rb_string)); string = RSTRING_PTR(rb_string); end = RSTRING_END(rb_string); builder = builder_build(RSTRING_LEN(rb_string) * 2); codepoint = codepoint_build(encoding); while (string < end) { codepoint->character = rb_enc_codepoint_len(string, end, &codepoint->size, encoding); builder_next(builder, codepoint); string += codepoint->size; } builder_flush(builder); resultant = rb_enc_str_new(builder->result, builder->result_size, encoding); builder_free(builder); codepoint_free(codepoint); return resultant; } /** * A singleton method calls with a string that delegates to `str_underscore` to * form an underscored, lowercase form from the expression in the string. */ static VALUE rb_str_underscore(VALUE self, VALUE rb_string) { return str_underscore(rb_string); } /** * Hook into Ruby and define the `FastUnderscore::underscore`. */ void Init_fast_underscore(void) { VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore"); rb_define_singleton_method(rb_cFastUnderscore, "underscore", rb_str_underscore, 1); rb_define_method(rb_cString, "underscore", str_underscore, 0); }