#include #include // true if the given codepoint is a lowercase ascii character. static int character_is_lower(unsigned int character) { return character >= 'a' && character <= 'z'; } // true if the given codepoint is a uppercase ascii character. static int character_is_upper(unsigned int character) { return character >= 'A' && character <= 'Z'; } // true if the given codepoint is an ascii digit. static int character_is_digit(unsigned int character) { return character >= '0' && character <= '9'; } // Macros for extracting the character out of the `codepoint_t` struct. #define codepoint_is_lower(codepoint) character_is_lower(codepoint->character) #define codepoint_is_upper(codepoint) character_is_upper(codepoint->character) #define codepoint_is_digit(codepoint) character_is_digit(codepoint->character) /** * Macros for shortcuts to call into the `builder_result_push_char` function. */ #define builder_result_push(builder, codepoint) \ builder_result_push_char(builder, codepoint->character, codepoint->size, \ codepoint->encoding) #define builder_result_push_literal(builder, character) \ builder_result_push_char(builder, character, 1, NULL) /** * A struct for keeping track of a codepoint from the original string. Maintains * the original encoding, the actual character codepoint, and the size of the * codepoint. */ typedef struct codepoint { // The encoding of the character rb_encoding *encoding; // Representation of a character, regardless of encoding unsigned int character; // The size that this character actually represents int size; } codepoint_t; /** * A struct for tracking the built string as it gets converted. Maintains an * internal DFA for transitioning through various inputs to match certain * patterns that need to be separated with underscores. * * The internal DFA looks like: * * ┌ - ┐ ┌ * ┐ * │ v │ v * ┌─────────────┐ ┌─────────────┐ * │ │──── : ───>│ │ * ──>│ DEFAULT │<─── : ────│ COLON │ * │ │<─── * ────│ │ * └─────────────┘ └─────────────┘ * │ ^ ^ ^ * │ │ │ └───── a-z ─────────────┐ * 0-9A-Z * │ │ * │ │ └───────── * ────────┐ │ * v │ │ │ * ┌─────────────┐ ┌─────────────┐ * │ │─── A-Z ──>│ │ * │ UPPER_START │ │ UPPER_END │ * │ │<── 0-9 ───│ │ * └─────────────┘ └─────────────┘ * │ ^ ^ │ * └ 0-9 ┘ └ A-Z ┘ * * Transitions from DEFAULT: * - On "-", push an "_" and stay on DEFAULT * - On ":", go to COLON * - On a digit or upper, start a buffer with the char and go to UPPER_START * - On anything else, push the char and stay on DEFAULT * * Transitions from COLON: * - On ":", push a "/" and go to DEFAULT * - On anything else, push a ":" and the char and go to DEFAULT * * Transitions from UPPER_START: * - On a digit, push the digit and stay on UPPER_START * - On an upper, push the upper and go to UPPER_END * - On anything else, push the buffer, go to DEFAULT, then handle the char * * Transitions from UPPER_END: * - On a digit, push the digit onto the buffer and go to UPPER_START * - On an upper, push the upper onto the buffer and stay on UPPER_END * - On a lower, push the buffer up to the last char, push an "_", then push * the last char of the buffer, go to DEFAULT, then handle the char * - On anything else, push the buffer, go to DEFAULT, then handle the char * * These transitions allow us to accomplish the equivalent of the following code * with one pass through the string: * * def underscore(word) * word.gsub!('::', '/') * word.gsub!(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2') * word.gsub!(/([a-z\d])([A-Z])/, '\1_\2') * word.tr!('-', '_') * word.downcase! * end */ typedef struct builder { // The state of the DFA that the builder is in enum state { STATE_DEFAULT, STATE_COLON, STATE_UPPER_END, STATE_UPPER_START } state; // The current segment of text that we're analyzing char *segment; long segment_size; // The resultant text from the underscore operation char *result; long result_size; // Whether or not the last pushed result character should cause the following // one to be spaced by an underscore int push_next; } builder_t; // Push a character onto the resultant string using the given codepoint and // encoding. static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) { if (character_is_upper(character)) { if (builder->push_next == 1) { builder->push_next = 0; builder_result_push_literal(builder, '_'); } builder->result[builder->result_size++] = (char) character - 'A' + 'a'; return; } builder->push_next = (character_is_lower(character) || character_is_digit(character)); if (encoding == NULL) { builder->result[builder->result_size++] = (char) character; } else { rb_enc_mbcput(character, &builder->result[builder->result_size], encoding); builder->result_size += size; } } // Push the given codepoint onto the builder. static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) { builder->segment[builder->segment_size++] = (char) codepoint->character; } // Copy the given number of characters out of the segment cache onto the result // string. static void builder_segment_copy(builder_t *builder, long size) { long idx; for (idx = 0; idx < size; idx++) { builder_result_push_literal(builder, builder->segment[idx]); } } // Restart the `builder_t` back at the default state (because we've hit a // character for which we have no allowed transitions). static void builder_restart(builder_t *builder) { builder->state = STATE_DEFAULT; builder->segment_size = 0; } static void builder_next(builder_t *builder, codepoint_t *codepoint); // Pull the remaining content out of the cached segment in case we don't end // parsing while not in the default state. static void builder_flush(builder_t *builder) { switch (builder->state) { case STATE_DEFAULT: return; case STATE_COLON: builder_result_push_literal(builder, ':'); return; case STATE_UPPER_END: case STATE_UPPER_START: builder_segment_copy(builder, builder->segment_size); return; } } // Perform transitions from the STATE_DEFAULT state. static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint->character == '-') { builder_result_push_literal(builder, '_'); return; } if (codepoint->character == ':') { builder->state = STATE_COLON; return; } if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) { builder->segment[0] = (char) codepoint->character; builder->segment_size = 1; builder->state = STATE_UPPER_START; return; } builder_result_push(builder, codepoint); } // Perform transitions from the STATE_COLON state. static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint->character == ':') { builder_result_push_literal(builder, '/'); builder_restart(builder); return; } builder_result_push_literal(builder, ':'); builder_restart(builder); builder_next(builder, codepoint); } // Perform transitions from the STATE_UPPER_START state. static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint_is_digit(codepoint)) { builder_segment_push(builder, codepoint); return; } if (codepoint_is_upper(codepoint)) { builder_segment_push(builder, codepoint); builder->state = STATE_UPPER_END; return; } builder_segment_copy(builder, builder->segment_size); builder_restart(builder); builder_next(builder, codepoint); } // Perform transitions from the STATE_UPPER_END state. static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) { if (codepoint_is_digit(codepoint)) { builder_segment_push(builder, codepoint); builder->state = STATE_UPPER_START; return; } if (codepoint_is_upper(codepoint)) { builder_segment_push(builder, codepoint); return; } if (codepoint_is_lower(codepoint)) { builder_segment_copy(builder, builder->segment_size - 1); builder_result_push_literal(builder, '_'); builder_result_push_literal(builder, builder->segment[builder->segment_size - 1]); builder_restart(builder); builder_next(builder, codepoint); return; } builder_segment_copy(builder, builder->segment_size); builder_restart(builder); builder_next(builder, codepoint); } // Accept the next codepoint, which will move the `builder_t` struct into the next state. static void builder_next(builder_t *builder, codepoint_t *codepoint) { switch (builder->state) { case STATE_DEFAULT: return builder_default_transition(builder, codepoint); case STATE_COLON: return builder_colon_transition(builder, codepoint); case STATE_UPPER_START: return builder_upper_start_transition(builder, codepoint); case STATE_UPPER_END: return builder_upper_end_transition(builder, codepoint); } } /** * Makes an underscored, lowercase form from the expression in the string. * * Changes '::' to '/' to convert namespaces to paths. * * underscore('ActiveModel') # => "active_model" * underscore('ActiveModel::Errors') # => "active_model/errors" * * As a rule of thumb you can think of +underscore+ as the inverse of * #camelize, though there are cases where that does not hold: * * camelize(underscore('SSLError')) # => "SslError" */ static VALUE underscore(VALUE string) { char segment[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2]; char result[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2]; builder_t builder = { .state = STATE_DEFAULT, .segment = segment, .result = result, .segment_size = 0, .result_size = 0, .push_next = 0 }; codepoint_t codepoint = { .encoding = rb_enc_from_index(ENCODING_GET(string)), .character = 0, .size = 0 }; char *pointer = RSTRING_PTR(string); char *end = RSTRING_END(string); while (pointer < end) { codepoint.character = rb_enc_codepoint_len(pointer, end, &codepoint.size, codepoint.encoding); builder_next(&builder, &codepoint); pointer += codepoint.size; } builder_flush(&builder); return rb_enc_str_new(builder.result, builder.result_size, codepoint.encoding); } // FastUnderscore::underscore static VALUE fast_underscore(VALUE self, VALUE string) { return underscore(string); } // Hook into Ruby and define FastUnderscore::underscore and String#underscore void Init_fast_underscore(void) { VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore"); rb_define_singleton_method(rb_cFastUnderscore, "underscore", fast_underscore, 1); rb_define_method(rb_cString, "underscore", underscore, 0); }