ext/console/console.c in console-0.2.3 vs ext/console/console.c in console-0.3

- old
+ new

@@ -1,63 +1,123 @@ /* - * console.c -- unit tests for ruby Console library + * console.c -- ruby console library * Author: William Morgan (mailto: wmorgan-ruby-console@masanjin.net) * Copyright: Copyright 2010 William Morgan * License: same terms as Ruby itself */ -#define _XOPEN_SOURCE #include <wchar.h> #include <stdlib.h> #include <ruby.h> +#include <locale.h> + +#ifdef HAVE_RUBY_ENCODING_H #include <ruby/encoding.h> +#endif +static inline int calc_width(char* string, long strlen, long byte_offset, size_t* num_bytes, size_t* num_cols) { + wchar_t wc; + size_t width = -1; + + *num_bytes = mbrtowc(&wc, string + byte_offset, strlen - byte_offset, NULL); + + if(*num_bytes == (size_t)-2) { + rb_raise(rb_eArgError, "malformed string: incomplete multibyte character at position %ld", byte_offset); + return -1; + } + else if(*num_bytes == (size_t)-1) { + rb_raise(rb_eArgError, "malformed string: invalid multibyte character at position %ld", byte_offset); + return -1; + } + else if(*num_bytes == 0) { + rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", byte_offset); + return -1; + } + + *num_cols = wcwidth(wc); + /* sometimes this seems to happen! maybe it's not bad... + if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes); + */ + + return 0; +} + /* + * call-seq: init_locale! + * + * Sets the program's current locale from the appropriate environment variables. + * (see `man 3 setlocale` for details). + * + * Equivalent to: + * char* old_locale = setlocale(LC_ALL, NULL); + * return old_locale; + * + * in C. + * + * If you are using Ruby 1.8, you *must* call this at least once before calling + * the other methods in this package. Otherwise, using non-ASCII strings will + * be considered invalid, and #display_width and #display_slice will raise + * ArgumentErrors. + * + * Ruby 1.9 users do not need to call this, since Ruby 1.9 appears to set the + * locale in this manner already. Calling it won't matter, however. + * + * Returns a string representing the old locale. If you wish to change locales + * several times, you can use this value to return to the previous locale. + * Otherwise, just ignore it. + */ + +static VALUE init_locale(VALUE v_self) { + char* old_locale = setlocale(LC_ALL, NULL); + setlocale(LC_ALL, ""); // set ctype locale according to appropriate env vars + + return rb_str_new2(old_locale); +} + +/* * call-seq: display_width(string) - * + * * Returns the display width of <code>string</code>, that is, the number of - * columns that the string will take up when printed to screen. Note that this - * is different from the number of characters (some characters take up one - * column, some (e.g. Chinese characters) take up two columns), and the number - * of bytes (e.g. UTF-8 is a multibyte encoding) in a string. + * columns that the string will take up when printed to screen. This is + * different from both the number of characters and the number of bytes in a + * string. * - * For Ruby 1.8, the string is assumed to be in the current locale's encoding. - * If not, terrible things will happen. + * In Ruby 1.8, the input string is assumed to be in the current locale's + * encoding. If it isn't, an ArgumentError will be raised. Be sure to call + * init_locale! before calling this method! Otherwise every non-ASCII string + * will trigger an ArgumentError. * - * For Ruby 1.9, the string will be magically converted from whatever encoding - * it is in, into the current locale's encoding, for processing. This may fail. + * In Ruby 1.9, the string will be automatically converted from its encoding + * into the current locale's encoding for processing. + * + * Throws an ArgumentError when it encounters an invalid character. On Ruby + * 1.8, this includes any string not in the current locale's encoding. On Ruby + * 1.9, this should only occur if Ruby is unable to convert the string from its + * encoding into the current locale's encoding. */ static VALUE display_width(VALUE v_self, VALUE v_string) { Check_Type(v_string, T_STRING); - /* for ruby 1.8, we assume the string is in your locale's CTYPE encoding - * already. if not, terrible things will happen. - * - * for ruby 1.9, we explicitly convert it to the locale's CTYPE encoding, - * like this: - */ #ifdef HAVE_RUBY_ENCODING_H + // convert from whatever encoding it's in. + // TODO: do i have to use rb_protect to relay any exceptions? v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil); #endif + char* string = RSTRING_PTR(v_string); - mbstate_t mbs; memset(&mbs, 0, sizeof(mbs)); long display_width = 0; - long remaining_bytes = RSTRING_LEN(v_string); - while(remaining_bytes > 0) { - wchar_t wc; - size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs); - if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); + long strlen = RSTRING_LEN(v_string); + long offset = 0; - int width = wcwidth(wc); - /* sometimes this seems to happen! maybe it's not bad... - if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes); - */ + while(offset < strlen) { + size_t num_bytes, num_cols; + int err = calc_width(string, strlen, offset, &num_bytes, &num_cols); + if(err) break; - display_width += width; - remaining_bytes -= num_bytes; - string += num_bytes; // advance string pointer + display_width += num_cols; + offset += num_bytes; } return LONG2NUM(display_width); } @@ -65,33 +125,38 @@ /* * call-seq: * display_slice(string, start_offset, display_width=1, pad_string=" ") * - * Returns a slice of a string based on display width, rather than character or - * bytes. I.e, the <code>start_offset</code> and <code>display_width</code> - * offsets index the columns required to display the string, and not individual + * Returns a slice of a string, based on display width, rather than character + * or bytes. I.e, the <code>start_offset</code> and <code>display_width</code> + * offsets index the columns required to display the string, not individual * characters or bytes. * * This is useful if you want to display a part of a string on screen, as you - * can pull out a specific portion by its display size. + * can pull out a specific portion based on display size. * * Padding: slicing can truncate multi-column characters. If the slice * truncates a character, the string will be padded with * <code>pad_string</code>, on the left side, right side, or both, as * necessary. If <code>pad_string</code> is <code>nil</code> then no padding - * will be done. <code>pad_string</code> should be a single-column - * string for this to make sense. + * will be done. <code>pad_string</code> should be a single-column string for + * this to make sense. * - * For Ruby 1.8, the string is assumed to be in the current locale's encoding. - * If not, terrible things will happen. + * In Ruby 1.8, the input string is assumed to be in the current locale's + * encoding. If it isn't, an ArgumentError will be raised. Be sure to call + * init_locale! before calling this method! Otherwise every non-ASCII string + * will trigger an ArgumentError. * - * For Ruby 1.9, the string will be magically converted from whatever encoding - * it is in, into the current locale's encoding, for processing. This may fail. + * In Ruby 1.9, the string will be automatically converted from its encoding + * into the current locale's encoding. Regardless of the original encoding, the + * returned string will be in the current locale's encoding. * - * The returned string WILL be in the current locale encoding, regardless of the - * encoding of the original string. + * Throws an ArgumentError when it encounters an invalid character. On Ruby + * 1.8, this includes any string not in the current locale's encoding. On Ruby + * 1.9, this should only occur if Ruby is unable to convert the string from its + * encoding into the current locale's encoding. */ static VALUE display_slice(int argc, VALUE *argv, VALUE v_self) { VALUE v_string, v_display_start, v_display_width, v_pad_string; rb_scan_args(argc, argv, "22", &v_string, &v_display_start, &v_display_width, &v_pad_string); Check_Type(v_string, T_STRING); @@ -104,106 +169,101 @@ int display_width; if(argc < 3) display_width = 1; // default value just like String#slice (although it makes slightly less sense) else display_width = NUM2INT(v_display_width); if(display_width < 0) return Qnil; // you fail - char* pad_string; + const char* pad_string; if(argc < 4) pad_string = default_pad_string; // only fill in default if unspecified; nil is a valid value else if(v_pad_string == Qnil) pad_string = ""; else pad_string = RSTRING_PTR(v_pad_string); - /* see comments in display_width() */ #ifdef HAVE_RUBY_ENCODING_H + // TODO: do i have to use rb_protect to relay any exceptions? v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil); #endif char* string = RSTRING_PTR(v_string); + long slen = RSTRING_LEN(v_string); - mbstate_t mbs; memset(&mbs, 0, sizeof(mbs)); - long remaining_bytes = RSTRING_LEN(v_string); - // first, advance the string pointer so that we've seen display_start width characters long current_width = 0; - while((remaining_bytes > 0) && (current_width < display_start)) { - wchar_t wc; - size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs); - if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); + long offset = 0; + while((offset < slen) && (current_width < display_start)) { + size_t num_bytes, num_cols; + int err = calc_width(string, slen, offset, &num_bytes, &num_cols); - int width = wcwidth(wc); - /* sometimes this seems to happen! maybe it's not bad... - if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes); - */ - - current_width += width; - remaining_bytes -= num_bytes; - string += num_bytes; // advance string pointer + current_width += num_cols; + offset += num_bytes; } /* here's a weird behavior (to me!) of String#slice that we emulate: * if the start point is the string length itself, you get an empty * string back; if the start point is greater than that, you get nil. */ if((current_width < display_start)) return Qnil; /* determine left padding */ - char* pad_left = ""; + const char* pad_left = ""; if((current_width > display_start) && (display_width > 0)) pad_left = pad_string; // now, advance the string_end pointer so that we've seen an additional display_width width characters - char* string_end = string; + long end_offset = offset; current_width -= display_start; - while((remaining_bytes > 0) && (current_width < display_width)) { - wchar_t wc; - size_t num_bytes = mbrtowc(&wc, string_end, remaining_bytes, &mbs); - if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); + while((end_offset < slen) && (current_width < display_width)) { + size_t num_bytes, num_cols; + int err = calc_width(string, slen, end_offset, &num_bytes, &num_cols); - int width = wcwidth(wc); - /* sometimes this seems to happen! maybe it's not bad... - if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes); - */ + if((current_width + num_cols) > (size_t)display_width) break; // have to stop here - if((current_width + width) > display_width) break; // have to stop here - - current_width += width; - remaining_bytes -= num_bytes; - string_end += num_bytes; // advance string pointer + current_width += num_cols; + end_offset += num_bytes; } /* determine right padding */ - char* pad_right = ""; - if((current_width < display_width) && (remaining_bytes > 0)) pad_right = pad_string; + const char* pad_right = ""; + if((current_width < display_width) && (end_offset < slen)) pad_right = pad_string; // finally, construct a new string - int bytesize = string_end - string; + int bytesize = end_offset - offset; int leftsize = strlen(pad_left); int rightsize = strlen(pad_right); char* new_string = calloc(bytesize + leftsize + rightsize + 1, sizeof(char)); if(leftsize > 0) strcpy(new_string, pad_left); - if(bytesize > 0) memcpy(new_string + leftsize, string, bytesize * sizeof(char)); + if(bytesize > 0) memcpy(new_string + leftsize, string + offset, bytesize * sizeof(char)); if(rightsize > 0) strcpy(new_string + leftsize + bytesize, pad_right); - (new_string + bytesize + leftsize + rightsize)[0] = 0; + (new_string + bytesize + leftsize + rightsize)[0] = 0; +#ifdef HAVE_RUBY_ENCODING_H return rb_enc_str_new(new_string, bytesize + leftsize + rightsize, rb_enc_get(v_string)); +#else + return rb_str_new(new_string, bytesize + leftsize + rightsize); +#endif } /* * A helper class for console-based programs that need to deal with non-ASCII * code. If you are writing a curses/ncurses program, or otherwise care about - * the number of characters on the screen, this is crucial stuff. + * the display width of characters on the screen, this is crucial stuff. * * Provides: * - * Console.display_width: get the number of display columns used by a string. + * Console.init_locale!: set the program's locale from the appropriate + * environment variables. (Ruby 1.8 programs must call this before calling any + * of the other methods. Ruby 1.9 programs can call it or skip it without + * effect.) * - * Console.display_slice: get a substrig by display column offset and size. + * Console.display_width: calculates the display width of a string * + * Console.display_slice: returns a substring according to display offset + * and display width parameters. + * */ void Init_console() { VALUE cConsole; cConsole = rb_define_class("Console", rb_cObject); rb_define_module_function(cConsole, "display_width", display_width, 1); rb_define_module_function(cConsole, "display_slice", display_slice, -1); + rb_define_module_function(cConsole, "init_locale!", init_locale, 0); } -