ext/console/console.c in console-0.2.3 vs ext/console/console.c in console-0.3
- old
+ new
@@ -1,63 +1,123 @@
/*
- * console.c -- unit tests for ruby Console library
+ * console.c -- ruby console library
* Author: William Morgan (mailto: wmorgan-ruby-console@masanjin.net)
* Copyright: Copyright 2010 William Morgan
* License: same terms as Ruby itself
*/
-#define _XOPEN_SOURCE
#include <wchar.h>
#include <stdlib.h>
#include <ruby.h>
+#include <locale.h>
+
+#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
+#endif
+static inline int calc_width(char* string, long strlen, long byte_offset, size_t* num_bytes, size_t* num_cols) {
+ wchar_t wc;
+ size_t width = -1;
+
+ *num_bytes = mbrtowc(&wc, string + byte_offset, strlen - byte_offset, NULL);
+
+ if(*num_bytes == (size_t)-2) {
+ rb_raise(rb_eArgError, "malformed string: incomplete multibyte character at position %ld", byte_offset);
+ return -1;
+ }
+ else if(*num_bytes == (size_t)-1) {
+ rb_raise(rb_eArgError, "malformed string: invalid multibyte character at position %ld", byte_offset);
+ return -1;
+ }
+ else if(*num_bytes == 0) {
+ rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", byte_offset);
+ return -1;
+ }
+
+ *num_cols = wcwidth(wc);
+ /* sometimes this seems to happen! maybe it's not bad...
+ if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+ */
+
+ return 0;
+}
+
/*
+ * call-seq: init_locale!
+ *
+ * Sets the program's current locale from the appropriate environment variables.
+ * (see `man 3 setlocale` for details).
+ *
+ * Equivalent to:
+ * char* old_locale = setlocale(LC_ALL, NULL);
+ * setlocale(LC_ALL, "");
+ * return old_locale;
+ *
+ * in C.
+ *
+ * If you are using Ruby 1.8, you *must* call this at least once before calling
+ * the other methods in this package. Otherwise, non-ASCII strings will
+ * be considered invalid, and #display_width and #display_slice will raise
+ * ArgumentErrors.
+ *
+ * Ruby 1.9 users do not need to call this, since Ruby 1.9 appears to set the
+ * locale in this manner already. Calling it won't matter, however.
+ *
+ * Returns a string representing the old locale. If you wish to change locales
+ * several times, you can use this value to return to the previous locale.
+ * Otherwise, just ignore it.
+ */
+
+static VALUE init_locale(VALUE v_self) {
+ char* old_locale = setlocale(LC_ALL, NULL);
+ setlocale(LC_ALL, ""); // set ctype locale according to appropriate env vars
+
+ return rb_str_new2(old_locale);
+}
+
+/*
* call-seq: display_width(string)
- *
+ *
* Returns the display width of <code>string</code>, that is, the number of
- * columns that the string will take up when printed to screen. Note that this
- * is different from the number of characters (some characters take up one
- * column, some (e.g. Chinese characters) take up two columns), and the number
- * of bytes (e.g. UTF-8 is a multibyte encoding) in a string.
+ * columns that the string will take up when printed to screen. This is
+ * different from both the number of characters and the number of bytes in a
+ * string.
*
- * For Ruby 1.8, the string is assumed to be in the current locale's encoding.
- * If not, terrible things will happen.
+ * In Ruby 1.8, the input string is assumed to be in the current locale's
+ * encoding. If it isn't, an ArgumentError will be raised. Be sure to call
+ * init_locale! before calling this method! Otherwise every non-ASCII string
+ * will trigger an ArgumentError.
*
- * For Ruby 1.9, the string will be magically converted from whatever encoding
- * it is in, into the current locale's encoding, for processing. This may fail.
+ * In Ruby 1.9, the string will be automatically converted from its encoding
+ * into the current locale's encoding for processing.
+ *
+ * Throws an ArgumentError when it encounters an invalid character. On Ruby
+ * 1.8, this includes any string not in the current locale's encoding. On Ruby
+ * 1.9, this should only occur if Ruby is unable to convert the string from its
+ * encoding into the current locale's encoding.
*/
static VALUE display_width(VALUE v_self, VALUE v_string) {
Check_Type(v_string, T_STRING);
- /* for ruby 1.8, we assume the string is in your locale's CTYPE encoding
- * already. if not, terrible things will happen.
- *
- * for ruby 1.9, we explicitly convert it to the locale's CTYPE encoding,
- * like this:
- */
#ifdef HAVE_RUBY_ENCODING_H
+ // convert from whatever encoding it's in.
+ // TODO: do i have to use rb_protect to relay any exceptions?
v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil);
#endif
+
char* string = RSTRING_PTR(v_string);
- mbstate_t mbs; memset(&mbs, 0, sizeof(mbs));
long display_width = 0;
- long remaining_bytes = RSTRING_LEN(v_string);
- while(remaining_bytes > 0) {
- wchar_t wc;
- size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs);
- if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+ long strlen = RSTRING_LEN(v_string);
+ long offset = 0;
- int width = wcwidth(wc);
- /* sometimes this seems to happen! maybe it's not bad...
- if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
- */
+ while(offset < strlen) {
+ size_t num_bytes, num_cols;
+ int err = calc_width(string, strlen, offset, &num_bytes, &num_cols);
+ if(err) break;
- display_width += width;
- remaining_bytes -= num_bytes;
- string += num_bytes; // advance string pointer
+ display_width += num_cols;
+ offset += num_bytes;
}
return LONG2NUM(display_width);
}
@@ -65,33 +125,38 @@
/*
* call-seq:
* display_slice(string, start_offset, display_width=1, pad_string=" ")
*
- * Returns a slice of a string based on display width, rather than character or
- * bytes. I.e, the <code>start_offset</code> and <code>display_width</code>
- * offsets index the columns required to display the string, and not individual
+ * Returns a slice of a string, based on display width, rather than character
+ * or bytes. I.e., the <code>start_offset</code> and <code>display_width</code>
+ * offsets index the columns required to display the string, not individual
* characters or bytes.
*
* This is useful if you want to display a part of a string on screen, as you
- * can pull out a specific portion by its display size.
+ * can pull out a specific portion based on display size.
*
* Padding: slicing can truncate multi-column characters. If the slice
* truncates a character, the string will be padded with
* <code>pad_string</code>, on the left side, right side, or both, as
* necessary. If <code>pad_string</code> is <code>nil</code> then no padding
- * will be done. <code>pad_string</code> should be a single-column
- * string for this to make sense.
+ * will be done. <code>pad_string</code> should be a single-column string for
+ * this to make sense.
*
- * For Ruby 1.8, the string is assumed to be in the current locale's encoding.
- * If not, terrible things will happen.
+ * In Ruby 1.8, the input string is assumed to be in the current locale's
+ * encoding. If it isn't, an ArgumentError will be raised. Be sure to call
+ * init_locale! before calling this method! Otherwise every non-ASCII string
+ * will trigger an ArgumentError.
*
- * For Ruby 1.9, the string will be magically converted from whatever encoding
- * it is in, into the current locale's encoding, for processing. This may fail.
+ * In Ruby 1.9, the string will be automatically converted from its encoding
+ * into the current locale's encoding. Regardless of the original encoding, the
+ * returned string will be in the current locale's encoding.
*
- * The returned string WILL be in the current locale encoding, regardless of the
- * encoding of the original string.
+ * Throws an ArgumentError when it encounters an invalid character. On Ruby
+ * 1.8, this includes any string not in the current locale's encoding. On Ruby
+ * 1.9, this should only occur if Ruby is unable to convert the string from its
+ * encoding into the current locale's encoding.
*/
static VALUE display_slice(int argc, VALUE *argv, VALUE v_self) {
VALUE v_string, v_display_start, v_display_width, v_pad_string;
rb_scan_args(argc, argv, "22", &v_string, &v_display_start, &v_display_width, &v_pad_string);
Check_Type(v_string, T_STRING);
@@ -104,106 +169,101 @@
int display_width;
if(argc < 3) display_width = 1; // default value just like String#slice (although it makes slightly less sense)
else display_width = NUM2INT(v_display_width);
if(display_width < 0) return Qnil; // you fail
- char* pad_string;
+ const char* pad_string;
if(argc < 4) pad_string = default_pad_string; // only fill in default if unspecified; nil is a valid value
else if(v_pad_string == Qnil) pad_string = "";
else pad_string = RSTRING_PTR(v_pad_string);
- /* see comments in display_width() */
#ifdef HAVE_RUBY_ENCODING_H
+ // TODO: do i have to use rb_protect to relay any exceptions?
v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil);
#endif
char* string = RSTRING_PTR(v_string);
+ long slen = RSTRING_LEN(v_string);
- mbstate_t mbs; memset(&mbs, 0, sizeof(mbs));
- long remaining_bytes = RSTRING_LEN(v_string);
-
// first, advance the string pointer so that we've seen display_start width characters
long current_width = 0;
- while((remaining_bytes > 0) && (current_width < display_start)) {
- wchar_t wc;
- size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs);
- if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+ long offset = 0;
+ while((offset < slen) && (current_width < display_start)) {
+ size_t num_bytes, num_cols;
+ int err = calc_width(string, slen, offset, &num_bytes, &num_cols);
- int width = wcwidth(wc);
- /* sometimes this seems to happen! maybe it's not bad...
- if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
- */
-
- current_width += width;
- remaining_bytes -= num_bytes;
- string += num_bytes; // advance string pointer
+ current_width += num_cols;
+ offset += num_bytes;
}
/* here's a weird behavior (to me!) of String#slice that we emulate:
* if the start point is the string length itself, you get an empty
* string back; if the start point is greater than that, you get nil.
*/
if((current_width < display_start)) return Qnil;
/* determine left padding */
- char* pad_left = "";
+ const char* pad_left = "";
if((current_width > display_start) && (display_width > 0)) pad_left = pad_string;
// now, advance the string_end pointer so that we've seen an additional display_width width characters
- char* string_end = string;
+ long end_offset = offset;
current_width -= display_start;
- while((remaining_bytes > 0) && (current_width < display_width)) {
- wchar_t wc;
- size_t num_bytes = mbrtowc(&wc, string_end, remaining_bytes, &mbs);
- if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+ while((end_offset < slen) && (current_width < display_width)) {
+ size_t num_bytes, num_cols;
+ int err = calc_width(string, slen, end_offset, &num_bytes, &num_cols);
- int width = wcwidth(wc);
- /* sometimes this seems to happen! maybe it's not bad...
- if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
- */
+ if((current_width + num_cols) > (size_t)display_width) break; // have to stop here
- if((current_width + width) > display_width) break; // have to stop here
-
- current_width += width;
- remaining_bytes -= num_bytes;
- string_end += num_bytes; // advance string pointer
+ current_width += num_cols;
+ end_offset += num_bytes;
}
/* determine right padding */
- char* pad_right = "";
- if((current_width < display_width) && (remaining_bytes > 0)) pad_right = pad_string;
+ const char* pad_right = "";
+ if((current_width < display_width) && (end_offset < slen)) pad_right = pad_string;
// finally, construct a new string
- int bytesize = string_end - string;
+ int bytesize = end_offset - offset;
int leftsize = strlen(pad_left);
int rightsize = strlen(pad_right);
char* new_string = calloc(bytesize + leftsize + rightsize + 1, sizeof(char));
if(leftsize > 0) strcpy(new_string, pad_left);
- if(bytesize > 0) memcpy(new_string + leftsize, string, bytesize * sizeof(char));
+ if(bytesize > 0) memcpy(new_string + leftsize, string + offset, bytesize * sizeof(char));
if(rightsize > 0) strcpy(new_string + leftsize + bytesize, pad_right);
- (new_string + bytesize + leftsize + rightsize)[0] = 0;
+ (new_string + bytesize + leftsize + rightsize)[0] = 0;
+#ifdef HAVE_RUBY_ENCODING_H
return rb_enc_str_new(new_string, bytesize + leftsize + rightsize, rb_enc_get(v_string));
+#else
+ return rb_str_new(new_string, bytesize + leftsize + rightsize);
+#endif
}
/*
* A helper class for console-based programs that need to deal with non-ASCII
* code. If you are writing a curses/ncurses program, or otherwise care about
- * the number of characters on the screen, this is crucial stuff.
+ * the display width of characters on the screen, this is crucial stuff.
*
* Provides:
*
- * Console.display_width: get the number of display columns used by a string.
+ * Console.init_locale!: set the program's locale from the appropriate
+ * environment variables. (Ruby 1.8 programs must call this before calling any
+ * of the other methods. Ruby 1.9 programs can call it or skip it without
+ * effect.)
*
- * Console.display_slice: get a substrig by display column offset and size.
+ * Console.display_width: calculates the display width of a string
*
+ * Console.display_slice: returns a substring according to display offset
+ * and display width parameters.
+ *
*/
void Init_console() {
VALUE cConsole;
cConsole = rb_define_class("Console", rb_cObject);
rb_define_module_function(cConsole, "display_width", display_width, 1);
rb_define_module_function(cConsole, "display_slice", display_slice, -1);
+ rb_define_module_function(cConsole, "init_locale!", init_locale, 0);
}
-