console.c in console-0.3

- old
+ new

@@ -1,63 +1,123 @@
 /*
- * console.c -- unit tests for ruby Console library
+ * console.c -- ruby console library
  * Author:     William Morgan (mailto: wmorgan-ruby-console@masanjin.net)
  * Copyright:  Copyright 2010 William Morgan
  * License:    same terms as Ruby itself
  */
 
-#define _XOPEN_SOURCE
 #include <wchar.h>
 #include <stdlib.h>
 #include <ruby.h>
+#include <locale.h>
+
+#ifdef HAVE_RUBY_ENCODING_H
 #include <ruby/encoding.h>
+#endif
 
+/* Decodes the multibyte character at string + byte_offset: stores its byte
+ * length in *num_bytes and its column width in *num_cols. Raises ArgumentError
+ * on incomplete, invalid or NUL input; otherwise returns 0. */
+static inline int calc_width(char* string, long strlen, long byte_offset, size_t* num_bytes, size_t* num_cols) {
+  wchar_t wc;
+  mbstate_t mbs; /* private state: a failed decode must not poison later calls */
+  int width;
+
+  memset(&mbs, 0, sizeof(mbs));
+  *num_bytes = mbrtowc(&wc, string + byte_offset, strlen - byte_offset, &mbs);
+
+  /* rb_raise does not return, so no error code is needed after it */
+  if(*num_bytes == (size_t)-2) {
+    rb_raise(rb_eArgError, "malformed string: incomplete multibyte character at position %ld", byte_offset);
+  }
+  else if(*num_bytes == (size_t)-1) {
+    rb_raise(rb_eArgError, "malformed string: invalid multibyte character at position %ld", byte_offset);
+  }
+  else if(*num_bytes == 0) {
+    rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", byte_offset);
+  }
+
+  width = wcwidth(wc); /* -1 for non-printable characters */
+  *num_cols = (width > 0) ? (size_t)width : 0; /* never let -1 wrap to SIZE_MAX */
+  return 0;
+}
+
 /*
+ * call-seq: init_locale!
+ *
+ * Sets the program's current locale from the appropriate environment variables.
+ * (see `man 3 setlocale` for details).
+ *
+ * Equivalent to:
+ *   char* old_locale = setlocale(LC_ALL, NULL);
+ *   setlocale(LC_ALL, "");
+ *   return old_locale;
+ * in C.
+ *
+ * If you are using Ruby 1.8, you *must* call this at least once before calling
+ * the other methods in this package. Otherwise, non-ASCII strings will be
+ * considered invalid, and #display_width and #display_slice will raise
+ * ArgumentErrors.
+ *
+ * Ruby 1.9 users do not need to call this, since Ruby 1.9 appears to set the
+ * locale in this manner already. Calling it won't matter, however.
+ *
+ * Returns a string representing the old locale. If you wish to change locales
+ * several times, you can use this value to return to the previous locale.
+ * Otherwise, just ignore it.
+ */
+
+static VALUE init_locale(VALUE v_self) {
+  /* copy now: the next setlocale() call may overwrite the returned buffer */
+  VALUE v_old_locale = rb_str_new2(setlocale(LC_ALL, NULL));
+  setlocale(LC_ALL, ""); // set ctype locale according to appropriate env vars
+  return v_old_locale;
+}
+
+/*
  * call-seq: display_width(string)
- * 
+ *
  * Returns the display width of <code>string</code>, that is, the number of
- * columns that the string will take up when printed to screen. Note that this
- * is different from the number of characters (some characters take up one
- * column, some (e.g. Chinese characters) take up two columns), and the number
- * of bytes (e.g. UTF-8 is a multibyte encoding) in a string.
+ * columns that the string will take up when printed to screen. This is
+ * different from both the number of characters and the number of bytes in a
+ * string.
  *
- * For Ruby 1.8, the string is assumed to be in the current locale's encoding.
- * If not, terrible things will happen.
+ * In Ruby 1.8, the input string is assumed to be in the current locale's
+ * encoding.  If it isn't, an ArgumentError will be raised. Be sure to call
+ * init_locale! before calling this method! Otherwise every non-ASCII string
+ * will trigger an ArgumentError.
  *
- * For Ruby 1.9, the string will be magically converted from whatever encoding
- * it is in, into the current locale's encoding, for processing. This may fail.
+ * In Ruby 1.9, the string will be automatically converted from its encoding
+ * into the current locale's encoding for processing.
+ *
+ * Throws an ArgumentError when it encounters an invalid character. On Ruby
+ * 1.8, this includes any string not in the current locale's encoding. On Ruby
+ * 1.9, this should only occur if Ruby is unable to convert the string from its
+ * encoding into the current locale's encoding.
  */
 static VALUE display_width(VALUE v_self, VALUE v_string) {
   Check_Type(v_string, T_STRING);
 
-  /* for ruby 1.8, we assume the string is in your locale's CTYPE encoding
-   * already. if not, terrible things will happen.
-   *
-   * for ruby 1.9, we explicitly convert it to the locale's CTYPE encoding,
-   * like this:
-   */
 #ifdef HAVE_RUBY_ENCODING_H
+  // re-encode from the string's own encoding into the locale's encoding, which calc_width's mbrtowc() call decodes
+  // TODO: do i have to use rb_protect to relay any exceptions?
   v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil);
 #endif
+
   char* string = RSTRING_PTR(v_string);
 
-  mbstate_t mbs; memset(&mbs, 0, sizeof(mbs));
   long display_width = 0;
-  long remaining_bytes = RSTRING_LEN(v_string);
-  while(remaining_bytes > 0) {
-    wchar_t wc;
-    size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs);
-    if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+  long strlen = RSTRING_LEN(v_string); // length in bytes; note this local shadows strlen() inside this function
+  long offset = 0; // byte offset of the next character to measure
 
-    int width = wcwidth(wc);
-    /* sometimes this seems to happen! maybe it's not bad...
-    if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
-    */
+  while(offset < strlen) {
+    size_t num_bytes, num_cols;
+    int err = calc_width(string, strlen, offset, &num_bytes, &num_cols);
+    if(err) break; // NOTE(review): calc_width calls rb_raise before every nonzero return, so this looks unreachable -- confirm
 
-    display_width += width;
-    remaining_bytes -= num_bytes;
-    string += num_bytes; // advance string pointer
+    display_width += num_cols; // accumulate columns for this character
+    offset += num_bytes; // step past the bytes just decoded
   }
 
   return LONG2NUM(display_width);
 }
 
@@ -65,33 +125,38 @@
 
 /*
  * call-seq:
  *   display_slice(string, start_offset, display_width=1, pad_string=" ")
  *
- * Returns a slice of a string based on display width, rather than character or
- * bytes. I.e, the <code>start_offset</code> and <code>display_width</code>
- * offsets index the columns required to display the string, and not individual
+ * Returns a slice of a string, based on display width, rather than characters
+ * or bytes. I.e., the <code>start_offset</code> and <code>display_width</code>
+ * offsets index the columns required to display the string, not individual
  * characters or bytes.
  *
  * This is useful if you want to display a part of a string on screen, as you
- * can pull out a specific portion by its display size.
+ * can pull out a specific portion based on display size.
  *
  * Padding: slicing can truncate multi-column characters. If the slice
  * truncates a character, the string will be padded with
  * <code>pad_string</code>, on the left side, right side, or both, as
  * necessary. If <code>pad_string</code> is <code>nil</code> then no padding
- * will be done. <code>pad_string</code> should be a single-column
- * string for this to make sense.
+ * will be done. <code>pad_string</code> should be a single-column string for
+ * this to make sense.
  *
- * For Ruby 1.8, the string is assumed to be in the current locale's encoding.
- * If not, terrible things will happen.
+ * In Ruby 1.8, the input string is assumed to be in the current locale's
+ * encoding.  If it isn't, an ArgumentError will be raised. Be sure to call
+ * init_locale! before calling this method! Otherwise every non-ASCII string
+ * will trigger an ArgumentError.
  *
- * For Ruby 1.9, the string will be magically converted from whatever encoding
- * it is in, into the current locale's encoding, for processing. This may fail.
+ * In Ruby 1.9, the string will be automatically converted from its encoding
+ * into the current locale's encoding. Regardless of the original encoding, the
+ * returned string will be in the current locale's encoding.
  *
- * The returned string WILL be in the current locale encoding, regardless of the
- * encoding of the original string.
+ * Throws an ArgumentError when it encounters an invalid character. On Ruby
+ * 1.8, this includes any string not in the current locale's encoding. On Ruby
+ * 1.9, this should only occur if Ruby is unable to convert the string from its
+ * encoding into the current locale's encoding.
  */
 static VALUE display_slice(int argc, VALUE *argv, VALUE v_self) {
   VALUE v_string, v_display_start, v_display_width, v_pad_string;
   rb_scan_args(argc, argv, "22", &v_string, &v_display_start, &v_display_width, &v_pad_string);
   Check_Type(v_string, T_STRING);
@@ -104,106 +169,101 @@
   int display_width;
   if(argc < 3) display_width = 1; // default value just like String#slice (although it makes slightly less sense)
   else display_width = NUM2INT(v_display_width);
   if(display_width < 0) return Qnil; // you fail
 
-  char* pad_string;
+  const char* pad_string;
   if(argc < 4) pad_string = default_pad_string; // only fill in default if unspecified; nil is a valid value
   else if(v_pad_string == Qnil) pad_string = "";
   else pad_string = RSTRING_PTR(v_pad_string);
 
-  /* see comments in display_width() */
 #ifdef HAVE_RUBY_ENCODING_H
+  // TODO: do i have to use rb_protect to relay any exceptions?
   v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil);
 #endif
   char* string = RSTRING_PTR(v_string);
+  long slen = RSTRING_LEN(v_string);
 
-  mbstate_t mbs; memset(&mbs, 0, sizeof(mbs));
-  long remaining_bytes = RSTRING_LEN(v_string);
-
   // first, advance the string pointer so that we've seen display_start width characters
   long current_width = 0;
-  while((remaining_bytes > 0) && (current_width < display_start)) {
-    wchar_t wc;
-    size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs);
-    if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+  long offset = 0;
+  while((offset < slen) && (current_width < display_start)) {
+    size_t num_bytes, num_cols;
+    int err = calc_width(string, slen, offset, &num_bytes, &num_cols);
 
-    int width = wcwidth(wc);
-    /* sometimes this seems to happen! maybe it's not bad...
-    if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
-    */
-
-    current_width += width;
-    remaining_bytes -= num_bytes;
-    string += num_bytes; // advance string pointer
+    current_width += num_cols;
+    offset += num_bytes;
   }
 
   /* here's a weird behavior (to me!) of String#slice that we emulate:
    * if the start point is the string length itself, you get an empty
    * string back; if the start point is greater than that, you get nil.
    */
   if((current_width < display_start)) return Qnil;
 
   /* determine left padding */
-  char* pad_left = "";
+  const char* pad_left = "";
   if((current_width > display_start) && (display_width > 0)) pad_left = pad_string;
 
   // now, advance the string_end pointer so that we've seen an additional display_width width characters
-  char* string_end = string;
+  long end_offset = offset;
   current_width -= display_start;
-  while((remaining_bytes > 0) && (current_width < display_width)) {
-    wchar_t wc;
-    size_t num_bytes = mbrtowc(&wc, string_end, remaining_bytes, &mbs);
-    if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
+  while((end_offset < slen) && (current_width < display_width)) {
+    size_t num_bytes, num_cols;
+    int err = calc_width(string, slen, end_offset, &num_bytes, &num_cols);
 
-    int width = wcwidth(wc);
-    /* sometimes this seems to happen! maybe it's not bad...
-    if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character at position %ld", RSTRING_LEN(v_string) - remaining_bytes);
-    */
+    if((current_width + num_cols) > (size_t)display_width) break; // have to stop here
 
-    if((current_width + width) > display_width) break; // have to stop here
-
-    current_width += width;
-    remaining_bytes -= num_bytes;
-    string_end += num_bytes; // advance string pointer
+    current_width += num_cols;
+    end_offset += num_bytes;
   }
 
   /* determine right padding */
-  char* pad_right = "";
-  if((current_width < display_width) && (remaining_bytes > 0)) pad_right = pad_string;
+  const char* pad_right = "";
+  if((current_width < display_width) && (end_offset < slen)) pad_right = pad_string;
 
   // finally, construct a new string
-  int bytesize = string_end - string;
+  int bytesize = end_offset - offset;
   int leftsize = strlen(pad_left);
   int rightsize = strlen(pad_right);
 
   char* new_string = calloc(bytesize + leftsize + rightsize + 1, sizeof(char));
   if(leftsize > 0) strcpy(new_string, pad_left);
-  if(bytesize > 0) memcpy(new_string + leftsize, string, bytesize * sizeof(char));
+  if(bytesize > 0) memcpy(new_string + leftsize, string + offset, bytesize * sizeof(char));
   if(rightsize > 0) strcpy(new_string + leftsize + bytesize, pad_right);
 
-  (new_string + bytesize + leftsize + rightsize)[0] = 0; 
+  (new_string + bytesize + leftsize + rightsize)[0] = 0;
 
+#ifdef HAVE_RUBY_ENCODING_H
   return rb_enc_str_new(new_string, bytesize + leftsize + rightsize, rb_enc_get(v_string));
+#else
+  return rb_str_new(new_string, bytesize + leftsize + rightsize);
+#endif
 }
 
 /*
  * A helper class for console-based programs that need to deal with non-ASCII
  * code. If you are writing a curses/ncurses program, or otherwise care about
- * the number of characters on the screen, this is crucial stuff.
+ * the display width of characters on the screen, this is crucial stuff.
  *
  * Provides:
  *
- * Console.display_width: get the number of display columns used by a string.
+ * Console.init_locale!: set the program's locale from the appropriate
+ * environment variables. (Ruby 1.8 programs must call this before calling any
+ * of the other methods. Ruby 1.9 programs can call it or skip it without
+ * effect.)
  *
- * Console.display_slice: get a substrig by display column offset and size.
+ * Console.display_width: calculates the display width of a string
  *
+ * Console.display_slice: returns a substring according to display offset
+ * and display width parameters.
+ *
  */
 
 void Init_console() {
   VALUE cConsole;
 
   cConsole = rb_define_class("Console", rb_cObject);
   rb_define_module_function(cConsole, "display_width", display_width, 1);
   rb_define_module_function(cConsole, "display_slice", display_slice, -1);
+  rb_define_module_function(cConsole, "init_locale!", init_locale, 0); // 0: init_locale takes no arguments besides self
 }
-