/* * console.c -- unit tests for ruby Console library * Author: William Morgan (mailto: wmorgan-ruby-console@masanjin.net) * Copyright: Copyright 2010 William Morgan * License: same terms as Ruby itself */ #define _XOPEN_SOURCE #include #include #include #include /* * call-seq: display_width(string) * * Returns the display width of string, that is, the number of * columns that the string will take up when printed to screen. Note that this * is different from the number of characters (some characters take up one * column, some (e.g. Chinese characters) take up two columns), and the number * of bytes (e.g. UTF-8 is a multibyte encoding) in a string. * * For Ruby 1.8, the string is assumed to be in the current locale's encoding. * If not, terrible things will happen. * * For Ruby 1.9, the string will be magically converted from whatever encoding * it is in, into the current locale's encoding, for processing. This may fail. */ static VALUE display_width(VALUE v_self, VALUE v_string) { Check_Type(v_string, T_STRING); /* for ruby 1.8, we assume the string is in your locale's CTYPE encoding * already. if not, terrible things will happen. * * for ruby 1.9, we explicitly convert it to the locale's CTYPE encoding, * like this: */ #ifdef HAVE_RUBY_ENCODING_H v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil); #endif char* string = RSTRING_PTR(v_string); mbstate_t mbs; memset(&mbs, 0, sizeof(mbs)); long display_width = 0; long remaining_bytes = RSTRING_LEN(v_string); while(remaining_bytes > 0) { wchar_t wc; size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs); if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); int width = wcwidth(wc); if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character"); display_width += width; remaining_bytes -= num_bytes; string += num_bytes; // advance string pointer } return LONG2NUM(display_width); } static const char* default_pad_string = " "; /* * call-seq: * slice(string, start_offset, display_width=1, pad_string=" ") * * Returns a slice of a string based on display width, rather than character or * bytes. I.e, the start_offset and display_width * offsets index the columns required to display the string, and not individual * characters or bytes. * * This is useful if you want to display a part of a string on screen, as you * can pull out a specific portion by its display size. * * Padding: slicing can truncate multi-column characters. If the slice * truncates a character, the string will be padded with * pad_string, on the left side, right side, or both, as * necessary. If pad_string is nil then no padding * will be done. pad_string should be a single-column * string for this to make sense. * * For Ruby 1.8, the string is assumed to be in the current locale's encoding. * If not, terrible things will happen. * * For Ruby 1.9, the string will be magically converted from whatever encoding * it is in, into the current locale's encoding, for processing. This may fail. * * The returned string WILL be in the current locale encoding, regardless of the * encoding of the original string. */ static VALUE slice(int argc, VALUE *argv, VALUE v_self) { VALUE v_string, v_display_start, v_display_width, v_pad_string; rb_scan_args(argc, argv, "22", &v_string, &v_display_start, &v_display_width, &v_pad_string); Check_Type(v_string, T_STRING); /* try and mimic String#slice's argument handling as much as possible */ int display_start = NUM2INT(v_display_start); if(display_start < 0) display_start = NUM2LONG(display_width(v_self, v_string)) + display_start; // negative start means from the end of the string if(display_start < 0) return Qnil; // but if you go too far, you fail int display_width; if(argc < 3) display_width = 1; // default value just like String#slice (although it makes slightly less sense) else display_width = NUM2INT(v_display_width); if(display_width < 0) return Qnil; // you fail char* pad_string; if(argc < 4) pad_string = default_pad_string; // only fill in default if unspecified; nil is a valid value else if(v_pad_string == Qnil) pad_string = ""; else pad_string = RSTRING_PTR(v_pad_string); /* see comments in display_width() */ #ifdef HAVE_RUBY_ENCODING_H v_string = rb_str_encode(v_string, rb_enc_from_encoding(rb_locale_encoding()), 0, Qnil); #endif char* string = RSTRING_PTR(v_string); mbstate_t mbs; memset(&mbs, 0, sizeof(mbs)); long remaining_bytes = RSTRING_LEN(v_string); // first, advance the string pointer so that we've seen display_start width characters long current_width = 0; while((remaining_bytes > 0) && (current_width < display_start)) { wchar_t wc; size_t num_bytes = mbrtowc(&wc, string, remaining_bytes, &mbs); if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); int width = wcwidth(wc); if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character"); current_width += width; remaining_bytes -= num_bytes; string += num_bytes; // advance string pointer } /* here's a weird behavior (to me!) of String#slice that we emulate: * if the start point is the string length itself, you get an empty * string back; if the start point is greater than that, you get nil. */ if((current_width < display_start)) return Qnil; /* determine left padding */ char* pad_left = ""; if((current_width > display_start) && (display_width > 0)) pad_left = pad_string; // now, advance the string_end pointer so that we've seen an additional display_width width characters char* string_end = string; current_width -= display_start; while((remaining_bytes > 0) && (current_width < display_width)) { wchar_t wc; size_t num_bytes = mbrtowc(&wc, string_end, remaining_bytes, &mbs); if(num_bytes == 0) rb_raise(rb_eArgError, "malformed string: NULL byte at position %ld", RSTRING_LEN(v_string) - remaining_bytes); int width = wcwidth(wc); if(width == 0) rb_raise(rb_eArgError, "bad string: non-printable character"); if((current_width + width) > display_width) break; // have to stop here current_width += width; remaining_bytes -= num_bytes; string_end += num_bytes; // advance string pointer } /* determine right padding */ char* pad_right = ""; if((current_width < display_width) && (remaining_bytes > 0)) pad_right = pad_string; // finally, construct a new string int bytesize = string_end - string; int leftsize = strlen(pad_left); int rightsize = strlen(pad_right); char* new_string = calloc(bytesize + leftsize + rightsize + 1, sizeof(char)); if(leftsize > 0) strcpy(new_string, pad_left); if(bytesize > 0) memcpy(new_string + leftsize, string, bytesize * sizeof(char)); if(rightsize > 0) strcpy(new_string + leftsize + bytesize, pad_right); (new_string + bytesize + leftsize + rightsize)[0] = 0; return rb_enc_str_new(new_string, bytesize + leftsize + rightsize, rb_enc_get(v_string)); } /* * A helper class for console-based programs that need to deal with non-ASCII * code. If you are writing a curses/ncurses program, or otherwise care about * the number of characters on the screen, this is crucial stuff. * * Provides: * * Console.display_width: get the number of display columns used by a string. * * Console.slice: get a substrig by display column offset and size. * */ void Init_console() { VALUE cConsole; cConsole = rb_define_class("Console", rb_cObject); rb_define_module_function(cConsole, "display_width", display_width, 1); rb_define_module_function(cConsole, "slice", slice, -1); }