ext/re2/re2.cc in re2-0.6.0.pre vs ext/re2/re2.cc in re2-0.6.0

- old
+ new

@@ -1,10 +1,10 @@ /* * re2 (http://github.com/mudge/re2) * Ruby bindings to re2, an "efficient, principled regular expression library" * - * Copyright (c) 2010-2013, Paul Mucur (http://mudge.name) + * Copyright (c) 2010-2014, Paul Mucur (http://mudge.name) * Released under the BSD Licence, please see LICENSE.txt */ #include <re2/re2.h> #include <ruby.h> @@ -20,18 +20,27 @@ extern "C" { #ifdef HAVE_RUBY_ENCODING_H #include <ruby/encoding.h> #define ENCODED_STR_NEW(str, length, encoding) \ ({ \ - VALUE _string = rb_str_new((const char *)str, (long)length); \ - int _enc = rb_enc_find_index((int)encoding); \ - rb_enc_associate_index(_string, _enc); \ - _string; \ - }) + VALUE _string = rb_str_new(str, length); \ + int _enc = rb_enc_find_index(encoding); \ + rb_enc_associate_index(_string, _enc); \ + _string; \ + }) + #define ENCODED_STR_NEW2(str, length, str2) \ + ({ \ + VALUE _string = rb_str_new(str, length); \ + int _enc = rb_enc_get_index(str2); \ + rb_enc_associate_index(_string, _enc); \ + _string; \ + }) #else #define ENCODED_STR_NEW(str, length, encoding) \ rb_str_new((const char *)str, (long)length) + #define ENCODED_STR_NEW2(str, length, str2) \ + rb_str_new((const char *)str, (long)length) #endif #define BOOL2RUBY(v) (v ? Qtrue : Qfalse) #define UNUSED(x) ((void)x) @@ -60,16 +69,16 @@ int number_of_matches; VALUE regexp, text; } re2_matchdata; typedef struct { - re2::StringPiece input; - int argc; + re2::StringPiece *input; + int number_of_capturing_groups; VALUE regexp, text; - } re2_consumer; + } re2_scanner; - VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cConsumer; + VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner; /* Symbols used in RE2 options. */ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors, id_max_mem, id_literal, id_never_nl, id_case_sensitive, id_perl_classes, id_word_boundary, id_one_line; @@ -84,16 +93,19 @@ delete[] self->matches; } free(self); } - void re2_consumer_mark(re2_consumer* self) { + void re2_scanner_mark(re2_scanner* self) { rb_gc_mark(self->regexp); rb_gc_mark(self->text); } - void re2_consumer_free(re2_consumer* self) { + void re2_scanner_free(re2_scanner* self) { + if (self->input) { + delete self->input; + } free(self); } void re2_regexp_free(re2_pattern* self) { if (self->pattern) { @@ -106,14 +118,14 @@ re2_matchdata *m; return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark, re2_matchdata_free, m); } - static VALUE re2_consumer_allocate(VALUE klass) { - re2_consumer *c; - return Data_Make_Struct(klass, re2_consumer, re2_consumer_mark, - re2_consumer_free, c); + static VALUE re2_scanner_allocate(VALUE klass) { + re2_scanner *c; + return Data_Make_Struct(klass, re2_scanner, re2_scanner_mark, + re2_scanner_free, c); } /* * Returns a frozen copy of the string passed into +match+. * @@ -128,76 +140,77 @@ return m->text; } /* - * Returns the string passed into the consumer. + * Returns the string passed into the scanner. * * @return [String] the original string. * @example - * c = RE2::Regexp.new('(\d+)').consume("foo") + * c = RE2::Regexp.new('(\d+)').scan("foo") * c.string #=> "foo" */ - static VALUE re2_consumer_string(VALUE self) { - re2_consumer *c; - Data_Get_Struct(self, re2_consumer, c); + static VALUE re2_scanner_string(VALUE self) { + re2_scanner *c; + Data_Get_Struct(self, re2_scanner, c); return c->text; } /* - * Rewind the consumer to the start of the string. + * Rewind the scanner to the start of the string. * * @example - * c = RE2::Regexp.new('(\d+)').consume("1 2 3") - * e = c.to_enum - * e.next #=> ["1"] - * e.next #=> ["2"] - * c.rewind - * e.next #=> ["1"] + * s = RE2::Regexp.new('(\d+)').scan("1 2 3") + * e = s.to_enum + * e.scan #=> ["1"] + * e.scan #=> ["2"] + * s.rewind + * e.scan #=> ["1"] */ - static VALUE re2_consumer_rewind(VALUE self) { - re2_consumer *c; - Data_Get_Struct(self, re2_consumer, c); - re2::StringPiece input(RSTRING_PTR(c->text)); + static VALUE re2_scanner_rewind(VALUE self) { + re2_scanner *c; + Data_Get_Struct(self, re2_scanner, c); - c->input = input; + c->input = new(nothrow) re2::StringPiece(StringValuePtr(c->text)); return self; } /* * Scan the given text incrementally for matches, returning an array of * matches on each subsequent call. Returns nil if no matches are found. * * @return [Array<String>] the matches. * @example - * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz") - * c.consume #=> ["Foo"] - * c.consume #=> ["bar"] + * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz") + * s.scan #=> ["Foo"] + * s.scan #=> ["bar"] */ - static VALUE re2_consumer_consume(VALUE self) { + static VALUE re2_scanner_scan(VALUE self) { int i; re2_pattern *p; - re2_consumer *c; + re2_scanner *c; VALUE result; - Data_Get_Struct(self, re2_consumer, c); + Data_Get_Struct(self, re2_scanner, c); Data_Get_Struct(c->regexp, re2_pattern, p); - vector<RE2::Arg> argv(c->argc); - vector<RE2::Arg*> args(c->argc); - vector<string> matches(c->argc); + vector<RE2::Arg> argv(c->number_of_capturing_groups); + vector<RE2::Arg*> args(c->number_of_capturing_groups); + vector<string> matches(c->number_of_capturing_groups); - for (i = 0; i < c->argc; i++) { - args[i] = &argv[i]; + for (i = 0; i < c->number_of_capturing_groups; i++) { + matches[i] = ""; argv[i] = &matches[i]; + args[i] = &argv[i]; } - if (RE2::FindAndConsumeN(&c->input, *p->pattern, &args[0], c->argc)) { - result = rb_ary_new2(c->argc); - for (i = 0; i < c->argc; i++) { + if (RE2::FindAndConsumeN(c->input, *p->pattern, &args[0], + c->number_of_capturing_groups)) { + result = rb_ary_new2(c->number_of_capturing_groups); + for (i = 0; i < c->number_of_capturing_groups; i++) { if (matches[i].empty()) { rb_ary_push(result, Qnil); } else { rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(), matches[i].size(), @@ -240,20 +253,20 @@ Data_Get_Struct(self, re2_matchdata, m); return m->regexp; } /* - * Returns the {RE2::Regexp} used in the consumer. + * Returns the {RE2::Regexp} used in the scanner. * - * @return [RE2::Regexp] the regexp used in the consumer + * @return [RE2::Regexp] the regexp used in the scanner * @example - * c = RE2::Regexp.new('(\d+)').consume("bob 123") + * c = RE2::Regexp.new('(\d+)').scan("bob 123") * c.regexp #=> #<RE2::Regexp /(\d+)/> */ - static VALUE re2_consumer_regexp(VALUE self) { - re2_consumer *c; - Data_Get_Struct(self, re2_consumer, c); + static VALUE re2_scanner_regexp(VALUE self) { + re2_scanner *c; + Data_Get_Struct(self, re2_scanner, c); return c->regexp; } static VALUE re2_regexp_allocate(VALUE klass) { @@ -1059,130 +1072,104 @@ return re2_regexp_match(2, argv, self); } /* - * Returns a {RE2::Consumer} for scanning the given text incrementally. + * Returns a {RE2::Scanner} for scanning the given text incrementally. * * @example - * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz") + * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz") */ - static VALUE re2_regexp_consume(VALUE self, VALUE text) { + static VALUE re2_regexp_scan(VALUE self, VALUE text) { re2_pattern *p; - re2_consumer *c; - VALUE consumer; - re2::StringPiece input(RSTRING_PTR(text)); + re2_scanner *c; + VALUE scanner; Data_Get_Struct(self, re2_pattern, p); - consumer = rb_class_new_instance(0, 0, re2_cConsumer); - Data_Get_Struct(consumer, re2_consumer, c); - c->input = input; + scanner = rb_class_new_instance(0, 0, re2_cScanner); + Data_Get_Struct(scanner, re2_scanner, c); + + c->input = new(nothrow) re2::StringPiece(StringValuePtr(text)); c->regexp = self; c->text = text; - c->argc = p->pattern->NumberOfCapturingGroups(); + c->number_of_capturing_groups = p->pattern->NumberOfCapturingGroups(); - return consumer; + return scanner; } /* - * Replaces the first occurrence +pattern+ in +str+ with - * +rewrite+ <i>in place</i>. + * Returns a copy of +str+ with the first occurrence +pattern+ + * replaced with +rewrite+. * * @param [String] str the string to modify * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced * @param [String] rewrite the string to replace with * @return [String] the resulting string * @example * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there" * re2 = RE2.new("hel+o") * RE2.Replace("hello there", re2, "yo") #=> "yo there" - * text = "Good morning" - * RE2.Replace(text, "morn", "even") #=> "Good evening" - * text #=> "Good evening" */ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite) { - - /* Look out for frozen strings. */ - rb_check_frozen(str); - UNUSED(self); - VALUE repl; re2_pattern *p; /* Convert all the inputs to be pumped into RE2::Replace. */ string str_as_string(StringValuePtr(str)); /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { Data_Get_Struct(pattern, re2_pattern, p); RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite)); + + return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); } else { RE2::Replace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite)); - } - /* Save the replacement as a VALUE. */ - repl = rb_str_new(str_as_string.data(), str_as_string.size()); - - /* Replace the original string with the replacement. */ - if (RSTRING_LEN(str) != RSTRING_LEN(repl)) { - rb_str_resize(str, RSTRING_LEN(repl)); + return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(), + pattern); } - memcpy(RSTRING_PTR(str), RSTRING_PTR(repl), RSTRING_LEN(repl)); - return str; } /* - * Replaces every occurrence of +pattern+ in +str+ with - * +rewrite+ <i>in place</i>. + * Return a copy of +str+ with +pattern+ replaced by +rewrite+. * * @param [String] str the string to modify * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced * @param [String] rewrite the string to replace with * @return [String] the resulting string * @example - * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri" * re2 = RE2.new("oo?") * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps" - * text = "Good morning" - * RE2.GlobalReplace(text, "o", "ee") #=> "Geeeed meerning" - * text #=> "Geeeed meerning" + * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri" */ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite) { - - /* Look out for frozen strings. */ - rb_check_frozen(str); - UNUSED(self); /* Convert all the inputs to be pumped into RE2::GlobalReplace. */ re2_pattern *p; string str_as_string(StringValuePtr(str)); - VALUE repl; /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { Data_Get_Struct(pattern, re2_pattern, p); RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite)); + + return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); } else { RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite)); - } - /* Save the replacement as a VALUE. */ - repl = rb_str_new(str_as_string.data(), str_as_string.size()); - - /* Replace the original string with the replacement. */ - if (RSTRING_LEN(str) != RSTRING_LEN(repl)) { - rb_str_resize(str, RSTRING_LEN(repl)); + return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(), + pattern); } - memcpy(RSTRING_PTR(str), RSTRING_PTR(repl), RSTRING_LEN(repl)); - - return str; } /* * Returns a version of str with all potentially meaningful regexp * characters escaped. The returned string, used as a regular @@ -1201,17 +1188,17 @@ void Init_re2(void) { re2_mRE2 = rb_define_module("RE2"); re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject); re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject); - re2_cConsumer = rb_define_class_under(re2_mRE2, "Consumer", rb_cObject); + re2_cScanner = rb_define_class_under(re2_mRE2, "Scanner", rb_cObject); rb_define_alloc_func(re2_cRegexp, (VALUE (*)(VALUE))re2_regexp_allocate); rb_define_alloc_func(re2_cMatchData, (VALUE (*)(VALUE))re2_matchdata_allocate); - rb_define_alloc_func(re2_cConsumer, - (VALUE (*)(VALUE))re2_consumer_allocate); + rb_define_alloc_func(re2_cScanner, + (VALUE (*)(VALUE))re2_scanner_allocate); rb_define_method(re2_cMatchData, "string", RUBY_METHOD_FUNC(re2_matchdata_string), 0); rb_define_method(re2_cMatchData, "regexp", RUBY_METHOD_FUNC(re2_matchdata_regexp), 0); @@ -1225,18 +1212,18 @@ -1); rb_define_method(re2_cMatchData, "to_s", RUBY_METHOD_FUNC(re2_matchdata_to_s), 0); rb_define_method(re2_cMatchData, "inspect", RUBY_METHOD_FUNC(re2_matchdata_inspect), 0); - rb_define_method(re2_cConsumer, "string", - RUBY_METHOD_FUNC(re2_consumer_string), 0); - rb_define_method(re2_cConsumer, "regexp", - RUBY_METHOD_FUNC(re2_consumer_regexp), 0); - rb_define_method(re2_cConsumer, "consume", - RUBY_METHOD_FUNC(re2_consumer_consume), 0); - rb_define_method(re2_cConsumer, "rewind", - RUBY_METHOD_FUNC(re2_consumer_rewind), 0); + rb_define_method(re2_cScanner, "string", + RUBY_METHOD_FUNC(re2_scanner_string), 0); + rb_define_method(re2_cScanner, "regexp", + RUBY_METHOD_FUNC(re2_scanner_regexp), 0); + rb_define_method(re2_cScanner, "scan", + RUBY_METHOD_FUNC(re2_scanner_scan), 0); + rb_define_method(re2_cScanner, "rewind", + RUBY_METHOD_FUNC(re2_scanner_rewind), 0); rb_define_method(re2_cRegexp, "initialize", RUBY_METHOD_FUNC(re2_regexp_initialize), -1); rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0); rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error), @@ -1257,10 +1244,11 @@ RUBY_METHOD_FUNC(re2_regexp_match_query), 1); rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_query), 1); rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_query), 1); - rb_define_method(re2_cRegexp, "consume", RUBY_METHOD_FUNC(re2_regexp_consume), 1); + rb_define_method(re2_cRegexp, "scan", + RUBY_METHOD_FUNC(re2_regexp_scan), 1); rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);