ext/re2/re2.cc in re2-0.5.0 vs ext/re2/re2.cc in re2-0.6.0.pre

- old
+ new

@@ -1,22 +1,39 @@ /* * re2 (http://github.com/mudge/re2) * Ruby bindings to re2, an "efficient, principled regular expression library" * - * Copyright (c) 2010-2012, Paul Mucur (http://mudge.name) + * Copyright (c) 2010-2013, Paul Mucur (http://mudge.name) * Released under the BSD Licence, please see LICENSE.txt */ #include <re2/re2.h> +#include <ruby.h> #include <string> #include <sstream> -using namespace std; +#include <vector> +using std::string; +using std::ostringstream; +using std::nothrow; +using std::map; +using std::vector; extern "C" { + #ifdef HAVE_RUBY_ENCODING_H + #include <ruby/encoding.h> + #define ENCODED_STR_NEW(str, length, encoding) \ + ({ \ + VALUE _string = rb_str_new((const char *)str, (long)length); \ + int _enc = rb_enc_find_index((int)encoding); \ + rb_enc_associate_index(_string, _enc); \ + _string; \ + }) + #else + #define ENCODED_STR_NEW(str, length, encoding) \ + rb_str_new((const char *)str, (long)length) + #endif - #include <ruby.h> - #define BOOL2RUBY(v) (v ? Qtrue : Qfalse) #define UNUSED(x) ((void)x) #ifndef RSTRING_LEN #define RSTRING_LEN(x) (RSTRING(x)->len) @@ -42,76 +59,170 @@ re2::StringPiece *matches; int number_of_matches; VALUE regexp, text; } re2_matchdata; - VALUE re2_mRE2, re2_cRegexp, re2_cMatchData; + typedef struct { + re2::StringPiece input; + int argc; + VALUE regexp, text; + } re2_consumer; + VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cConsumer; + /* Symbols used in RE2 options. */ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors, id_max_mem, id_literal, id_never_nl, id_case_sensitive, id_perl_classes, id_word_boundary, id_one_line; - void re2_matchdata_mark(re2_matchdata* self) - { + void re2_matchdata_mark(re2_matchdata* self) { rb_gc_mark(self->regexp); rb_gc_mark(self->text); } - void re2_matchdata_free(re2_matchdata* self) - { + void re2_matchdata_free(re2_matchdata* self) { if (self->matches) { delete[] self->matches; } free(self); } - void - re2_regexp_free(re2_pattern* self) - { + void re2_consumer_mark(re2_consumer* self) { + rb_gc_mark(self->regexp); + rb_gc_mark(self->text); + } + + void re2_consumer_free(re2_consumer* self) { + free(self); + } + + void re2_regexp_free(re2_pattern* self) { if (self->pattern) { delete self->pattern; } free(self); } - static VALUE - re2_matchdata_allocate(VALUE klass) - { + static VALUE re2_matchdata_allocate(VALUE klass) { re2_matchdata *m; - return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark, re2_matchdata_free, m); + return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark, + re2_matchdata_free, m); } + static VALUE re2_consumer_allocate(VALUE klass) { + re2_consumer *c; + return Data_Make_Struct(klass, re2_consumer, re2_consumer_mark, + re2_consumer_free, c); + } + /* * Returns a frozen copy of the string passed into +match+. * * @return [String] a frozen copy of the passed string. * @example * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.string #=> "bob 123" */ - static VALUE - re2_matchdata_string(VALUE self) - { + static VALUE re2_matchdata_string(VALUE self) { re2_matchdata *m; Data_Get_Struct(self, re2_matchdata, m); return m->text; } /* + * Returns the string passed into the consumer. + * + * @return [String] the original string. + * @example + * c = RE2::Regexp.new('(\d+)').consume("foo") + * c.string #=> "foo" + */ + static VALUE re2_consumer_string(VALUE self) { + re2_consumer *c; + Data_Get_Struct(self, re2_consumer, c); + + return c->text; + } + + /* + * Rewind the consumer to the start of the string. + * + * @example + * c = RE2::Regexp.new('(\d+)').consume("1 2 3") + * e = c.to_enum + * e.next #=> ["1"] + * e.next #=> ["2"] + * c.rewind + * e.next #=> ["1"] + */ + static VALUE re2_consumer_rewind(VALUE self) { + re2_consumer *c; + Data_Get_Struct(self, re2_consumer, c); + re2::StringPiece input(RSTRING_PTR(c->text)); + + c->input = input; + + return self; + } + + /* + * Scan the given text incrementally for matches, returning an array of + * matches on each subsequent call. Returns nil if no matches are found. + * + * @return [Array<String>] the matches. + * @example + * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz") + * c.consume #=> ["Foo"] + * c.consume #=> ["bar"] + */ + static VALUE re2_consumer_consume(VALUE self) { + int i; + re2_pattern *p; + re2_consumer *c; + VALUE result; + + Data_Get_Struct(self, re2_consumer, c); + Data_Get_Struct(c->regexp, re2_pattern, p); + + vector<RE2::Arg> argv(c->argc); + vector<RE2::Arg*> args(c->argc); + vector<string> matches(c->argc); + + for (i = 0; i < c->argc; i++) { + args[i] = &argv[i]; + argv[i] = &matches[i]; + } + + if (RE2::FindAndConsumeN(&c->input, *p->pattern, &args[0], c->argc)) { + result = rb_ary_new2(c->argc); + for (i = 0; i < c->argc; i++) { + if (matches[i].empty()) { + rb_ary_push(result, Qnil); + } else { + rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(), + matches[i].size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1")); + } + } + } else { + result = Qnil; + } + + return result; + } + + /* * Returns the number of elements in the match array (including nils). * * @return [Fixnum] the number of elements * @example * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.size #=> 2 * m.length #=> 2 */ - static VALUE - re2_matchdata_size(VALUE self) - { + static VALUE re2_matchdata_size(VALUE self) { re2_matchdata *m; Data_Get_Struct(self, re2_matchdata, m); return INT2FIX(m->number_of_matches); } @@ -122,21 +233,32 @@ * @return [RE2::Regexp] the regexp used in the match * @example * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.regexp #=> #<RE2::Regexp /(\d+)/> */ - static VALUE - re2_matchdata_regexp(VALUE self) - { + static VALUE re2_matchdata_regexp(VALUE self) { re2_matchdata *m; Data_Get_Struct(self, re2_matchdata, m); return m->regexp; } - static VALUE - re2_regexp_allocate(VALUE klass) - { + /* + * Returns the {RE2::Regexp} used in the consumer. + * + * @return [RE2::Regexp] the regexp used in the consumer + * @example + * c = RE2::Regexp.new('(\d+)').consume("bob 123") + * c.regexp #=> #<RE2::Regexp /(\d+)/> + */ + static VALUE re2_consumer_regexp(VALUE self) { + re2_consumer *c; + Data_Get_Struct(self, re2_consumer, c); + + return c->regexp; + } + + static VALUE re2_regexp_allocate(VALUE klass) { re2_pattern *p; return Data_Make_Struct(klass, re2_pattern, 0, re2_regexp_free, p); } /* @@ -145,57 +267,57 @@ * @return [Array<String, nil>] the array of matches * @example * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.to_a #=> ["123", "123"] */ - static VALUE - re2_matchdata_to_a(VALUE self) - { + static VALUE re2_matchdata_to_a(VALUE self) { int i; re2_matchdata *m; + re2_pattern *p; re2::StringPiece match; VALUE array; Data_Get_Struct(self, re2_matchdata, m); + Data_Get_Struct(m->regexp, re2_pattern, p); array = rb_ary_new2(m->number_of_matches); for (i = 0; i < m->number_of_matches; i++) { if (m->matches[i].empty()) { rb_ary_push(array, Qnil); } else { match = m->matches[i]; - rb_ary_push(array, rb_str_new(match.data(), match.size())); + rb_ary_push(array, ENCODED_STR_NEW(match.data(), match.size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1")); } } return array; } - static VALUE - re2_matchdata_nth_match(int nth, VALUE self) - { + static VALUE re2_matchdata_nth_match(int nth, VALUE self) { re2_matchdata *m; + re2_pattern *p; re2::StringPiece match; Data_Get_Struct(self, re2_matchdata, m); + Data_Get_Struct(m->regexp, re2_pattern, p); if (nth < 0 || nth >= m->number_of_matches) { return Qnil; } else { match = m->matches[nth]; if (match.empty()) { return Qnil; } else { - return rb_str_new(match.data(), match.size()); + return ENCODED_STR_NEW(match.data(), match.size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); } } } - static VALUE - re2_matchdata_named_match(const char* name, VALUE self) - { + static VALUE re2_matchdata_named_match(const char* name, VALUE self) { int idx; re2_matchdata *m; re2_pattern *p; map<string, int> groups; string name_as_string(name); @@ -254,13 +376,11 @@ * @example * m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123") * m["number"] #=> "123" * m[:number] #=> "123" */ - static VALUE - re2_matchdata_aref(int argc, VALUE *argv, VALUE self) - { + static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) { VALUE idx, rest; rb_scan_args(argc, argv, "11", &idx, &rest); if (TYPE(idx) == T_STRING) { return re2_matchdata_named_match(StringValuePtr(idx), self); @@ -276,13 +396,11 @@ /* * Returns the entire matched string. * * @return [String] the entire matched string */ - static VALUE - re2_matchdata_to_s(VALUE self) - { + static VALUE re2_matchdata_to_s(VALUE self) { return re2_matchdata_nth_match(0, self); } /* * Returns a printable version of the match. @@ -290,22 +408,20 @@ * @return [String] a printable version of the match * @example * m = RE2::Regexp.new('(\d+)').match("bob 123") * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">" */ - static VALUE - re2_matchdata_inspect(VALUE self) - { + static VALUE re2_matchdata_inspect(VALUE self) { int i; re2_matchdata *m; + re2_pattern *p; VALUE match, result; ostringstream output; Data_Get_Struct(self, re2_matchdata, m); + Data_Get_Struct(m->regexp, re2_pattern, p); - result = rb_str_new("#<RE2::MatchData", 16); - output << "#<RE2::MatchData"; for (i = 0; i < m->number_of_matches; i++) { output << " "; @@ -322,11 +438,12 @@ } } output << ">"; - result = rb_str_new(output.str().data(), output.str().length()); + result = ENCODED_STR_NEW(output.str().data(), output.str().length(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); return result; } /* @@ -337,13 +454,11 @@ * @param [String] pattern the pattern to compile * @param [Hash] options the options to compile a regexp with * @see RE2::Regexp.new * */ - static VALUE - re2_re2(int argc, VALUE *argv, VALUE self) - { + static VALUE re2_re2(int argc, VALUE *argv, VALUE self) { UNUSED(self); return rb_class_new_instance(argc, argv, re2_cRegexp); } /* @@ -356,11 +471,12 @@ * Returns a new {RE2::Regexp} object with a compiled version of * +pattern+ stored inside with the default options. * * @param [String] pattern the pattern to compile * @return [RE2::Regexp] an RE2::Regexp with the specified pattern - * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern + * @raise [NoMemoryError] if memory could not be allocated for the compiled + * pattern * * @overload initialize(pattern, options) * Returns a new {RE2::Regexp} object with a compiled version of * +pattern+ stored inside with the specified options. * @@ -378,15 +494,13 @@ * @option options [Boolean] :word_boundary (false) allow \b \B (word boundary and not) when in posix_syntax mode * @option options [Boolean] :one_line (false) ^ and $ only match beginning and end of text when in posix_syntax mode * @return [RE2::Regexp] an RE2::Regexp with the specified pattern and options * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern */ - static VALUE - re2_regexp_initialize(int argc, VALUE *argv, VALUE self) - { + static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) { VALUE pattern, options, utf8, posix_syntax, longest_match, log_errors, - max_mem, literal, never_nl, case_sensitive, perl_classes, + max_mem, literal, never_nl, case_sensitive, perl_classes, word_boundary, one_line; re2_pattern *p; rb_scan_args(argc, argv, "11", &pattern, &options); Data_Get_Struct(self, re2_pattern, p); @@ -451,13 +565,13 @@ one_line = rb_hash_aref(options, ID2SYM(id_one_line)); if (!NIL_P(one_line)) { re2_options.set_one_line(RTEST(one_line)); } - p->pattern = new (nothrow) RE2(StringValuePtr(pattern), re2_options); + p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options); } else { - p->pattern = new (nothrow) RE2(StringValuePtr(pattern)); + p->pattern = new(nothrow) RE2(StringValuePtr(pattern)); } if (p->pattern == 0) { rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object"); } @@ -471,22 +585,21 @@ * @return [String] a printable version of the regular expression * @example * re2 = RE2::Regexp.new("woo?") * re2.inspect #=> "#<RE2::Regexp /woo?/>" */ - static VALUE - re2_regexp_inspect(VALUE self) - { + static VALUE re2_regexp_inspect(VALUE self) { re2_pattern *p; VALUE result; ostringstream output; Data_Get_Struct(self, re2_pattern, p); output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>"; - result = rb_str_new(output.str().data(), output.str().length()); + result = ENCODED_STR_NEW(output.str().data(), output.str().length(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); return result; } /* @@ -495,16 +608,16 @@ * @return [String] a string version of the regular expression * @example * re2 = RE2::Regexp.new("woo?") * re2.to_s #=> "woo?" */ - static VALUE - re2_regexp_to_s(VALUE self) - { + static VALUE re2_regexp_to_s(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); - return rb_str_new(p->pattern->pattern().data(), p->pattern->pattern().size()); + return ENCODED_STR_NEW(p->pattern->pattern().data(), + p->pattern->pattern().size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); } /* * Returns whether or not the regular expression +re2+ * was compiled successfully or not. @@ -512,13 +625,11 @@ * @return [Boolean] whether or not compilation was successful * @example * re2 = RE2::Regexp.new("woo?") * re2.ok? #=> true */ - static VALUE - re2_regexp_ok(VALUE self) - { + static VALUE re2_regexp_ok(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->ok()); } @@ -529,13 +640,11 @@ * @return [Boolean] the utf8 option * @example * re2 = RE2::Regexp.new("woo?", :utf8 => true) * re2.utf8? #=> true */ - static VALUE - re2_regexp_utf8(VALUE self) - { + static VALUE re2_regexp_utf8(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().utf8()); } @@ -546,13 +655,11 @@ * @return [Boolean] the posix_syntax option * @example * re2 = RE2::Regexp.new("woo?", :posix_syntax => true) * re2.posix_syntax? #=> true */ - static VALUE - re2_regexp_posix_syntax(VALUE self) - { + static VALUE re2_regexp_posix_syntax(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().posix_syntax()); } @@ -563,13 +670,11 @@ * @return [Boolean] the longest_match option * @example * re2 = RE2::Regexp.new("woo?", :longest_match => true) * re2.longest_match? #=> true */ - static VALUE - re2_regexp_longest_match(VALUE self) - { + static VALUE re2_regexp_longest_match(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().longest_match()); } @@ -580,13 +685,11 @@ * @return [Boolean] the log_errors option * @example * re2 = RE2::Regexp.new("woo?", :log_errors => true) * re2.log_errors? #=> true */ - static VALUE - re2_regexp_log_errors(VALUE self) - { + static VALUE re2_regexp_log_errors(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().log_errors()); } @@ -597,13 +700,11 @@ * @return [Fixnum] the max_mem option * @example * re2 = RE2::Regexp.new("woo?", :max_mem => 1024) * re2.max_mem #=> 1024 */ - static VALUE - re2_regexp_max_mem(VALUE self) - { + static VALUE re2_regexp_max_mem(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return INT2FIX(p->pattern->options().max_mem()); } @@ -614,13 +715,11 @@ * @return [Boolean] the literal option * @example * re2 = RE2::Regexp.new("woo?", :literal => true) * re2.literal? #=> true */ - static VALUE - re2_regexp_literal(VALUE self) - { + static VALUE re2_regexp_literal(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().literal()); } @@ -631,13 +730,11 @@ * @return [Boolean] the never_nl option * @example * re2 = RE2::Regexp.new("woo?", :never_nl => true) * re2.never_nl? #=> true */ - static VALUE - re2_regexp_never_nl(VALUE self) - { + static VALUE re2_regexp_never_nl(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().never_nl()); } @@ -648,13 +745,11 @@ * @return [Boolean] the case_sensitive option * @example * re2 = RE2::Regexp.new("woo?", :case_sensitive => true) * re2.case_sensitive? #=> true */ - static VALUE - re2_regexp_case_sensitive(VALUE self) - { + static VALUE re2_regexp_case_sensitive(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().case_sensitive()); } @@ -666,13 +761,11 @@ * @example * re2 = RE2::Regexp.new("woo?", :case_sensitive => true) * re2.case_insensitive? #=> false * re2.casefold? #=> false */ - static VALUE - re2_regexp_case_insensitive(VALUE self) - { + static VALUE re2_regexp_case_insensitive(VALUE self) { return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue); } /* * Returns whether or not the regular expression +re2+ @@ -681,13 +774,11 @@ * @return [Boolean] the perl_classes option * @example * re2 = RE2::Regexp.new("woo?", :perl_classes => true) * re2.perl_classes? #=> true */ - static VALUE - re2_regexp_perl_classes(VALUE self) - { + static VALUE re2_regexp_perl_classes(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().perl_classes()); } @@ -698,13 +789,11 @@ * @return [Boolean] the word_boundary option * @example * re2 = RE2::Regexp.new("woo?", :word_boundary => true) * re2.word_boundary? #=> true */ - static VALUE - re2_regexp_word_boundary(VALUE self) - { + static VALUE re2_regexp_word_boundary(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().word_boundary()); } @@ -715,13 +804,11 @@ * @return [Boolean] the one_line option * @example * re2 = RE2::Regexp.new("woo?", :one_line => true) * re2.one_line? #=> true */ - static VALUE - re2_regexp_one_line(VALUE self) - { + static VALUE re2_regexp_one_line(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return BOOL2RUBY(p->pattern->options().one_line()); } @@ -729,13 +816,11 @@ * If the RE2 could not be created properly, returns an * error string otherwise returns nil. * * @return [String, nil] the error string or nil */ - static VALUE - re2_regexp_error(VALUE self) - { + static VALUE re2_regexp_error(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); if (p->pattern->ok()) { return Qnil; } else { @@ -747,32 +832,30 @@ * If the RE2 could not be created properly, returns * the offending portion of the regexp otherwise returns nil. * * @return [String, nil] the offending portion of the regexp or nil */ - static VALUE - re2_regexp_error_arg(VALUE self) - { + static VALUE re2_regexp_error_arg(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); if (p->pattern->ok()) { return Qnil; } else { - return rb_str_new(p->pattern->error_arg().data(), p->pattern->error_arg().size()); + return ENCODED_STR_NEW(p->pattern->error_arg().data(), + p->pattern->error_arg().size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"); } } /* * Returns the program size, a very approximate measure * of a regexp's "cost". Larger numbers are more expensive * than smaller numbers. * * @return [Fixnum] the regexp "cost" */ - static VALUE - re2_regexp_program_size(VALUE self) - { + static VALUE re2_regexp_program_size(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return INT2FIX(p->pattern->ProgramSize()); } @@ -780,13 +863,11 @@ * Returns a hash of the options currently set for * +re2+. * * @return [Hash] the options */ - static VALUE - re2_regexp_options(VALUE self) - { + static VALUE re2_regexp_options(VALUE self) { VALUE options; re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); options = rb_hash_new(); @@ -835,13 +916,11 @@ * wasn't valid on construction. The overall match ($0) does not * count: if the regexp is "(a)(b)", returns 2. * * @return [Fixnum] the number of capturing subpatterns */ - static VALUE - re2_regexp_number_of_capturing_groups(VALUE self) - { + static VALUE re2_regexp_number_of_capturing_groups(VALUE self) { re2_pattern *p; Data_Get_Struct(self, re2_pattern, p); return INT2FIX(p->pattern->NumberOfCapturingGroups()); } @@ -849,13 +928,11 @@ /* * Returns a hash of names to capturing indices of groups. * * @return [Hash] a hash of names to capturing indices */ - static VALUE - re2_regexp_named_capturing_groups(VALUE self) - { + static VALUE re2_regexp_named_capturing_groups(VALUE self) { VALUE capturing_groups; re2_pattern *p; map<string, int> groups; map<string, int>::iterator iterator; @@ -863,11 +940,12 @@ groups = p->pattern->NamedCapturingGroups(); capturing_groups = rb_hash_new(); for (iterator = groups.begin(); iterator != groups.end(); iterator++) { rb_hash_aset(capturing_groups, - rb_str_new(iterator->first.data(), iterator->first.size()), + ENCODED_STR_NEW(iterator->first.data(), iterator->first.size(), + p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"), INT2FIX(iterator->second)); } return capturing_groups; } @@ -914,13 +992,11 @@ * @example * r = RE2::Regexp.new('w(o)(o)') * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o"> * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil> */ - static VALUE - re2_regexp_match(int argc, VALUE *argv, VALUE self) - { + static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) { int n; bool matched; re2_pattern *p; re2_matchdata *m; VALUE text, number_of_matches, matchdata; @@ -934,31 +1010,35 @@ } else { n = p->pattern->NumberOfCapturingGroups(); } if (n == 0) { - matched = match(p->pattern, StringValuePtr(text), 0, (int)RSTRING_LEN(text), RE2::UNANCHORED, 0, 0); + matched = match(p->pattern, StringValuePtr(text), 0, + static_cast<int>(RSTRING_LEN(text)), RE2::UNANCHORED, 0, 0); return BOOL2RUBY(matched); } else { /* Because match returns the whole match as well. */ n += 1; matchdata = rb_class_new_instance(0, 0, re2_cMatchData); Data_Get_Struct(matchdata, re2_matchdata, m); - m->matches = new (nothrow) re2::StringPiece[n]; + m->matches = new(nothrow) re2::StringPiece[n]; m->regexp = self; m->text = rb_str_dup(text); rb_str_freeze(m->text); if (m->matches == 0) { - rb_raise(rb_eNoMemError, "not enough memory to allocate StringPieces for matches"); + rb_raise(rb_eNoMemError, + "not enough memory to allocate StringPieces for matches"); } m->number_of_matches = n; - matched = match(p->pattern, StringValuePtr(text), 0, (int)RSTRING_LEN(text), RE2::UNANCHORED, m->matches, n); + matched = match(p->pattern, StringValuePtr(text), 0, + static_cast<int>(RSTRING_LEN(text)), + RE2::UNANCHORED, m->matches, n); if (matched) { return matchdata; } else { return Qnil; @@ -970,22 +1050,43 @@ * Returns true or false to indicate a successful match. * Equivalent to +re2.match(text, 0)+. * * @return [Boolean] whether the match was successful */ - static VALUE - re2_regexp_match_query(VALUE self, VALUE text) - { + static VALUE re2_regexp_match_query(VALUE self, VALUE text) { VALUE argv[2]; argv[0] = text; argv[1] = INT2FIX(0); return re2_regexp_match(2, argv, self); } /* - * Replaces the first occurrence +pattern+ in +str+ with + * Returns a {RE2::Consumer} for scanning the given text incrementally. + * + * @example + * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz") + */ + static VALUE re2_regexp_consume(VALUE self, VALUE text) { + re2_pattern *p; + re2_consumer *c; + VALUE consumer; + re2::StringPiece input(RSTRING_PTR(text)); + + Data_Get_Struct(self, re2_pattern, p); + consumer = rb_class_new_instance(0, 0, re2_cConsumer); + Data_Get_Struct(consumer, re2_consumer, c); + c->input = input; + c->regexp = self; + c->text = text; + c->argc = p->pattern->NumberOfCapturingGroups(); + + return consumer; + } + + /* + * Replaces the first occurrence +pattern+ in +str+ with * +rewrite+ <i>in place</i>. * * @param [String] str the string to modify * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced * @param [String] rewrite the string to replace with @@ -994,15 +1095,14 @@ * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there" * re2 = RE2.new("hel+o") * RE2.Replace("hello there", re2, "yo") #=> "yo there" * text = "Good morning" * RE2.Replace(text, "morn", "even") #=> "Good evening" - * text #=> "Good evening" + * text #=> "Good evening" */ - static VALUE - re2_Replace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite) - { + static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern, + VALUE rewrite) { /* Look out for frozen strings. */ rb_check_frozen(str); UNUSED(self); @@ -1015,11 +1115,12 @@ /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { Data_Get_Struct(pattern, re2_pattern, p); RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite)); } else { - RE2::Replace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite)); + RE2::Replace(&str_as_string, StringValuePtr(pattern), + StringValuePtr(rewrite)); } /* Save the replacement as a VALUE. */ repl = rb_str_new(str_as_string.data(), str_as_string.size()); @@ -1031,11 +1132,11 @@ return str; } /* - * Replaces every occurrence of +pattern+ in +str+ with + * Replaces every occurrence of +pattern+ in +str+ with * +rewrite+ <i>in place</i>. * * @param [String] str the string to modify * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced * @param [String] rewrite the string to replace with @@ -1046,13 +1147,12 @@ * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps" * text = "Good morning" * RE2.GlobalReplace(text, "o", "ee") #=> "Geeeed meerning" * text #=> "Geeeed meerning" */ - static VALUE - re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite) - { + static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern, + VALUE rewrite) { /* Look out for frozen strings. */ rb_check_frozen(str); UNUSED(self); @@ -1065,11 +1165,12 @@ /* Do the replacement. */ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) { Data_Get_Struct(pattern, re2_pattern, p); RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite)); } else { - RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite)); + RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern), + StringValuePtr(rewrite)); } /* Save the replacement as a VALUE. */ repl = rb_str_new(str_as_string.data(), str_as_string.size()); @@ -1090,73 +1191,124 @@ * @param [String] unquoted the unquoted string * @return [String] the escaped string * @example * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?" */ - static VALUE - re2_QuoteMeta(VALUE self, VALUE unquoted) - { + static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) { UNUSED(self); string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted)); return rb_str_new(quoted_string.data(), quoted_string.size()); } - void - Init_re2() - { + void Init_re2(void) { re2_mRE2 = rb_define_module("RE2"); re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject); re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject); + re2_cConsumer = rb_define_class_under(re2_mRE2, "Consumer", rb_cObject); rb_define_alloc_func(re2_cRegexp, (VALUE (*)(VALUE))re2_regexp_allocate); - rb_define_alloc_func(re2_cMatchData, (VALUE (*)(VALUE))re2_matchdata_allocate); + rb_define_alloc_func(re2_cMatchData, + (VALUE (*)(VALUE))re2_matchdata_allocate); + rb_define_alloc_func(re2_cConsumer, + (VALUE (*)(VALUE))re2_consumer_allocate); - rb_define_method(re2_cMatchData, "string", RUBY_METHOD_FUNC(re2_matchdata_string), 0); - rb_define_method(re2_cMatchData, "regexp", RUBY_METHOD_FUNC(re2_matchdata_regexp), 0); - rb_define_method(re2_cMatchData, "to_a", RUBY_METHOD_FUNC(re2_matchdata_to_a), 0); - rb_define_method(re2_cMatchData, "size", RUBY_METHOD_FUNC(re2_matchdata_size), 0); - rb_define_method(re2_cMatchData, "length", RUBY_METHOD_FUNC(re2_matchdata_size), 0); - rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref), -1); - rb_define_method(re2_cMatchData, "to_s", RUBY_METHOD_FUNC(re2_matchdata_to_s), 0); - rb_define_method(re2_cMatchData, "inspect", RUBY_METHOD_FUNC(re2_matchdata_inspect), 0); + rb_define_method(re2_cMatchData, "string", + RUBY_METHOD_FUNC(re2_matchdata_string), 0); + rb_define_method(re2_cMatchData, "regexp", + RUBY_METHOD_FUNC(re2_matchdata_regexp), 0); + rb_define_method(re2_cMatchData, "to_a", + RUBY_METHOD_FUNC(re2_matchdata_to_a), 0); + rb_define_method(re2_cMatchData, "size", + RUBY_METHOD_FUNC(re2_matchdata_size), 0); + rb_define_method(re2_cMatchData, "length", + RUBY_METHOD_FUNC(re2_matchdata_size), 0); + rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref), + -1); rb_define_method(re2_cMatchData, "to_s", + RUBY_METHOD_FUNC(re2_matchdata_to_s), 0); + rb_define_method(re2_cMatchData, "inspect", + RUBY_METHOD_FUNC(re2_matchdata_inspect), 0); - rb_define_method(re2_cRegexp, "initialize", RUBY_METHOD_FUNC(re2_regexp_initialize), -1); + rb_define_method(re2_cConsumer, "string", + RUBY_METHOD_FUNC(re2_consumer_string), 0); + rb_define_method(re2_cConsumer, "regexp", + RUBY_METHOD_FUNC(re2_consumer_regexp), 0); + rb_define_method(re2_cConsumer, "consume", + RUBY_METHOD_FUNC(re2_consumer_consume), 0); + rb_define_method(re2_cConsumer, "rewind", + RUBY_METHOD_FUNC(re2_consumer_rewind), 0); + + rb_define_method(re2_cRegexp, "initialize", + RUBY_METHOD_FUNC(re2_regexp_initialize), -1); rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0); - rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error), 0); - rb_define_method(re2_cRegexp, "error_arg", RUBY_METHOD_FUNC(re2_regexp_error_arg), 0); - rb_define_method(re2_cRegexp, "program_size", RUBY_METHOD_FUNC(re2_regexp_program_size), 0); - rb_define_method(re2_cRegexp, "options", RUBY_METHOD_FUNC(re2_regexp_options), 0); - rb_define_method(re2_cRegexp, "number_of_capturing_groups", RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0); - rb_define_method(re2_cRegexp, "named_capturing_groups", RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0); - rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match), -1); - rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_query), 1); - rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_query), 1); - rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_query), 1); + rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error), + 0); + rb_define_method(re2_cRegexp, "error_arg", + RUBY_METHOD_FUNC(re2_regexp_error_arg), 0); + rb_define_method(re2_cRegexp, "program_size", + RUBY_METHOD_FUNC(re2_regexp_program_size), 0); + rb_define_method(re2_cRegexp, "options", + RUBY_METHOD_FUNC(re2_regexp_options), 0); + rb_define_method(re2_cRegexp, "number_of_capturing_groups", + RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0); + rb_define_method(re2_cRegexp, "named_capturing_groups", + RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0); + rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match), + -1); + rb_define_method(re2_cRegexp, "match?", + RUBY_METHOD_FUNC(re2_regexp_match_query), 1); + rb_define_method(re2_cRegexp, "=~", + RUBY_METHOD_FUNC(re2_regexp_match_query), 1); + rb_define_method(re2_cRegexp, "===", + RUBY_METHOD_FUNC(re2_regexp_match_query), 1); + rb_define_method(re2_cRegexp, "consume", RUBY_METHOD_FUNC(re2_regexp_consume), 1); rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); - rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); - rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); - rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s), 0); - rb_define_method(re2_cRegexp, "inspect", RUBY_METHOD_FUNC(re2_regexp_inspect), 0); - rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8), 0); - rb_define_method(re2_cRegexp, "posix_syntax?", RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0); - rb_define_method(re2_cRegexp, "longest_match?", RUBY_METHOD_FUNC(re2_regexp_longest_match), 0); - rb_define_method(re2_cRegexp, "log_errors?", RUBY_METHOD_FUNC(re2_regexp_log_errors), 0); - rb_define_method(re2_cRegexp, "max_mem", RUBY_METHOD_FUNC(re2_regexp_max_mem), 0); - rb_define_method(re2_cRegexp, "literal?", RUBY_METHOD_FUNC(re2_regexp_literal), 0); - rb_define_method(re2_cRegexp, "never_nl?", RUBY_METHOD_FUNC(re2_regexp_never_nl), 0); - rb_define_method(re2_cRegexp, "case_sensitive?", RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0); - rb_define_method(re2_cRegexp, "case_insensitive?", RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0); - rb_define_method(re2_cRegexp, "casefold?", RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0); - rb_define_method(re2_cRegexp, "perl_classes?", RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0); - rb_define_method(re2_cRegexp, "word_boundary?", RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0); - rb_define_method(re2_cRegexp, "one_line?", RUBY_METHOD_FUNC(re2_regexp_one_line), 0); + rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s), + 0); + rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s), + 0); + rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s), + 0); + rb_define_method(re2_cRegexp, "inspect", + RUBY_METHOD_FUNC(re2_regexp_inspect), 0); + rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8), + 0); + rb_define_method(re2_cRegexp, "posix_syntax?", + RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0); + rb_define_method(re2_cRegexp, "longest_match?", + RUBY_METHOD_FUNC(re2_regexp_longest_match), 0); + rb_define_method(re2_cRegexp, "log_errors?", + RUBY_METHOD_FUNC(re2_regexp_log_errors), 0); + rb_define_method(re2_cRegexp, "max_mem", + RUBY_METHOD_FUNC(re2_regexp_max_mem), 0); + rb_define_method(re2_cRegexp, "literal?", + RUBY_METHOD_FUNC(re2_regexp_literal), 0); + rb_define_method(re2_cRegexp, "never_nl?", + RUBY_METHOD_FUNC(re2_regexp_never_nl), 0); + rb_define_method(re2_cRegexp, "case_sensitive?", + RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0); + rb_define_method(re2_cRegexp, "case_insensitive?", + RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0); + rb_define_method(re2_cRegexp, "casefold?", + RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0); + rb_define_method(re2_cRegexp, "perl_classes?", + RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0); + rb_define_method(re2_cRegexp, "word_boundary?", + RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0); + rb_define_method(re2_cRegexp, "one_line?", + RUBY_METHOD_FUNC(re2_regexp_one_line), 0); - rb_define_module_function(re2_mRE2, "Replace", RUBY_METHOD_FUNC(re2_Replace), 3); - rb_define_module_function(re2_mRE2, "GlobalReplace", RUBY_METHOD_FUNC(re2_GlobalReplace), 3); - rb_define_module_function(re2_mRE2, "QuoteMeta", RUBY_METHOD_FUNC(re2_QuoteMeta), 1); - rb_define_singleton_method(re2_cRegexp, "escape", RUBY_METHOD_FUNC(re2_QuoteMeta), 1); - rb_define_singleton_method(re2_cRegexp, "quote", RUBY_METHOD_FUNC(re2_QuoteMeta), 1); - rb_define_singleton_method(re2_cRegexp, "compile", RUBY_METHOD_FUNC(rb_class_new_instance), -1); + rb_define_module_function(re2_mRE2, "Replace", + RUBY_METHOD_FUNC(re2_Replace), 3); + rb_define_module_function(re2_mRE2, "GlobalReplace", + RUBY_METHOD_FUNC(re2_GlobalReplace), 3); + rb_define_module_function(re2_mRE2, "QuoteMeta", + RUBY_METHOD_FUNC(re2_QuoteMeta), 1); + rb_define_singleton_method(re2_cRegexp, "escape", + RUBY_METHOD_FUNC(re2_QuoteMeta), 1); + rb_define_singleton_method(re2_cRegexp, "quote", + RUBY_METHOD_FUNC(re2_QuoteMeta), 1); + rb_define_singleton_method(re2_cRegexp, "compile", + RUBY_METHOD_FUNC(rb_class_new_instance), -1); rb_define_global_function("RE2", RUBY_METHOD_FUNC(re2_re2), -1); /* Create the symbols used in options. */ id_utf8 = rb_intern("utf8");