ext/re2/re2.cc in re2-0.5.0 vs ext/re2/re2.cc in re2-0.6.0.pre
- old
+ new
@@ -1,22 +1,39 @@
/*
* re2 (http://github.com/mudge/re2)
* Ruby bindings to re2, an "efficient, principled regular expression library"
*
- * Copyright (c) 2010-2012, Paul Mucur (http://mudge.name)
+ * Copyright (c) 2010-2013, Paul Mucur (http://mudge.name)
* Released under the BSD Licence, please see LICENSE.txt
*/
#include <re2/re2.h>
+#include <ruby.h>
#include <string>
#include <sstream>
-using namespace std;
+#include <vector>
+using std::string;
+using std::ostringstream;
+using std::nothrow;
+using std::map;
+using std::vector;
extern "C" {
+ #ifdef HAVE_RUBY_ENCODING_H
+ #include <ruby/encoding.h>
+ /*
+ * Builds a Ruby String from +length+ bytes starting at +str+ and associates
+ * it with the encoding named by +encoding+ (a C string such as "UTF-8").
+ *
+ * Each argument is parenthesised so that an expression argument, e.g. a
+ * ?: choosing the encoding name, is evaluated as a unit before it is cast
+ * or passed on. The encoding name is handed to rb_enc_find_index without
+ * a cast so that a wrongly typed argument is reported by the compiler.
+ */
+ #define ENCODED_STR_NEW(str, length, encoding) \
+ ({ \
+ VALUE _string = rb_str_new((const char *)(str), (long)(length)); \
+ int _enc = rb_enc_find_index(encoding); \
+ rb_enc_associate_index(_string, _enc); \
+ _string; \
+ })
+ #else
+ /* Without ruby/encoding.h the +encoding+ argument is ignored. */
+ #define ENCODED_STR_NEW(str, length, encoding) \
+ rb_str_new((const char *)(str), (long)(length))
+ #endif
- #include <ruby.h>
-
#define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
#define UNUSED(x) ((void)x)
#ifndef RSTRING_LEN
#define RSTRING_LEN(x) (RSTRING(x)->len)
@@ -42,76 +59,170 @@
re2::StringPiece *matches;
int number_of_matches;
VALUE regexp, text;
} re2_matchdata;
- VALUE re2_mRE2, re2_cRegexp, re2_cMatchData;
+ typedef struct {
+ /* Remaining input; RE2::FindAndConsumeN is given a pointer to it. */
+ re2::StringPiece input;
+ /* Number of capture arguments passed to RE2::FindAndConsumeN. */
+ int argc;
+ /* The RE2::Regexp object that created this consumer. */
+ VALUE regexp;
+ /* The Ruby String being scanned. */
+ VALUE text;
+ } re2_consumer;
+ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cConsumer;
+
/* Symbols used in RE2 options. */
static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
id_max_mem, id_literal, id_never_nl, id_case_sensitive,
id_perl_classes, id_word_boundary, id_one_line;
- void re2_matchdata_mark(re2_matchdata* self)
- {
+ void re2_matchdata_mark(re2_matchdata* self) {
/* GC mark hook: mark both Ruby objects referenced from the struct. */
const VALUE referenced[2] = { self->regexp, self->text };
for (int slot = 0; slot < 2; slot++) {
rb_gc_mark(referenced[slot]);
}
}
- void re2_matchdata_free(re2_matchdata* self)
- {
+ void re2_matchdata_free(re2_matchdata* self) {
/*
* GC free hook for the re2_matchdata struct.
*
* delete[] accepts a null pointer, so no guard is needed for a struct
* whose match array was never allocated.
*/
delete[] self->matches;
/*
* The struct is created through Data_Make_Struct (see
* re2_matchdata_allocate), i.e. by Ruby's allocator, so it is released
* with Ruby's xfree rather than the C library's free.
*/
xfree(self);
}
- void
- re2_regexp_free(re2_pattern* self)
- {
+ void re2_consumer_mark(re2_consumer* self) {
+ /* GC mark hook: mark the regexp and the text held by the consumer. */
+ const VALUE owner = self->regexp;
+ const VALUE scanned = self->text;
+ rb_gc_mark(owner);
+ rb_gc_mark(scanned);
+ }
+
+ void re2_consumer_free(re2_consumer* self) {
+ /*
+ * GC free hook. The struct is created through Data_Make_Struct (see
+ * re2_consumer_allocate), i.e. by Ruby's allocator, so it is released
+ * with Ruby's xfree rather than the C library's free.
+ */
+ xfree(self);
+ }
+
+ void re2_regexp_free(re2_pattern* self) {
/*
* GC free hook for the re2_pattern struct.
*
* delete accepts a null pointer, so no guard is needed for a struct whose
* pattern was never compiled.
*/
delete self->pattern;
/*
* The struct is created through Data_Make_Struct (see
* re2_regexp_allocate), i.e. by Ruby's allocator, so it is released with
* Ruby's xfree rather than the C library's free.
*/
xfree(self);
}
- static VALUE
- re2_matchdata_allocate(VALUE klass)
- {
+ static VALUE re2_matchdata_allocate(VALUE klass) {
/*
* Allocation hook: wraps a newly allocated re2_matchdata in an object of
* klass, registering re2_matchdata_mark and re2_matchdata_free as its
* GC mark and free functions.
*/
re2_matchdata *m;
- return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark, re2_matchdata_free, m);
+ return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark,
+ re2_matchdata_free, m);
}
+ static VALUE re2_consumer_allocate(VALUE klass) {
+ /*
+ * Allocation hook: wraps a newly allocated re2_consumer in an object of
+ * klass with re2_consumer_mark / re2_consumer_free as its GC functions.
+ */
+ re2_consumer *consumer_data;
+ VALUE wrapped = Data_Make_Struct(klass, re2_consumer, re2_consumer_mark,
+ re2_consumer_free, consumer_data);
+ return wrapped;
+ }
+
/*
* Returns a frozen copy of the string passed into +match+.
*
* @return [String] a frozen copy of the passed string.
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
* m.string #=> "bob 123"
*/
- static VALUE
- re2_matchdata_string(VALUE self)
- {
+ static VALUE re2_matchdata_string(VALUE self) {
/* Hand back the text stored in the match data struct. */
re2_matchdata *matchdata;
Data_Get_Struct(self, re2_matchdata, matchdata);
VALUE stored_text = matchdata->text;
return stored_text;
}
/*
+ * Returns the string passed into the consumer.
+ *
+ * @return [String] the original string.
+ * @example
+ * c = RE2::Regexp.new('(\d+)').consume("foo")
+ * c.string #=> "foo"
+ */
+ static VALUE re2_consumer_string(VALUE self) {
+ /* Hand back the text stored in the consumer struct. */
+ re2_consumer *consumer_data;
+ Data_Get_Struct(self, re2_consumer, consumer_data);
+
+ VALUE stored_text = consumer_data->text;
+ return stored_text;
+ }
+
+ /*
+ * Rewind the consumer to the start of the string.
+ *
+ * The input is rebuilt from the string's pointer and byte length, so text
+ * containing NUL bytes is scanned in full again.
+ *
+ * @return [RE2::Consumer] the consumer itself
+ * @example
+ * c = RE2::Regexp.new('(\d+)').consume("1 2 3")
+ * e = c.to_enum
+ * e.next #=> ["1"]
+ * e.next #=> ["2"]
+ * c.rewind
+ * e.next #=> ["1"]
+ */
+ static VALUE re2_consumer_rewind(VALUE self) {
+ re2_consumer *c;
+ Data_Get_Struct(self, re2_consumer, c);
+
+ /*
+ * Pass the length explicitly: a StringPiece built from the pointer alone
+ * measures the text with strlen and stops at the first NUL byte.
+ */
+ c->input = re2::StringPiece(RSTRING_PTR(c->text),
+ static_cast<int>(RSTRING_LEN(c->text)));
+
+ return self;
+ }
+
+ /*
+ * Scan the given text incrementally for matches, returning an array of
+ * matches on each subsequent call. Returns nil if no matches are found.
+ *
+ * The array holds one element per capturing group of the regexp; a group
+ * whose captured text is empty is reported as nil. A regexp without
+ * capturing groups yields an empty array for each match.
+ *
+ * @return [Array<String, nil>, nil] the matches, or nil when nothing matched
+ * @example
+ * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz")
+ * c.consume #=> ["Foo"]
+ * c.consume #=> ["bar"]
+ */
+ static VALUE re2_consumer_consume(VALUE self) {
+ int i;
+ re2_pattern *p;
+ re2_consumer *c;
+ VALUE result;
+
+ Data_Get_Struct(self, re2_consumer, c);
+ Data_Get_Struct(c->regexp, re2_pattern, p);
+
+ /*
+ * c->argc comes from NumberOfCapturingGroups(), which reports -1 for a
+ * regexp that was not valid on construction. Clamp it so the vectors
+ * below are never constructed with a negative size.
+ */
+ const int argc = c->argc > 0 ? c->argc : 0;
+
+ vector<RE2::Arg> argv(argc);
+ vector<RE2::Arg*> args(argc);
+ vector<string> matches(argc);
+
+ for (i = 0; i < argc; i++) {
+ argv[i] = &matches[i];
+ args[i] = &argv[i];
+ }
+
+ /*
+ * Indexing an empty vector is undefined behaviour, so RE2 is given a null
+ * argument list when there is nothing to capture.
+ */
+ RE2::Arg **arg_list = argc > 0 ? &args[0] : NULL;
+
+ if (RE2::FindAndConsumeN(&c->input, *p->pattern, arg_list, argc)) {
+ result = rb_ary_new2(argc);
+ for (i = 0; i < argc; i++) {
+ if (matches[i].empty()) {
+ rb_ary_push(result, Qnil);
+ } else {
+ rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(),
+ matches[i].size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"));
+ }
+ }
+ } else {
+ result = Qnil;
+ }
+
+ return result;
+ }
+
+ /*
* Returns the number of elements in the match array (including nils).
*
* @return [Fixnum] the number of elements
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
* m.size #=> 2
* m.length #=> 2
*/
- static VALUE
- re2_matchdata_size(VALUE self)
- {
+ static VALUE re2_matchdata_size(VALUE self) {
/* Report the stored match count as a Fixnum. */
re2_matchdata *matchdata;
Data_Get_Struct(self, re2_matchdata, matchdata);
const int count = matchdata->number_of_matches;
return INT2FIX(count);
}
@@ -122,21 +233,32 @@
* @return [RE2::Regexp] the regexp used in the match
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
* m.regexp #=> #<RE2::Regexp /(\d+)/>
*/
- static VALUE
- re2_matchdata_regexp(VALUE self)
- {
+ static VALUE re2_matchdata_regexp(VALUE self) {
/* Hand back the regexp object stored in the match data struct. */
re2_matchdata *matchdata;
Data_Get_Struct(self, re2_matchdata, matchdata);
VALUE stored_regexp = matchdata->regexp;
return stored_regexp;
}
- static VALUE
- re2_regexp_allocate(VALUE klass)
- {
+ /*
+ * Returns the {RE2::Regexp} used in the consumer.
+ *
+ * @return [RE2::Regexp] the regexp used in the consumer
+ * @example
+ * c = RE2::Regexp.new('(\d+)').consume("bob 123")
+ * c.regexp #=> #<RE2::Regexp /(\d+)/>
+ */
+ static VALUE re2_consumer_regexp(VALUE self) {
+ /* Hand back the regexp object stored in the consumer struct. */
+ re2_consumer *consumer_data;
+ Data_Get_Struct(self, re2_consumer, consumer_data);
+
+ VALUE stored_regexp = consumer_data->regexp;
+ return stored_regexp;
+ }
+
+ static VALUE re2_regexp_allocate(VALUE klass) {
/*
* Allocation hook: wraps a newly allocated re2_pattern in an object of
* klass. No mark function is registered; re2_regexp_free is the free
* function.
*/
re2_pattern *pattern_data;
VALUE wrapped = Data_Make_Struct(klass, re2_pattern, 0, re2_regexp_free,
pattern_data);
return wrapped;
}
/*
@@ -145,57 +267,57 @@
* @return [Array<String, nil>] the array of matches
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
* m.to_a #=> ["123", "123"]
*/
- static VALUE
- re2_matchdata_to_a(VALUE self)
- {
+ static VALUE re2_matchdata_to_a(VALUE self) {
/*
* Builds a Ruby array with one element per stored match: nil for a match
* whose StringPiece is empty, otherwise a new String with its bytes.
*/
int i;
re2_matchdata *m;
+ re2_pattern *p;
re2::StringPiece match;
VALUE array;
Data_Get_Struct(self, re2_matchdata, m);
+ Data_Get_Struct(m->regexp, re2_pattern, p);
array = rb_ary_new2(m->number_of_matches);
for (i = 0; i < m->number_of_matches; i++) {
if (m->matches[i].empty()) {
rb_ary_push(array, Qnil);
} else {
match = m->matches[i];
- rb_ary_push(array, rb_str_new(match.data(), match.size()));
/* The encoding name passed on is chosen from the regexp's utf8 option. */
+ rb_ary_push(array, ENCODED_STR_NEW(match.data(), match.size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"));
}
}
return array;
}
- static VALUE
- re2_matchdata_nth_match(int nth, VALUE self)
- {
+ static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
/*
* Returns the match at index nth as a new String. Returns nil when nth is
* outside 0...number_of_matches or when that match is empty.
*/
re2_matchdata *m;
+ re2_pattern *p;
re2::StringPiece match;
Data_Get_Struct(self, re2_matchdata, m);
+ Data_Get_Struct(m->regexp, re2_pattern, p);
if (nth < 0 || nth >= m->number_of_matches) {
return Qnil;
} else {
match = m->matches[nth];
if (match.empty()) {
return Qnil;
} else {
- return rb_str_new(match.data(), match.size());
/* The encoding name passed on is chosen from the regexp's utf8 option. */
+ return ENCODED_STR_NEW(match.data(), match.size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1");
}
}
}
- static VALUE
- re2_matchdata_named_match(const char* name, VALUE self)
- {
+ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
int idx;
re2_matchdata *m;
re2_pattern *p;
map<string, int> groups;
string name_as_string(name);
@@ -254,13 +376,11 @@
* @example
* m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123")
* m["number"] #=> "123"
* m[:number] #=> "123"
*/
- static VALUE
- re2_matchdata_aref(int argc, VALUE *argv, VALUE self)
- {
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
VALUE idx, rest;
rb_scan_args(argc, argv, "11", &idx, &rest);
if (TYPE(idx) == T_STRING) {
return re2_matchdata_named_match(StringValuePtr(idx), self);
@@ -276,13 +396,11 @@
/*
* Returns the entire matched string.
*
* @return [String] the entire matched string
*/
- static VALUE
- re2_matchdata_to_s(VALUE self)
- {
+ static VALUE re2_matchdata_to_s(VALUE self) {
/* Index 0 is the entry for the entire match. */
const int whole_match = 0;
return re2_matchdata_nth_match(whole_match, self);
}
/*
* Returns a printable version of the match.
@@ -290,22 +408,20 @@
* @return [String] a printable version of the match
* @example
* m = RE2::Regexp.new('(\d+)').match("bob 123")
* m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
*/
- static VALUE
- re2_matchdata_inspect(VALUE self)
- {
+ static VALUE re2_matchdata_inspect(VALUE self) {
int i;
re2_matchdata *m;
+ re2_pattern *p;
VALUE match, result;
ostringstream output;
Data_Get_Struct(self, re2_matchdata, m);
+ Data_Get_Struct(m->regexp, re2_pattern, p);
- result = rb_str_new("#<RE2::MatchData", 16);
-
output << "#<RE2::MatchData";
for (i = 0; i < m->number_of_matches; i++) {
output << " ";
@@ -322,11 +438,12 @@
}
}
output << ">";
- result = rb_str_new(output.str().data(), output.str().length());
+ result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1");
return result;
}
/*
@@ -337,13 +454,11 @@
* @param [String] pattern the pattern to compile
* @param [Hash] options the options to compile a regexp with
* @see RE2::Regexp.new
*
*/
- static VALUE
- re2_re2(int argc, VALUE *argv, VALUE self)
- {
+ static VALUE re2_re2(int argc, VALUE *argv, VALUE self) {
/* Forward all arguments to RE2::Regexp.new; the receiver is not used. */
VALUE regexp = rb_class_new_instance(argc, argv, re2_cRegexp);
UNUSED(self);
return regexp;
}
/*
@@ -356,11 +471,12 @@
* Returns a new {RE2::Regexp} object with a compiled version of
* +pattern+ stored inside with the default options.
*
* @param [String] pattern the pattern to compile
* @return [RE2::Regexp] an RE2::Regexp with the specified pattern
- * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled
+ * pattern
*
* @overload initialize(pattern, options)
* Returns a new {RE2::Regexp} object with a compiled version of
* +pattern+ stored inside with the specified options.
*
@@ -378,15 +494,13 @@
* @option options [Boolean] :word_boundary (false) allow \b \B (word boundary and not) when in posix_syntax mode
* @option options [Boolean] :one_line (false) ^ and $ only match beginning and end of text when in posix_syntax mode
* @return [RE2::Regexp] an RE2::Regexp with the specified pattern and options
* @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
*/
- static VALUE
- re2_regexp_initialize(int argc, VALUE *argv, VALUE self)
- {
+ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
VALUE pattern, options, utf8, posix_syntax, longest_match, log_errors,
- max_mem, literal, never_nl, case_sensitive, perl_classes,
+ max_mem, literal, never_nl, case_sensitive, perl_classes,
word_boundary, one_line;
re2_pattern *p;
rb_scan_args(argc, argv, "11", &pattern, &options);
Data_Get_Struct(self, re2_pattern, p);
@@ -451,13 +565,13 @@
one_line = rb_hash_aref(options, ID2SYM(id_one_line));
if (!NIL_P(one_line)) {
re2_options.set_one_line(RTEST(one_line));
}
- p->pattern = new (nothrow) RE2(StringValuePtr(pattern), re2_options);
+ p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options);
} else {
- p->pattern = new (nothrow) RE2(StringValuePtr(pattern));
+ p->pattern = new(nothrow) RE2(StringValuePtr(pattern));
}
if (p->pattern == 0) {
rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object");
}
@@ -471,22 +585,21 @@
* @return [String] a printable version of the regular expression
* @example
* re2 = RE2::Regexp.new("woo?")
* re2.inspect #=> "#<RE2::Regexp /woo?/>"
*/
- static VALUE
- re2_regexp_inspect(VALUE self)
- {
+ static VALUE re2_regexp_inspect(VALUE self) {
/* Formats the pattern source as "#<RE2::Regexp /.../>" in a new String. */
re2_pattern *p;
VALUE result;
ostringstream output;
Data_Get_Struct(self, re2_pattern, p);
output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
- result = rb_str_new(output.str().data(), output.str().length());
/* The encoding name passed on is chosen from the regexp's utf8 option. */
+ result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1");
return result;
}
/*
@@ -495,16 +608,16 @@
* @return [String] a string version of the regular expression
* @example
* re2 = RE2::Regexp.new("woo?")
* re2.to_s #=> "woo?"
*/
- static VALUE
- re2_regexp_to_s(VALUE self)
- {
+ static VALUE re2_regexp_to_s(VALUE self) {
/* Returns the pattern source held by the wrapped RE2 object as a new String. */
re2_pattern *p;
Data_Get_Struct(self, re2_pattern, p);
- return rb_str_new(p->pattern->pattern().data(), p->pattern->pattern().size());
/* The encoding name passed on is chosen from the regexp's utf8 option. */
+ return ENCODED_STR_NEW(p->pattern->pattern().data(),
+ p->pattern->pattern().size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1");
}
/*
* Returns whether or not the regular expression +re2+
* was compiled successfully.
@@ -512,13 +625,11 @@
* @return [Boolean] whether or not compilation was successful
* @example
* re2 = RE2::Regexp.new("woo?")
* re2.ok? #=> true
*/
- static VALUE
- re2_regexp_ok(VALUE self)
- {
+ static VALUE re2_regexp_ok(VALUE self) {
/* Ask the wrapped RE2 object whether it is ok and map that to true/false. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const bool compiled = wrapper->pattern->ok();
return BOOL2RUBY(compiled);
}
@@ -529,13 +640,11 @@
* @return [Boolean] the utf8 option
* @example
* re2 = RE2::Regexp.new("woo?", :utf8 => true)
* re2.utf8? #=> true
*/
- static VALUE
- re2_regexp_utf8(VALUE self)
- {
+ static VALUE re2_regexp_utf8(VALUE self) {
/* Read the utf8 flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.utf8());
}
@@ -546,13 +655,11 @@
* @return [Boolean] the posix_syntax option
* @example
* re2 = RE2::Regexp.new("woo?", :posix_syntax => true)
* re2.posix_syntax? #=> true
*/
- static VALUE
- re2_regexp_posix_syntax(VALUE self)
- {
+ static VALUE re2_regexp_posix_syntax(VALUE self) {
/* Read the posix_syntax flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.posix_syntax());
}
@@ -563,13 +670,11 @@
* @return [Boolean] the longest_match option
* @example
* re2 = RE2::Regexp.new("woo?", :longest_match => true)
* re2.longest_match? #=> true
*/
- static VALUE
- re2_regexp_longest_match(VALUE self)
- {
+ static VALUE re2_regexp_longest_match(VALUE self) {
/* Read the longest_match flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.longest_match());
}
@@ -580,13 +685,11 @@
* @return [Boolean] the log_errors option
* @example
* re2 = RE2::Regexp.new("woo?", :log_errors => true)
* re2.log_errors? #=> true
*/
- static VALUE
- re2_regexp_log_errors(VALUE self)
- {
+ static VALUE re2_regexp_log_errors(VALUE self) {
/* Read the log_errors flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.log_errors());
}
@@ -597,13 +700,11 @@
* @return [Fixnum] the max_mem option
* @example
* re2 = RE2::Regexp.new("woo?", :max_mem => 1024)
* re2.max_mem #=> 1024
*/
- static VALUE
- re2_regexp_max_mem(VALUE self)
- {
+ static VALUE re2_regexp_max_mem(VALUE self) {
/* Read max_mem from the options of the wrapped RE2 object as a Fixnum. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return INT2FIX(opts.max_mem());
}
@@ -614,13 +715,11 @@
* @return [Boolean] the literal option
* @example
* re2 = RE2::Regexp.new("woo?", :literal => true)
* re2.literal? #=> true
*/
- static VALUE
- re2_regexp_literal(VALUE self)
- {
+ static VALUE re2_regexp_literal(VALUE self) {
/* Read the literal flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.literal());
}
@@ -631,13 +730,11 @@
* @return [Boolean] the never_nl option
* @example
* re2 = RE2::Regexp.new("woo?", :never_nl => true)
* re2.never_nl? #=> true
*/
- static VALUE
- re2_regexp_never_nl(VALUE self)
- {
+ static VALUE re2_regexp_never_nl(VALUE self) {
/* Read the never_nl flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.never_nl());
}
@@ -648,13 +745,11 @@
* @return [Boolean] the case_sensitive option
* @example
* re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
* re2.case_sensitive? #=> true
*/
- static VALUE
- re2_regexp_case_sensitive(VALUE self)
- {
+ static VALUE re2_regexp_case_sensitive(VALUE self) {
/* Read the case_sensitive flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.case_sensitive());
}
@@ -666,13 +761,11 @@
* @example
* re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
* re2.case_insensitive? #=> false
* re2.casefold? #=> false
*/
- static VALUE
- re2_regexp_case_insensitive(VALUE self)
- {
+ static VALUE re2_regexp_case_insensitive(VALUE self) {
/* Logical negation of case_sensitive?: false only when that returns true. */
if (re2_regexp_case_sensitive(self) == Qtrue) {
return Qfalse;
}
return Qtrue;
}
/*
* Returns whether or not the regular expression +re2+
@@ -681,13 +774,11 @@
* @return [Boolean] the perl_classes option
* @example
* re2 = RE2::Regexp.new("woo?", :perl_classes => true)
* re2.perl_classes? #=> true
*/
- static VALUE
- re2_regexp_perl_classes(VALUE self)
- {
+ static VALUE re2_regexp_perl_classes(VALUE self) {
/* Read the perl_classes flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.perl_classes());
}
@@ -698,13 +789,11 @@
* @return [Boolean] the word_boundary option
* @example
* re2 = RE2::Regexp.new("woo?", :word_boundary => true)
* re2.word_boundary? #=> true
*/
- static VALUE
- re2_regexp_word_boundary(VALUE self)
- {
+ static VALUE re2_regexp_word_boundary(VALUE self) {
/* Read the word_boundary flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.word_boundary());
}
@@ -715,13 +804,11 @@
* @return [Boolean] the one_line option
* @example
* re2 = RE2::Regexp.new("woo?", :one_line => true)
* re2.one_line? #=> true
*/
- static VALUE
- re2_regexp_one_line(VALUE self)
- {
+ static VALUE re2_regexp_one_line(VALUE self) {
/* Read the one_line flag from the options of the wrapped RE2 object. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const RE2::Options &opts = wrapper->pattern->options();
return BOOL2RUBY(opts.one_line());
}
@@ -729,13 +816,11 @@
* If the RE2 could not be created properly, returns an
* error string otherwise returns nil.
*
* @return [String, nil] the error string or nil
*/
- static VALUE
- re2_regexp_error(VALUE self)
- {
+ static VALUE re2_regexp_error(VALUE self) {
re2_pattern *p;
Data_Get_Struct(self, re2_pattern, p);
if (p->pattern->ok()) {
return Qnil;
} else {
@@ -747,32 +832,30 @@
* If the RE2 could not be created properly, returns
* the offending portion of the regexp otherwise returns nil.
*
* @return [String, nil] the offending portion of the regexp or nil
*/
- static VALUE
- re2_regexp_error_arg(VALUE self)
- {
+ static VALUE re2_regexp_error_arg(VALUE self) {
/*
* Returns nil when the wrapped RE2 object reports ok(); otherwise returns
* its error_arg() text as a new String.
*/
re2_pattern *p;
Data_Get_Struct(self, re2_pattern, p);
if (p->pattern->ok()) {
return Qnil;
} else {
- return rb_str_new(p->pattern->error_arg().data(), p->pattern->error_arg().size());
/* The encoding name passed on is chosen from the regexp's utf8 option. */
+ return ENCODED_STR_NEW(p->pattern->error_arg().data(),
+ p->pattern->error_arg().size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1");
}
}
/*
* Returns the program size, a very approximate measure
* of a regexp's "cost". Larger numbers are more expensive
* than smaller numbers.
*
* @return [Fixnum] the regexp "cost"
*/
- static VALUE
- re2_regexp_program_size(VALUE self)
- {
+ static VALUE re2_regexp_program_size(VALUE self) {
/* Report ProgramSize() of the wrapped RE2 object as a Fixnum. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const int cost = wrapper->pattern->ProgramSize();
return INT2FIX(cost);
}
@@ -780,13 +863,11 @@
* Returns a hash of the options currently set for
* +re2+.
*
* @return [Hash] the options
*/
- static VALUE
- re2_regexp_options(VALUE self)
- {
+ static VALUE re2_regexp_options(VALUE self) {
VALUE options;
re2_pattern *p;
Data_Get_Struct(self, re2_pattern, p);
options = rb_hash_new();
@@ -835,13 +916,11 @@
* wasn't valid on construction. The overall match ($0) does not
* count: if the regexp is "(a)(b)", returns 2.
*
* @return [Fixnum] the number of capturing subpatterns
*/
- static VALUE
- re2_regexp_number_of_capturing_groups(VALUE self)
- {
+ static VALUE re2_regexp_number_of_capturing_groups(VALUE self) {
/* Report NumberOfCapturingGroups() of the wrapped RE2 object as a Fixnum. */
re2_pattern *wrapper;
Data_Get_Struct(self, re2_pattern, wrapper);
const int group_count = wrapper->pattern->NumberOfCapturingGroups();
return INT2FIX(group_count);
}
@@ -849,13 +928,11 @@
/*
* Returns a hash of names to capturing indices of groups.
*
* @return [Hash] a hash of names to capturing indices
*/
- static VALUE
- re2_regexp_named_capturing_groups(VALUE self)
- {
+ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
VALUE capturing_groups;
re2_pattern *p;
map<string, int> groups;
map<string, int>::iterator iterator;
@@ -863,11 +940,12 @@
groups = p->pattern->NamedCapturingGroups();
capturing_groups = rb_hash_new();
for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
rb_hash_aset(capturing_groups,
- rb_str_new(iterator->first.data(), iterator->first.size()),
+ ENCODED_STR_NEW(iterator->first.data(), iterator->first.size(),
+ p->pattern->options().utf8() ? "UTF-8" : "ISO-8859-1"),
INT2FIX(iterator->second));
}
return capturing_groups;
}
@@ -914,13 +992,11 @@
* @example
* r = RE2::Regexp.new('w(o)(o)')
* r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
* r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
*/
- static VALUE
- re2_regexp_match(int argc, VALUE *argv, VALUE self)
- {
+ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
int n;
bool matched;
re2_pattern *p;
re2_matchdata *m;
VALUE text, number_of_matches, matchdata;
@@ -934,31 +1010,35 @@
} else {
n = p->pattern->NumberOfCapturingGroups();
}
if (n == 0) {
- matched = match(p->pattern, StringValuePtr(text), 0, (int)RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
+ matched = match(p->pattern, StringValuePtr(text), 0,
+ static_cast<int>(RSTRING_LEN(text)), RE2::UNANCHORED, 0, 0);
return BOOL2RUBY(matched);
} else {
/* Because match returns the whole match as well. */
n += 1;
matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
Data_Get_Struct(matchdata, re2_matchdata, m);
- m->matches = new (nothrow) re2::StringPiece[n];
+ m->matches = new(nothrow) re2::StringPiece[n];
m->regexp = self;
m->text = rb_str_dup(text);
rb_str_freeze(m->text);
if (m->matches == 0) {
- rb_raise(rb_eNoMemError, "not enough memory to allocate StringPieces for matches");
+ rb_raise(rb_eNoMemError,
+ "not enough memory to allocate StringPieces for matches");
}
m->number_of_matches = n;
- matched = match(p->pattern, StringValuePtr(text), 0, (int)RSTRING_LEN(text), RE2::UNANCHORED, m->matches, n);
+ matched = match(p->pattern, StringValuePtr(text), 0,
+ static_cast<int>(RSTRING_LEN(text)),
+ RE2::UNANCHORED, m->matches, n);
if (matched) {
return matchdata;
} else {
return Qnil;
@@ -970,22 +1050,43 @@
* Returns true or false to indicate a successful match.
* Equivalent to +re2.match(text, 0)+.
*
* @return [Boolean] whether the match was successful
*/
- static VALUE
- re2_regexp_match_query(VALUE self, VALUE text)
- {
+ static VALUE re2_regexp_match_query(VALUE self, VALUE text) {
/* Delegate to match with a requested match count of 0. */
VALUE match_args[2] = { text, INT2FIX(0) };
return re2_regexp_match(2, match_args, self);
}
/*
- * Replaces the first occurrence +pattern+ in +str+ with
+ * Returns a {RE2::Consumer} for scanning the given text incrementally.
+ *
+ * @example
+ * c = RE2::Regexp.new('(\w+)').consume("Foo bar baz")
+ */
+ static VALUE re2_regexp_consume(VALUE self, VALUE text) {
+ re2_pattern *p;
+ re2_consumer *c;
+ VALUE consumer;
+
+ /*
+ * Raises TypeError unless text is, or converts to, a String, so the
+ * RSTRING accessors below are never applied to another kind of object.
+ */
+ StringValue(text);
+
+ Data_Get_Struct(self, re2_pattern, p);
+ consumer = rb_class_new_instance(0, 0, re2_cConsumer);
+ Data_Get_Struct(consumer, re2_consumer, c);
+
+ /*
+ * Pass the length explicitly: a StringPiece built from the pointer alone
+ * measures the text with strlen and stops at the first NUL byte.
+ *
+ * The StringPiece borrows text's buffer; storing text in the struct lets
+ * re2_consumer_mark keep the String referenced.
+ * NOTE(review): mutating text after this call could invalidate that
+ * buffer -- confirm whether a frozen copy should be stored instead.
+ */
+ c->input = re2::StringPiece(RSTRING_PTR(text),
+ static_cast<int>(RSTRING_LEN(text)));
+ c->regexp = self;
+ c->text = text;
+ c->argc = p->pattern->NumberOfCapturingGroups();
+
+ return consumer;
+ }
+
+ /*
+ * Replaces the first occurrence of +pattern+ in +str+ with
* +rewrite+ <i>in place</i>.
*
* @param [String] str the string to modify
* @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
* @param [String] rewrite the string to replace with
@@ -994,15 +1095,14 @@
* RE2.Replace("hello there", "hello", "howdy") #=> "howdy there"
* re2 = RE2.new("hel+o")
* RE2.Replace("hello there", re2, "yo") #=> "yo there"
* text = "Good morning"
* RE2.Replace(text, "morn", "even") #=> "Good evening"
- * text #=> "Good evening"
+ * text #=> "Good evening"
*/
- static VALUE
- re2_Replace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite)
- {
+ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
+ VALUE rewrite) {
/* Look out for frozen strings. */
rb_check_frozen(str);
UNUSED(self);
@@ -1015,11 +1115,12 @@
/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
Data_Get_Struct(pattern, re2_pattern, p);
RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
} else {
- RE2::Replace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite));
+ RE2::Replace(&str_as_string, StringValuePtr(pattern),
+ StringValuePtr(rewrite));
}
/* Save the replacement as a VALUE. */
repl = rb_str_new(str_as_string.data(), str_as_string.size());
@@ -1031,11 +1132,11 @@
return str;
}
/*
- * Replaces every occurrence of +pattern+ in +str+ with
+ * Replaces every occurrence of +pattern+ in +str+ with
* +rewrite+ <i>in place</i>.
*
* @param [String] str the string to modify
* @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
* @param [String] rewrite the string to replace with
@@ -1046,13 +1147,12 @@
* RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
* text = "Good morning"
* RE2.GlobalReplace(text, "o", "ee") #=> "Geeeed meerning"
* text #=> "Geeeed meerning"
*/
- static VALUE
- re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite)
- {
+ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
+ VALUE rewrite) {
/* Look out for frozen strings. */
rb_check_frozen(str);
UNUSED(self);
@@ -1065,11 +1165,12 @@
/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
Data_Get_Struct(pattern, re2_pattern, p);
RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
} else {
- RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern), StringValuePtr(rewrite));
+ RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
+ StringValuePtr(rewrite));
}
/* Save the replacement as a VALUE. */
repl = rb_str_new(str_as_string.data(), str_as_string.size());
@@ -1090,73 +1191,124 @@
* @param [String] unquoted the unquoted string
* @return [String] the escaped string
* @example
* RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
*/
- static VALUE
- re2_QuoteMeta(VALUE self, VALUE unquoted)
- {
+ static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) {
/* Escape the argument with RE2::QuoteMeta; the receiver is not used. */
const char *raw = StringValuePtr(unquoted);
string escaped = RE2::QuoteMeta(raw);
UNUSED(self);
return rb_str_new(escaped.data(), escaped.size());
}
- void
- Init_re2()
- {
+ void Init_re2(void) {
re2_mRE2 = rb_define_module("RE2");
re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
+ re2_cConsumer = rb_define_class_under(re2_mRE2, "Consumer", rb_cObject);
rb_define_alloc_func(re2_cRegexp, (VALUE (*)(VALUE))re2_regexp_allocate);
- rb_define_alloc_func(re2_cMatchData, (VALUE (*)(VALUE))re2_matchdata_allocate);
+ rb_define_alloc_func(re2_cMatchData,
+ (VALUE (*)(VALUE))re2_matchdata_allocate);
+ rb_define_alloc_func(re2_cConsumer,
+ (VALUE (*)(VALUE))re2_consumer_allocate);
- rb_define_method(re2_cMatchData, "string", RUBY_METHOD_FUNC(re2_matchdata_string), 0);
- rb_define_method(re2_cMatchData, "regexp", RUBY_METHOD_FUNC(re2_matchdata_regexp), 0);
- rb_define_method(re2_cMatchData, "to_a", RUBY_METHOD_FUNC(re2_matchdata_to_a), 0);
- rb_define_method(re2_cMatchData, "size", RUBY_METHOD_FUNC(re2_matchdata_size), 0);
- rb_define_method(re2_cMatchData, "length", RUBY_METHOD_FUNC(re2_matchdata_size), 0);
- rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref), -1);
- rb_define_method(re2_cMatchData, "to_s", RUBY_METHOD_FUNC(re2_matchdata_to_s), 0);
- rb_define_method(re2_cMatchData, "inspect", RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
+ rb_define_method(re2_cMatchData, "string",
+ RUBY_METHOD_FUNC(re2_matchdata_string), 0);
+ rb_define_method(re2_cMatchData, "regexp",
+ RUBY_METHOD_FUNC(re2_matchdata_regexp), 0);
+ rb_define_method(re2_cMatchData, "to_a",
+ RUBY_METHOD_FUNC(re2_matchdata_to_a), 0);
+ rb_define_method(re2_cMatchData, "size",
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
+ rb_define_method(re2_cMatchData, "length",
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
+ rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref),
+ -1); rb_define_method(re2_cMatchData, "to_s",
+ RUBY_METHOD_FUNC(re2_matchdata_to_s), 0);
+ rb_define_method(re2_cMatchData, "inspect",
+ RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
- rb_define_method(re2_cRegexp, "initialize", RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
+ rb_define_method(re2_cConsumer, "string",
+ RUBY_METHOD_FUNC(re2_consumer_string), 0);
+ rb_define_method(re2_cConsumer, "regexp",
+ RUBY_METHOD_FUNC(re2_consumer_regexp), 0);
+ rb_define_method(re2_cConsumer, "consume",
+ RUBY_METHOD_FUNC(re2_consumer_consume), 0);
+ rb_define_method(re2_cConsumer, "rewind",
+ RUBY_METHOD_FUNC(re2_consumer_rewind), 0);
+
+ rb_define_method(re2_cRegexp, "initialize",
+ RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0);
- rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error), 0);
- rb_define_method(re2_cRegexp, "error_arg", RUBY_METHOD_FUNC(re2_regexp_error_arg), 0);
- rb_define_method(re2_cRegexp, "program_size", RUBY_METHOD_FUNC(re2_regexp_program_size), 0);
- rb_define_method(re2_cRegexp, "options", RUBY_METHOD_FUNC(re2_regexp_options), 0);
- rb_define_method(re2_cRegexp, "number_of_capturing_groups", RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0);
- rb_define_method(re2_cRegexp, "named_capturing_groups", RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
- rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match), -1);
- rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
- rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
- rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
+ rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error),
+ 0);
+ rb_define_method(re2_cRegexp, "error_arg",
+ RUBY_METHOD_FUNC(re2_regexp_error_arg), 0);
+ rb_define_method(re2_cRegexp, "program_size",
+ RUBY_METHOD_FUNC(re2_regexp_program_size), 0);
+ rb_define_method(re2_cRegexp, "options",
+ RUBY_METHOD_FUNC(re2_regexp_options), 0);
+ rb_define_method(re2_cRegexp, "number_of_capturing_groups",
+ RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0);
+ rb_define_method(re2_cRegexp, "named_capturing_groups",
+ RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
+ rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
+ -1);
+ rb_define_method(re2_cRegexp, "match?",
+ RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
+ rb_define_method(re2_cRegexp, "=~",
+ RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
+ rb_define_method(re2_cRegexp, "===",
+ RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
+ rb_define_method(re2_cRegexp, "consume", RUBY_METHOD_FUNC(re2_regexp_consume), 1);
rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
- rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
- rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
- rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
- rb_define_method(re2_cRegexp, "inspect", RUBY_METHOD_FUNC(re2_regexp_inspect), 0);
- rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8), 0);
- rb_define_method(re2_cRegexp, "posix_syntax?", RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0);
- rb_define_method(re2_cRegexp, "longest_match?", RUBY_METHOD_FUNC(re2_regexp_longest_match), 0);
- rb_define_method(re2_cRegexp, "log_errors?", RUBY_METHOD_FUNC(re2_regexp_log_errors), 0);
- rb_define_method(re2_cRegexp, "max_mem", RUBY_METHOD_FUNC(re2_regexp_max_mem), 0);
- rb_define_method(re2_cRegexp, "literal?", RUBY_METHOD_FUNC(re2_regexp_literal), 0);
- rb_define_method(re2_cRegexp, "never_nl?", RUBY_METHOD_FUNC(re2_regexp_never_nl), 0);
- rb_define_method(re2_cRegexp, "case_sensitive?", RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0);
- rb_define_method(re2_cRegexp, "case_insensitive?", RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
- rb_define_method(re2_cRegexp, "casefold?", RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
- rb_define_method(re2_cRegexp, "perl_classes?", RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0);
- rb_define_method(re2_cRegexp, "word_boundary?", RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0);
- rb_define_method(re2_cRegexp, "one_line?", RUBY_METHOD_FUNC(re2_regexp_one_line), 0);
+ rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s),
+ 0);
+ rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s),
+ 0);
+ rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s),
+ 0);
+ rb_define_method(re2_cRegexp, "inspect",
+ RUBY_METHOD_FUNC(re2_regexp_inspect), 0);
+ rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8),
+ 0);
+ rb_define_method(re2_cRegexp, "posix_syntax?",
+ RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0);
+ rb_define_method(re2_cRegexp, "longest_match?",
+ RUBY_METHOD_FUNC(re2_regexp_longest_match), 0);
+ rb_define_method(re2_cRegexp, "log_errors?",
+ RUBY_METHOD_FUNC(re2_regexp_log_errors), 0);
+ rb_define_method(re2_cRegexp, "max_mem",
+ RUBY_METHOD_FUNC(re2_regexp_max_mem), 0);
+ rb_define_method(re2_cRegexp, "literal?",
+ RUBY_METHOD_FUNC(re2_regexp_literal), 0);
+ rb_define_method(re2_cRegexp, "never_nl?",
+ RUBY_METHOD_FUNC(re2_regexp_never_nl), 0);
+ rb_define_method(re2_cRegexp, "case_sensitive?",
+ RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0);
+ rb_define_method(re2_cRegexp, "case_insensitive?",
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
+ rb_define_method(re2_cRegexp, "casefold?",
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
+ rb_define_method(re2_cRegexp, "perl_classes?",
+ RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0);
+ rb_define_method(re2_cRegexp, "word_boundary?",
+ RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0);
+ rb_define_method(re2_cRegexp, "one_line?",
+ RUBY_METHOD_FUNC(re2_regexp_one_line), 0);
- rb_define_module_function(re2_mRE2, "Replace", RUBY_METHOD_FUNC(re2_Replace), 3);
- rb_define_module_function(re2_mRE2, "GlobalReplace", RUBY_METHOD_FUNC(re2_GlobalReplace), 3);
- rb_define_module_function(re2_mRE2, "QuoteMeta", RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
- rb_define_singleton_method(re2_cRegexp, "escape", RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
- rb_define_singleton_method(re2_cRegexp, "quote", RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
- rb_define_singleton_method(re2_cRegexp, "compile", RUBY_METHOD_FUNC(rb_class_new_instance), -1);
+ rb_define_module_function(re2_mRE2, "Replace",
+ RUBY_METHOD_FUNC(re2_Replace), 3);
+ rb_define_module_function(re2_mRE2, "GlobalReplace",
+ RUBY_METHOD_FUNC(re2_GlobalReplace), 3);
+ rb_define_module_function(re2_mRE2, "QuoteMeta",
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
+ rb_define_singleton_method(re2_cRegexp, "escape",
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
+ rb_define_singleton_method(re2_cRegexp, "quote",
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
+ rb_define_singleton_method(re2_cRegexp, "compile",
+ RUBY_METHOD_FUNC(rb_class_new_instance), -1);
rb_define_global_function("RE2", RUBY_METHOD_FUNC(re2_re2), -1);
/* Create the symbols used in options. */
id_utf8 = rb_intern("utf8");