# # A [regular expression](https://en.wikipedia.org/wiki/Regular_expression) (also # called a *regexp*) is a *match pattern* (also simply called a *pattern*). # # A common notation for a regexp uses enclosing slash characters: # # /foo/ # # A regexp may be applied to a *target string*; The part of the string (if any) # that matches the pattern is called a *match*, and may be said *to match*: # # re = /red/ # re.match?('redirect') # => true # Match at beginning of target. # re.match?('bored') # => true # Match at end of target. # re.match?('credit') # => true # Match within target. # re.match?('foo') # => false # No match. # # ## Regexp Uses # # A regexp may be used: # # * To extract substrings based on a given pattern: # # re = /foo/ # => /foo/ # re.match('food') # => # # re.match('good') # => nil # # See sections [Method match](rdoc-ref:Regexp@Method+match) and [Operator # =~](rdoc-ref:Regexp@Operator+-3D~). # # * To determine whether a string matches a given pattern: # # re.match?('food') # => true # re.match?('good') # => false # # See section [Method match?](rdoc-ref:Regexp@Method+match-3F). # # * As an argument for calls to certain methods in other classes and modules; # most such methods accept an argument that may be either a string or the # (much more powerful) regexp. # # See [Regexp Methods](rdoc-ref:regexp/methods.rdoc). # # # ## Regexp Objects # # A regexp object has: # # * A source; see [Sources](rdoc-ref:Regexp@Sources). # # * Several modes; see [Modes](rdoc-ref:Regexp@Modes). # # * A timeout; see [Timeouts](rdoc-ref:Regexp@Timeouts). # # * An encoding; see [Encodings](rdoc-ref:Regexp@Encodings). # # # ## Creating a Regexp # # A regular expression may be created with: # # * A regexp literal using slash characters (see [Regexp # Literals](rdoc-ref:syntax/literals.rdoc@Regexp+Literals)): # # # This is a very common usage. 
# /foo/ # => /foo/ # # * A `%r` regexp literal (see [%r: Regexp # Literals](rdoc-ref:syntax/literals.rdoc@25r-3A+Regexp+Literals)): # # # Same delimiter character at beginning and end; # # useful for avoiding escaping characters # %r/name\/value pair/ # => /name\/value pair/ # %r:name/value pair: # => /name\/value pair/ # %r|name/value pair| # => /name\/value pair/ # # # Certain "paired" characters can be delimiters. # %r[foo] # => /foo/ # %r{foo} # => /foo/ # %r(foo) # => /foo/ # %r<foo> # => /foo/ # # * Method Regexp.new. # # # ## Method `match` # # Each of the methods Regexp#match, String#match, and Symbol#match returns a # MatchData object if a match was found, `nil` otherwise; each also sets [global # variables](rdoc-ref:Regexp@Global+Variables): # # 'food'.match(/foo/) # => #<MatchData "foo"> # 'food'.match(/bar/) # => nil # # ## Operator `=~` # # Each of the operators Regexp#=~, String#=~, and Symbol#=~ returns an integer # offset if a match was found, `nil` otherwise; each also sets [global # variables](rdoc-ref:Regexp@Global+Variables): # # /bar/ =~ 'foo bar' # => 4 # 'foo bar' =~ /bar/ # => 4 # /baz/ =~ 'foo bar' # => nil # # ## Method `match?` # # Each of the methods Regexp#match?, String#match?, and Symbol#match? returns # `true` if a match was found, `false` otherwise; none sets [global # variables](rdoc-ref:Regexp@Global+Variables): # # 'food'.match?(/foo/) # => true # 'food'.match?(/bar/) # => false # # ## Global Variables # # Certain regexp-oriented methods assign values to global variables: # # * `#match`: see [Method match](rdoc-ref:Regexp@Method+match). # * `#=~`: see [Operator =~](rdoc-ref:Regexp@Operator+-3D~). # # # The affected global variables are: # # * `$~`: Returns a MatchData object, or `nil`. # * `$&`: Returns the matched part of the string, or `nil`. # * `$``: Returns the part of the string to the left of the match, or `nil`. # * `$'`: Returns the part of the string to the right of the match, or `nil`. # * `$+`: Returns the last group matched, or `nil`. 
# * `$1`, `$2`, etc.: Returns the first, second, etc., matched group, or # `nil`. Note that `$0` is quite different; it returns the name of the # currently executing program. # # # Examples: # # # Matched string, but no matched groups. # 'foo bar bar baz'.match('bar') # $~ # => # # $& # => "bar" # $` # => "foo " # $' # => " bar baz" # $+ # => nil # $1 # => nil # # # Matched groups. # /s(\w{2}).*(c)/.match('haystack') # $~ # => # # $& # => "stac" # $` # => "hay" # $' # => "k" # $+ # => "c" # $1 # => "ta" # $2 # => "c" # $3 # => nil # # # No match. # 'foo'.match('bar') # $~ # => nil # $& # => nil # $` # => nil # $' # => nil # $+ # => nil # $1 # => nil # # Note that Regexp#match?, String#match?, and Symbol#match? do not set global # variables. # # ## Sources # # As seen above, the simplest regexp uses a literal expression as its source: # # re = /foo/ # => /foo/ # re.match('food') # => # # re.match('good') # => nil # # A rich collection of available *subexpressions* gives the regexp great power # and flexibility: # # * [Special characters](rdoc-ref:Regexp@Special+Characters) # * [Source literals](rdoc-ref:Regexp@Source+Literals) # * [Character classes](rdoc-ref:Regexp@Character+Classes) # * [Shorthand character classes](rdoc-ref:Regexp@Shorthand+Character+Classes) # * [Anchors](rdoc-ref:Regexp@Anchors) # * [Alternation](rdoc-ref:Regexp@Alternation) # * [Quantifiers](rdoc-ref:Regexp@Quantifiers) # * [Groups and captures](rdoc-ref:Regexp@Groups+and+Captures) # * [Unicode](rdoc-ref:Regexp@Unicode) # * [POSIX Bracket Expressions](rdoc-ref:Regexp@POSIX+Bracket+Expressions) # * [Comments](rdoc-ref:Regexp@Comments) # # # ### Special Characters # # Regexp special characters, called *metacharacters*, have special meanings in # certain contexts; depending on the context, these are sometimes # metacharacters: # # . ? - + * ^ \ | $ ( ) [ ] { } # # To match a metacharacter literally, backslash-escape it: # # # Matches one or more 'o' characters. 
# /o+/.match('foo') # => # # # Would match 'o+'. # /o\+/.match('foo') # => nil # # To match a backslash literally, backslash-escape it: # # /\./.match('\.') # => # # /\\./.match('\.') # => # # # Method Regexp.escape returns an escaped string: # # Regexp.escape('.?-+*^\|$()[]{}') # # => "\\.\\?\\-\\+\\*\\^\\\\\\|\\$\\(\\)\\[\\]\\{\\}" # # ### Source Literals # # The source literal largely behaves like a double-quoted string; see [String # Literals](rdoc-ref:syntax/literals.rdoc@String+Literals). # # In particular, a source literal may contain interpolated expressions: # # s = 'foo' # => "foo" # /#{s}/ # => /foo/ # /#{s.capitalize}/ # => /Foo/ # /#{2 + 2}/ # => /4/ # # There are differences between an ordinary string literal and a source literal; # see [Shorthand Character # Classes](rdoc-ref:Regexp@Shorthand+Character+Classes). # # * `\s` in an ordinary string literal is equivalent to a space character; in # a source literal, it's shorthand for matching a whitespace character. # * In an ordinary string literal, these are (needlessly) escaped characters; # in a source literal, they are shorthands for various matching characters: # # \w \W \d \D \h \H \S \R # # # ### Character Classes # # A *character class* is delimited by square brackets; it specifies that certain # characters match at a given point in the target string: # # # This character class will match any vowel. # re = /B[aeiou]rd/ # re.match('Bird') # => # # re.match('Bard') # => # # re.match('Byrd') # => nil # # A character class may contain hyphen characters to specify ranges of # characters: # # # These regexps have the same effect. # /[abcdef]/.match('foo') # => # # /[a-f]/.match('foo') # => # # /[a-cd-f]/.match('foo') # => # # # When the first character of a character class is a caret (`^`), the sense of # the class is inverted: it matches any character *except* those specified. # # /[^a-eg-z]/.match('f') # => # # # A character class may contain another character class. 
By itself this isn't # useful because `[a-z[0-9]]` describes the same set as `[a-z0-9]`. # # However, character classes also support the `&&` operator, which performs set # intersection on its arguments. The two can be combined as follows: # # /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) # # This is equivalent to: # # /[abh-w]/ # # ### Shorthand Character Classes # # Each of the following metacharacters serves as a shorthand for a character # class: # # * `/./`: Matches any character except a newline: # # /./.match('foo') # => # # /./.match("\n") # => nil # # * `/./m`: Matches any character, including a newline; see [Multiline # Mode](rdoc-ref:Regexp@Multiline+Mode): # # /./m.match("\n") # => # # # * `/\w/`: Matches a word character: equivalent to `[a-zA-Z0-9_]`: # # /\w/.match(' foo') # => # # /\w/.match(' _') # => # # /\w/.match(' ') # => nil # # * `/\W/`: Matches a non-word character: equivalent to `[^a-zA-Z0-9_]`: # # /\W/.match(' ') # => # # /\W/.match('_') # => nil # # * `/\d/`: Matches a digit character: equivalent to `[0-9]`: # # /\d/.match('THX1138') # => # # /\d/.match('foo') # => nil # # * `/\D/`: Matches a non-digit character: equivalent to `[^0-9]`: # # /\D/.match('123Jump!') # => # # /\D/.match('123') # => nil # # * `/\h/`: Matches a hexdigit character: equivalent to `[0-9a-fA-F]`: # # /\h/.match('xyz fedcba9876543210') # => # # /\h/.match('xyz') # => nil # # * `/\H/`: Matches a non-hexdigit character: equivalent to `[^0-9a-fA-F]`: # # /\H/.match('fedcba9876543210xyz') # => # # /\H/.match('fedcba9876543210') # => nil # # * `/\s/`: Matches a whitespace character: equivalent to `/[ \t\r\n\f\v]/`: # # /\s/.match('foo bar') # => # # /\s/.match('foo') # => nil # # * `/\S/`: Matches a non-whitespace character: equivalent to `/[^ # \t\r\n\f\v]/`: # # /\S/.match(" \t\r\n\f\v foo") # => # # /\S/.match(" \t\r\n\f\v") # => nil # # * `/\R/`: Matches a linebreak, platform-independently: # # /\R/.match("\r") # => # # Carriage return (CR) # /\R/.match("\n") # => # # 
Newline (LF) # /\R/.match("\f") # => # # Formfeed (FF) # /\R/.match("\v") # => # # Vertical tab (VT) # /\R/.match("\r\n") # => # # CRLF # /\R/.match("\u0085") # => # # Next line (NEL) # /\R/.match("\u2028") # => # # Line separator (LSEP) # /\R/.match("\u2029") # => # # Paragraph separator (PSEP) # # # ### Anchors # # An anchor is a metasequence that matches a zero-width position between # characters in the target string. # # For a subexpression with no anchor, matching may begin anywhere in the target # string: # # /real/.match('surrealist') # => # # # For a subexpression with an anchor, matching must begin at the matched anchor. # # #### Boundary Anchors # # Each of these anchors matches a boundary: # # * `^`: Matches the beginning of a line: # # /^bar/.match("foo\nbar") # => # # /^ar/.match("foo\nbar") # => nil # # * `$`: Matches the end of a line: # # /bar$/.match("foo\nbar") # => # # /ba$/.match("foo\nbar") # => nil # # * `\A`: Matches the beginning of the string: # # /\Afoo/.match('foo bar') # => # # /\Afoo/.match(' foo bar') # => nil # # * `\Z`: Matches the end of the string; if string ends with a single newline, # it matches just before the ending newline: # # /foo\Z/.match('bar foo') # => # # /foo\Z/.match('foo bar') # => nil # /foo\Z/.match("bar foo\n") # => # # /foo\Z/.match("bar foo\n\n") # => nil # # * `\z`: Matches the end of the string: # # /foo\z/.match('bar foo') # => # # /foo\z/.match('foo bar') # => nil # /foo\z/.match("bar foo\n") # => nil # # * `\b`: Matches word boundary when not inside brackets; matches backspace # (`"0x08"`) when inside brackets: # # /foo\b/.match('foo bar') # => # # /foo\b/.match('foobar') # => nil # # * `\B`: Matches non-word boundary: # # /foo\B/.match('foobar') # => # # /foo\B/.match('foo bar') # => nil # # * `\G`: Matches first matching position: # # In methods like String#gsub and String#scan, it changes on each iteration. 
# It initially matches the beginning of subject, and in each following # iteration it matches where the last match finished. # # "    a b c".gsub(/ /, '_') # => "____a_b_c" # "    a b c".gsub(/\G /, '_') # => "____a b c" # # In methods like Regexp#match and String#match that take an optional # offset, it matches where the search begins. # # "hello, world".match(/,/, 3) # => #<MatchData ","> # "hello, world".match(/\G,/, 3) # => nil # # # #### Lookaround Anchors # # Lookahead anchors: # # * `(?=*pat*)`: Positive lookahead assertion: ensures that the following # characters match *pat*, but doesn't include those characters in the # matched substring. # # * `(?!*pat*)`: Negative lookahead assertion: ensures that the following # characters *do not* match *pat*, but doesn't include those characters in # the matched substring. # # # Lookbehind anchors: # # * `(?<=*pat*)`: Positive lookbehind assertion: ensures that the preceding # characters match *pat*, but doesn't include those characters in the # matched substring. # # * `(?<!*pat*)`: Negative lookbehind assertion: ensures that the preceding # characters *do not* match *pat*, but doesn't include those characters in # the matched substring. # # # The pattern below uses positive lookahead and positive lookbehind to match # text appearing in `<b>`...`</b>` tags without including the tags in the match: # # /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favors the <b>bold</b>.") # # => #<MatchData "bold"> # # #### Match-Reset Anchor # # * `\K`: Match reset: the matched content preceding `\K` in the regexp is # excluded from the result. For example, the following two regexps are # almost equivalent: # # /ab\Kc/.match('abc') # => #<MatchData "c"> # /(?<=ab)c/.match('abc') # => #<MatchData "c"> # # These match the same string and `$&` equals `'c'`, while the matched position # is different. # # As are the following two regexps: # # /(a)\K(b)\Kc/ # /(?<=(?<=(a))(b))c/ # # # ### Alternation # # The vertical bar metacharacter (`|`) may be used within parentheses to express # alternation: two or more subexpressions any of which may match the target # string. 
# # Two alternatives: # # re = /(a|b)/ # re.match('foo') # => nil # re.match('bar') # => # # # Four alternatives: # # re = /(a|b|c|d)/ # re.match('shazam') # => # # re.match('cold') # => # # # Each alternative is a subexpression, and may be composed of other # subexpressions: # # re = /([a-c]|[x-z])/ # re.match('bar') # => # # re.match('ooz') # => # # # Method Regexp.union provides a convenient way to construct a regexp with # alternatives. # # ### Quantifiers # # A simple regexp matches one character: # # /\w/.match('Hello') # => # # # An added *quantifier* specifies how many matches are required or allowed: # # * `*` - Matches zero or more times: # # /\w*/.match('') # # => # # /\w*/.match('x') # # => # # /\w*/.match('xyz') # # => # # # * `+` - Matches one or more times: # # /\w+/.match('') # => nil # /\w+/.match('x') # => # # /\w+/.match('xyz') # => # # # * `?` - Matches zero or one times: # # /\w?/.match('') # => # # /\w?/.match('x') # => # # /\w?/.match('xyz') # => # # # * `{`*n*`}` - Matches exactly *n* times: # # /\w{2}/.match('') # => nil # /\w{2}/.match('x') # => nil # /\w{2}/.match('xyz') # => # # # * `{`*min*`,}` - Matches *min* or more times: # # /\w{2,}/.match('') # => nil # /\w{2,}/.match('x') # => nil # /\w{2,}/.match('xy') # => # # /\w{2,}/.match('xyz') # => # # # * `{,`*max*`}` - Matches *max* or fewer times: # # /\w{,2}/.match('') # => # # /\w{,2}/.match('x') # => # # /\w{,2}/.match('xyz') # => # # # * `{`*min*`,`*max*`}` - Matches at least *min* times and at most *max* # times: # # /\w{1,2}/.match('') # => nil # /\w{1,2}/.match('x') # => # # /\w{1,2}/.match('xyz') # => # # # # #### Greedy, Lazy, or Possessive Matching # # Quantifier matching may be greedy, lazy, or possessive: # # * In *greedy* matching, as many occurrences as possible are matched while # still allowing the overall match to succeed. Greedy quantifiers: `*`, `+`, # `?`, `{min, max}` and its variants. # * In *lazy* matching, the minimum number of occurrences are matched. 
Lazy # quantifiers: `*?`, `+?`, `??`, `{min, max}?` and its variants. # * In *possessive* matching, once a match is found, there is no backtracking; # that match is retained, even if it jeopardises the overall match. # Possessive quantifiers: `*+`, `++`, `?+`. Note that `{min, max}` and its # variants do *not* support possessive matching. # # # More: # # * About greedy and lazy matching, see [Choosing Minimal or Maximal # Repetition](https://doc.lagout.org/programmation/Regular%20Expressions/Reg # ular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Progr # amming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%20201 # 2-09-06%5D.pdf#tutorial-backtrack). # * About possessive matching, see [Eliminate Needless # Backtracking](https://doc.lagout.org/programmation/Regular%20Expressions/R # egular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Pro # gramming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202 # 012-09-06%5D.pdf#tutorial-backtrack). # # # ### Groups and Captures # # A simple regexp has (at most) one match: # # re = /\d\d\d\d-\d\d-\d\d/ # re.match('1943-02-04') # => # # re.match('1943-02-04').size # => 1 # re.match('foo') # => nil # # Adding one or more pairs of parentheses, `(*subexpression*)`, defines # *groups*, which may result in multiple matched substrings, called *captures*: # # re = /(\d\d\d\d)-(\d\d)-(\d\d)/ # re.match('1943-02-04') # => # # re.match('1943-02-04').size # => 4 # # The first capture is the entire matched string; the other captures are the # matched substrings from the groups. 
# # A group may have a [quantifier](rdoc-ref:Regexp@Quantifiers): # # re = /July 4(th)?/ # re.match('July 4') # => #<MatchData "July 4" 1:nil> # re.match('July 4th') # => #<MatchData "July 4th" 1:"th"> # # re = /(foo)*/ # re.match('') # => #<MatchData "" 1:nil> # re.match('foo') # => #<MatchData "foo" 1:"foo"> # re.match('foofoo') # => #<MatchData "foofoo" 1:"foo"> # # re = /(foo)+/ # re.match('') # => nil # re.match('foo') # => #<MatchData "foo" 1:"foo"> # re.match('foofoo') # => #<MatchData "foofoo" 1:"foo"> # # The returned MatchData object gives access to the matched substrings: # # re = /(\d\d\d\d)-(\d\d)-(\d\d)/ # md = re.match('1943-02-04') # # => #<MatchData "1943-02-04" 1:"1943" 2:"02" 3:"04"> # md[0] # => "1943-02-04" # md[1] # => "1943" # md[2] # => "02" # md[3] # => "04" # # #### Non-Capturing Groups # # A group may be made non-capturing; it is still a group (and, for example, can # have a quantifier), but its matching substring is not included among the # captures. # # A non-capturing group begins with `?:` (inside the parentheses): # # # Don't capture the year. # re = /(?:\d\d\d\d)-(\d\d)-(\d\d)/ # md = re.match('1943-02-04') # => #<MatchData "1943-02-04" 1:"02" 2:"04"> # # #### Backreferences # # A group match may also be referenced within the regexp itself; such a # reference is called a *backreference*: # # /[csh](..) [csh]\1 in/.match('The cat sat in the hat') # # => #<MatchData "cat sat in" 1:"at"> # # This table shows how each subexpression in the regexp above matches a # substring in the target string: # # | Subexpression in Regexp | Matching Substring in Target String | # |---------------------------|-------------------------------------| # | First '[csh]' | Character 'c' | # | '(..)' | First substring 'at' | # | First space ' ' | First space character ' ' | # | Second '[csh]' | Character 's' | # | '\1' (backreference 'at') | Second substring 'at' | # | ' in' | Substring ' in' | # # A regexp may contain any number of groups: # # * For a large number of groups: # # * The ordinary `\`*n* notation applies only for *n* in range (1..9). # * The `MatchData[*n*]` notation applies for any non-negative *n*. 
# # # * `\0` is a special backreference, referring to the entire matched string; # it may not be used within the regexp itself, but may be used outside it # (for example, in a substitution method call): # # 'The cat sat in the hat'.gsub(/[csh]at/, '\0s') # # => "The cats sats in the hats" # # # #### Named Captures # # As seen above, a capture can be referred to by its number. A capture can also # have a name, prefixed as `?<*name*>` or `?'*name*'`, and the name (symbolized) # may be used as an index in `MatchData[]`: # # md = /\$(?<dollars>\d+)\.(?'cents'\d+)/.match("$3.67") # # => #<MatchData "$3.67" dollars:"3" cents:"67"> # md[:dollars] # => "3" # md[:cents] # => "67" # # The capture numbers are still valid. # md[2] # => "67" # # When a regexp contains a named capture, there are no unnamed captures: # # /\$(?<dollars>\d+)\.(\d+)/.match("$3.67") # # => #<MatchData "$3.67" dollars:"3"> # # A named group may be backreferenced as `\k<*name*>`: # # /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy') # # => #<MatchData "ototo" vowel:"o"> # # When (and only when) a regexp contains named capture groups and appears before # the `=~` operator, the captured substrings are assigned to local variables # with corresponding names: # # /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ '$3.67' # dollars # => "3" # cents # => "67" # # Method Regexp#named_captures returns a hash that maps each capture name to an # array of its group indexes (MatchData#named_captures returns the capture names # with their matched substrings); method Regexp#names returns an array of the # capture names. # # #### Atomic Grouping # # A group may be made *atomic* with `(?>`*subexpression*`)`. # # This causes the subexpression to be matched independently of the rest of the # expression, so that the matched substring becomes fixed for the remainder of # the match, unless the entire subexpression must be abandoned and subsequently # revisited. # # In this way *subexpression* is treated as a non-divisible whole. Atomic # grouping is typically used to optimise patterns to prevent needless # backtracking. # # Example (without atomic grouping): # # /".*"/.match('"Quote"') # => #<MatchData "\"Quote\""> # # Analysis: # # 1. 
The leading subexpression `"` in the pattern matches the first character # `"` in the target string. # 2. The next subexpression `.*` matches the next substring `Quote"` (including # the trailing double-quote). # 3. Now there is nothing left in the target string to match the trailing # subexpression `"` in the pattern; this would cause the overall match to # fail. # 4. The matched substring is backtracked by one position: `Quote`. # 5. The final subexpression `"` now matches the final substring `"`, and the # overall match succeeds. # # # If subexpression `.*` is grouped atomically, the backtracking is disabled, and # the overall match fails: # # /"(?>.*)"/.match('"Quote"') # => nil # # Atomic grouping can affect performance; see [Atomic # Group](https://www.regular-expressions.info/atomic.html). # # #### Subexpression Calls # # As seen above, a backreference number (`\`*n*) or name (`\k<*name*>`) gives # access to a captured *substring*; the corresponding regexp *subexpression* may # also be accessed, via the number (`\g<*n*>`) or name (`\g<*name*>`): # # /\A(?<paren>\(\g<paren>*\))*\z/.match('(())') # # ^1 # # ^2 # # ^3 # # ^4 # # ^5 # # ^6 # # ^7 # # ^8 # # ^9 # # ^10 # # The pattern: # # 1. Matches at the beginning of the string, i.e. before the first character. # 2. Enters a named group `paren`. # 3. Matches the first character in the string, `'('`. # 4. Calls the `paren` group again, i.e. recurses back to the second step. # 5. Re-enters the `paren` group. # 6. Matches the second character in the string, `'('`. # 7. Attempts to call `paren` a third time, but fails because doing so would # prevent an overall successful match. # 8. Matches the third character in the string, `')'`; marks the end of the # second recursive call. # 9. Matches the fourth character in the string, `')'`. # 10. Matches the end of the string. # # # See [Subexpression # calls](https://learnbyexample.github.io/Ruby_Regexp/groupings-and-backreferenc # es.html?highlight=subexpression#subexpression-calls). 
# # #### Conditionals # # The conditional construct takes the form `(?(*cond*)*yes*|*no*)`, where: # # * *cond* may be a capture number or name. # * The match to be applied is *yes* if *cond* is captured; otherwise the # match to be applied is *no*. # * If not needed, `|`*no* may be omitted. # # # Examples: # # re = /\A(foo)?(?(1)(T)|(F))\z/ # re.match('fooT') # => #<MatchData "fooT" 1:"foo" 2:"T" 3:nil> # re.match('F') # => #<MatchData "F" 1:nil 2:nil 3:"F"> # re.match('fooF') # => nil # re.match('T') # => nil # # re = /\A(?<xyzzy>foo)?(?(<xyzzy>)(T)|(F))\z/ # re.match('fooT') # => #<MatchData "fooT" xyzzy:"foo"> # re.match('F') # => #<MatchData "F" xyzzy:nil> # re.match('fooF') # => nil # re.match('T') # => nil # # #### Absence Operator # # The absence operator is a special group that matches anything which does *not* # match the contained subexpressions. # # /(?~real)/.match('surrealist') # => #<MatchData "surrea"> # /(?~real)ist/.match('surrealist') # => #<MatchData "ealist"> # /sur(?~real)ist/.match('surrealist') # => nil # # ### Unicode # # #### Unicode Properties # # The `/\p{*property_name*}/` construct (with lowercase `p`) matches characters # using a Unicode property name, much like a character class; property `Alpha` # specifies alphabetic characters: # # /\p{Alpha}/.match('a') # => #<MatchData "a"> # /\p{Alpha}/.match('1') # => nil # # A property can be inverted by prefixing the name with a caret character (`^`): # # /\p{^Alpha}/.match('1') # => #<MatchData "1"> # /\p{^Alpha}/.match('a') # => nil # # Or by using `\P` (uppercase `P`): # # /\P{Alpha}/.match('1') # => #<MatchData "1"> # /\P{Alpha}/.match('a') # => nil # # See [Unicode Properties](rdoc-ref:regexp/unicode_properties.rdoc) for regexps # based on the numerous properties. 
# # Some commonly-used properties correspond to POSIX bracket expressions: # # * `/\p{Alnum}/`: Alphabetic and numeric character # * `/\p{Alpha}/`: Alphabetic character # * `/\p{Blank}/`: Space or tab # * `/\p{Cntrl}/`: Control character # * `/\p{Digit}/`: Digit character (0-9, and similar) # * `/\p{Lower}/`: Lowercase alphabetical character # * `/\p{Print}/`: Like `\p{Graph}`, but includes the space character # * `/\p{Punct}/`: Punctuation character # * `/\p{Space}/`: Whitespace character (`[:blank:]`, newline, carriage # return, etc.) # * `/\p{Upper}/`: Uppercase alphabetical character # * `/\p{XDigit}/`: Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) # # # These are also commonly used: # # * `/\p{Emoji}/`: Unicode emoji. # * `/\p{Graph}/`: Non-blank character (excludes spaces, control characters, # and similar). # * `/\p{Word}/`: A member in one of these Unicode character categories (see # below) or having one of these Unicode properties: # # * Unicode categories: # * `Mark` (`M`). # * `Decimal Number` (`Nd`). # * `Connector Punctuation` (`Pc`). # # # * Unicode properties: # * `Alpha` # * `Join_Control` # # # # * `/\p{ASCII}/`: A character in the ASCII character set. # * `/\p{Any}/`: Any Unicode character (including unassigned characters). # * `/\p{Assigned}/`: An assigned character. # # # #### Unicode Character Categories # # A Unicode character category name: # # * May be either its full name or its abbreviated name. # * Is case-insensitive. # * Treats a space, a hyphen, and an underscore as equivalent. # # # Examples: # # /\p{lu}/ # => /\p{lu}/ # /\p{LU}/ # => /\p{LU}/ # /\p{Uppercase Letter}/ # => /\p{Uppercase Letter}/ # /\p{Uppercase_Letter}/ # => /\p{Uppercase_Letter}/ # /\p{UPPERCASE-LETTER}/ # => /\p{UPPERCASE-LETTER}/ # # Below are the Unicode character category abbreviations and names. Enumerations # of characters in each category are at the links. # # Letters: # # * `L`, `Letter`: `LC`, `Lm`, or `Lo`. # * `LC`, `Cased_Letter`: `Ll`, `Lt`, or `Lu`. 
# * [Ll, Lowercase_Letter](https://www.compart.com/en/unicode/category/Ll). # * [Lm, Modifier_Letter](https://www.compart.com/en/unicode/category/Lm). # * [Lo, Other_Letter](https://www.compart.com/en/unicode/category/Lo). # * [Lt, Titlecase_Letter](https://www.compart.com/en/unicode/category/Lt). # * [Lu, Uppercase_Letter](https://www.compart.com/en/unicode/category/Lu). # # # Marks: # # * `M`, `Mark`: `Mc`, `Me`, or `Mn`. # * [Mc, Spacing_Mark](https://www.compart.com/en/unicode/category/Mc). # * [Me, Enclosing_Mark](https://www.compart.com/en/unicode/category/Me). # * [Mn, Nonspacing_Mark](https://www.compart.com/en/unicode/category/Mn). # # # Numbers: # # * `N`, `Number`: `Nd`, `Nl`, or `No`. # * [Nd, Decimal_Number](https://www.compart.com/en/unicode/category/Nd). # * [Nl, Letter_Number](https://www.compart.com/en/unicode/category/Nl). # * [No, Other_Number](https://www.compart.com/en/unicode/category/No). # # # Punctuation: # # * `P`, `Punctuation`: `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. # * [Pc, # Connector_Punctuation](https://www.compart.com/en/unicode/category/Pc). # * [Pd, Dash_Punctuation](https://www.compart.com/en/unicode/category/Pd). # * [Pe, Close_Punctuation](https://www.compart.com/en/unicode/category/Pe). # * [Pf, Final_Punctuation](https://www.compart.com/en/unicode/category/Pf). # * [Pi, Initial_Punctuation](https://www.compart.com/en/unicode/category/Pi). # * [Po, Other_Punctuation](https://www.compart.com/en/unicode/category/Po). # * [Ps, Open_Punctuation](https://www.compart.com/en/unicode/category/Ps). # # * `S`, `Symbol`: `Sc`, `Sk`, `Sm`, or `So`. # * [Sc, Currency_Symbol](https://www.compart.com/en/unicode/category/Sc). # * [Sk, Modifier_Symbol](https://www.compart.com/en/unicode/category/Sk). # * [Sm, Math_Symbol](https://www.compart.com/en/unicode/category/Sm). # * [So, Other_Symbol](https://www.compart.com/en/unicode/category/So). # # * `Z`, `Separator`: `Zl`, `Zp`, or `Zs`. 
# * [Zl, Line_Separator](https://www.compart.com/en/unicode/category/Zl). # * [Zp, Paragraph_Separator](https://www.compart.com/en/unicode/category/Zp). # * [Zs, Space_Separator](https://www.compart.com/en/unicode/category/Zs). # # * `C`, `Other`: `Cc`, `Cf`, `Cn`, `Co`, or `Cs`. # * [Cc, Control](https://www.compart.com/en/unicode/category/Cc). # * [Cf, Format](https://www.compart.com/en/unicode/category/Cf). # * [Cn, Unassigned](https://www.compart.com/en/unicode/category/Cn). # * [Co, Private_Use](https://www.compart.com/en/unicode/category/Co). # * [Cs, Surrogate](https://www.compart.com/en/unicode/category/Cs). # # # #### Unicode Scripts and Blocks # # Among the Unicode properties are: # # * [Unicode scripts](https://en.wikipedia.org/wiki/Script_(Unicode)); see # [supported scripts](https://www.unicode.org/standard/supported.html). # * [Unicode blocks](https://en.wikipedia.org/wiki/Unicode_block); see # [supported blocks](http://www.unicode.org/Public/UNIDATA/Blocks.txt). # # # ### POSIX Bracket Expressions # # A POSIX *bracket expression* is also similar to a character class. These # expressions provide a portable alternative to the above, with the added # benefit of encompassing non-ASCII characters: # # * `/\d/` matches only ASCII decimal digits `0` through `9`. # * `/[[:digit:]]/` matches any character in the Unicode `Decimal Number` # (`Nd`) category; see below. # # # The POSIX bracket expressions: # # * `/[[:digit:]]/`: Matches a [Unicode # digit](https://www.compart.com/en/unicode/category/Nd): # # /[[:digit:]]/.match('9') # => #<MatchData "9"> # /[[:digit:]]/.match("\u{1fbf9}") # => #<MatchData "🯹"> # # * `/[[:xdigit:]]/`: Matches a digit allowed in a hexadecimal number; # equivalent to `[0-9a-fA-F]`. 
# # * `/[[:upper:]]/`: Matches a [Unicode uppercase # letter](https://www.compart.com/en/unicode/category/Lu): # # /[[:upper:]]/.match('A') # => #<MatchData "A"> # /[[:upper:]]/.match("\u00c6") # => #<MatchData "Æ"> # # * `/[[:lower:]]/`: Matches a [Unicode lowercase # letter](https://www.compart.com/en/unicode/category/Ll): # # /[[:lower:]]/.match('a') # => #<MatchData "a"> # /[[:lower:]]/.match("\u01fd") # => #<MatchData "ǽ"> # # * `/[[:alpha:]]/`: Matches `/[[:upper:]]/` or `/[[:lower:]]/`. # # * `/[[:alnum:]]/`: Matches `/[[:alpha:]]/` or `/[[:digit:]]/`. # # * `/[[:space:]]/`: Matches [Unicode space # character](https://www.compart.com/en/unicode/category/Zs): # # /[[:space:]]/.match(' ') # => #<MatchData " "> # /[[:space:]]/.match("\u2005") # => #<MatchData " "> # # * `/[[:blank:]]/`: Matches `/[[:space:]]/` or tab character: # # /[[:blank:]]/.match(' ') # => #<MatchData " "> # /[[:blank:]]/.match("\u2005") # => #<MatchData " "> # /[[:blank:]]/.match("\t") # => #<MatchData "\t"> # # * `/[[:cntrl:]]/`: Matches [Unicode control # character](https://www.compart.com/en/unicode/category/Cc): # # /[[:cntrl:]]/.match("\u0000") # => #<MatchData "\u0000"> # /[[:cntrl:]]/.match("\u009f") # => #<MatchData "\u009F"> # # * `/[[:graph:]]/`: Matches any character except `/[[:space:]]/` or # `/[[:cntrl:]]/`. # # * `/[[:print:]]/`: Matches `/[[:graph:]]/` or space character. # # * `/[[:punct:]]/`: Matches any [Unicode punctuation # character](https://www.compart.com/en/unicode/category/Po). # # # Ruby also supports these (non-POSIX) bracket expressions: # # * `/[[:ascii:]]/`: Matches a character in the ASCII character set. # * `/[[:word:]]/`: Matches a character in one of these Unicode character # categories or having one of these Unicode properties: # # * Unicode categories: # * `Mark` (`M`). # * `Decimal Number` (`Nd`). # * `Connector Punctuation` (`Pc`). # # # * Unicode properties: # * `Alpha` # * `Join_Control` # # # # # ### Comments # # A comment may be included in a regexp pattern using the `(?#`*comment*`)` # construct, where *comment* is a substring that is to be ignored. 
That is, the comment is arbitrary # text ignored by the regexp engine: # # /foo(?#Ignore me)bar/.match('foobar') # => #<MatchData "foobar"> # # The comment may not include an unescaped terminator character. # # See also [Extended Mode](rdoc-ref:Regexp@Extended+Mode). # # ## Modes # # Each of these modifiers sets a mode for the regexp: # # * `i`: `/*pattern*/i` sets [Case-Insensitive # Mode](rdoc-ref:Regexp@Case-Insensitive+Mode). # * `m`: `/*pattern*/m` sets [Multiline Mode](rdoc-ref:Regexp@Multiline+Mode). # * `x`: `/*pattern*/x` sets [Extended Mode](rdoc-ref:Regexp@Extended+Mode). # * `o`: `/*pattern*/o` sets [Interpolation # Mode](rdoc-ref:Regexp@Interpolation+Mode). # # # Any, all, or none of these may be applied. # # Modifiers `i`, `m`, and `x` may be applied to subexpressions: # # * `(?*modifier*)` turns the mode "on" for ensuing subexpressions # * `(?-*modifier*)` turns the mode "off" for ensuing subexpressions # * `(?*modifier*:*subexp*)` turns the mode "on" for *subexp* within the group # * `(?-*modifier*:*subexp*)` turns the mode "off" for *subexp* within the # group # # # Example: # # re = /(?i)te(?-i)st/ # re.match('test') # => #<MatchData "test"> # re.match('TEst') # => #<MatchData "TEst"> # re.match('TEST') # => nil # re.match('teST') # => nil # # re = /t(?i:e)st/ # re.match('test') # => #<MatchData "test"> # re.match('tEst') # => #<MatchData "tEst"> # re.match('tEST') # => nil # # Method Regexp#options returns an integer whose value shows the settings for # case-insensitivity mode, multiline mode, and extended mode. # # ### Case-Insensitive Mode # # By default, a regexp is case-sensitive: # # /foo/.match('FOO') # => nil # # Modifier `i` enables case-insensitive mode: # # /foo/i.match('FOO') # # => #<MatchData "FOO"> # # Method Regexp#casefold? returns whether the mode is case-insensitive. 
# # ### Multiline Mode # # The multiline-mode in Ruby is what is commonly called a "dot-all mode": # # * Without the `m` modifier, the subexpression `.` does not match newlines: # # /a.c/.match("a\nc") # => nil # # * With the modifier, it does match: # # /a.c/m.match("a\nc") # => #<MatchData "a\nc"> # # # Unlike other languages, the modifier `m` does not affect the anchors `^` and # `$`. These anchors always match at line-boundaries in Ruby. # # ### Extended Mode # # Modifier `x` enables extended mode, which means that: # # * Literal white space in the pattern is to be ignored. # * Character `#` marks the remainder of its containing line as a comment, # which is also to be ignored for matching purposes. # # # In extended mode, whitespace and comments may be used to form a # self-documented regexp. # # Regexp not in extended mode (matches some Roman numerals): # # pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$' # re = /#{pattern}/ # re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III"> # # Regexp in extended mode: # # pattern = <<-EOT # ^ # beginning of string # M{0,3} # thousands - 0 to 3 Ms # (CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 Cs), # # or 500-800 (D, followed by 0 to 3 Cs) # (XC|XL|L?X{0,3}) # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 Xs), # # or 50-80 (L, followed by 0 to 3 Xs) # (IX|IV|V?I{0,3}) # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 Is), # # or 5-8 (V, followed by 0 to 3 Is) # $ # end of string # EOT # re = /#{pattern}/x # re.match('MCMXLIII') # => #<MatchData "MCMXLIII" 1:"CM" 2:"XL" 3:"III"> # # ### Interpolation Mode # # Modifier `o` means that the first time a literal regexp with interpolations is # encountered, the generated Regexp object is saved and used for all future # evaluations of that literal regexp. Without modifier `o`, the generated Regexp # is not saved, so each evaluation of the literal regexp generates a new Regexp # object.
# # Without modifier `o`: # # def letters; sleep 5; /[A-Z][a-z]/; end # words = %w[abc def xyz] # start = Time.now # words.each {|word| word.match(/\A[#{letters}]+\z/) } # Time.now - start # => 15.0174892 # # With modifier `o`: # # start = Time.now # words.each {|word| word.match(/\A[#{letters}]+\z/o) } # Time.now - start # => 5.0010866 # # Note that if the literal regexp does not have interpolations, the `o` behavior # is the default. # # ## Encodings # # By default, a regexp with only US-ASCII characters has US-ASCII encoding: # # re = /foo/ # re.source.encoding # => #<Encoding:US-ASCII> # re.encoding # => #<Encoding:US-ASCII> # # A regular expression containing non-US-ASCII characters is assumed to use the # source encoding. This can be overridden with one of the following modifiers. # # * `/*pat*/n`: US-ASCII if only containing US-ASCII characters, otherwise # ASCII-8BIT: # # /foo/n.encoding # => #<Encoding:US-ASCII> # /foo\xff/n.encoding # => #<Encoding:ASCII-8BIT> # /foo\x7f/n.encoding # => #<Encoding:US-ASCII> # # * `/*pat*/u`: UTF-8 # # /foo/u.encoding # => #<Encoding:UTF-8> # # * `/*pat*/e`: EUC-JP # # /foo/e.encoding # => #<Encoding:EUC-JP> # # * `/*pat*/s`: Windows-31J # # /foo/s.encoding # => #<Encoding:Windows-31J> # # # A regexp can be matched against a target string when either: # # * They have the same encoding. # * The regexp's encoding is a fixed encoding and the string contains only # ASCII characters. Method Regexp#fixed_encoding? returns whether the regexp # has a *fixed* encoding. # # # If a match between incompatible encodings is attempted, an # `Encoding::CompatibilityError` exception is raised. # # Example: # # re = eval("# encoding: ISO-8859-1\n/foo\\xff?/") # re.encoding # => #<Encoding:ISO-8859-1> # re =~ "foo".encode("UTF-8") # => 0 # re =~ "foo\u0100" # Raises Encoding::CompatibilityError # # The encoding may be explicitly fixed by including Regexp::FIXEDENCODING in the # second argument for Regexp.new: # # # Regexp with encoding ISO-8859-1. # re = Regexp.new("a".force_encoding('iso-8859-1'), Regexp::FIXEDENCODING) # re.encoding # => #<Encoding:ISO-8859-1> # # Target string with encoding UTF-8.
# s = "a\u3042" # s.encoding # => #<Encoding:UTF-8> # re.match(s) # Raises Encoding::CompatibilityError. # # ## Timeouts # # When either a regexp source or a target string comes from untrusted input, # malicious values could become a denial-of-service attack; to prevent such an # attack, it is wise to set a timeout. # # Regexp has two timeout values: # # * A class default timeout, used for a regexp whose instance timeout is # `nil`; this default is initially `nil`, and may be set by method # Regexp.timeout=: # # Regexp.timeout # => nil # Regexp.timeout = 3.0 # Regexp.timeout # => 3.0 # # * An instance timeout, which defaults to `nil` and may be set in Regexp.new: # # re = Regexp.new('foo', timeout: 5.0) # re.timeout # => 5.0 # # # When regexp.timeout is `nil`, the timeout "falls through" to Regexp.timeout; # when regexp.timeout is non-`nil`, that value controls timing out: # # | regexp.timeout Value | Regexp.timeout Value | Result | # |----------------------|----------------------|-----------------------------| # | nil | nil | Never times out. | # | nil | Float | Times out in Float seconds. | # | Float | Any | Times out in Float seconds. | # # ## Optimization # # For certain values of the pattern and target string, matching time can grow # polynomially or exponentially in relation to the input size; the potential # vulnerability arising from this is the [regular expression # denial-of-service](https://en.wikipedia.org/wiki/ReDoS) (ReDoS) attack. # # Regexp matching can apply an optimization to prevent ReDoS attacks. When the # optimization is applied, matching time increases linearly (not polynomially or # exponentially) in relation to the input size, and a ReDoS attack is not # possible. # # This optimization is applied if the pattern meets these criteria: # # * No backreferences. # * No subexpression calls. # * No nested lookaround anchors or atomic groups. # * No nested quantifiers with counting (i.e.
no nested `{n}`, `{min,}`, # `{,max}`, or `{min,max}` style quantifiers) # # # You can use method Regexp.linear_time? to determine whether a pattern meets # these criteria: # # Regexp.linear_time?(/a*/) # => true # Regexp.linear_time?('a*') # => true # Regexp.linear_time?(/(a*)\1/) # => false # # However, an untrusted source may not be safe even if the method returns # `true`, because the optimization uses memoization (which may invoke large # memory consumption). # # ## References # # Read (online PDF books): # # * [Mastering Regular # Expressions](https://ia902508.us.archive.org/10/items/allitebooks-02/Maste # ring%20Regular%20Expressions%2C%203rd%20Edition.pdf) by Jeffrey E.F. # Friedl. # * [Regular Expressions # Cookbook](https://doc.lagout.org/programmation/Regular%20Expressions/Regul # ar%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Program # ming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012- # 09-06%5D.pdf) by Jan Goyvaerts & Steven Levithan. # # # Explore, test (interactive online editor): # # * [Rubular](https://rubular.com/). # class Regexp # Represents an object's ability to be converted to a `Regexp`. # # This is only used in `Regexp.try_convert` and `Regexp.union` within the standard library. interface _ToRegexp # Converts `self` to a `Regexp`. def to_regexp: () -> Regexp end class TimeoutError < RegexpError end # # see Regexp#options and Regexp.new # EXTENDED: Integer # # see Regexp#options and Regexp.new # FIXEDENCODING: Integer # # see Regexp#options and Regexp.new # IGNORECASE: Integer # # see Regexp#options and Regexp.new # MULTILINE: Integer # # see Regexp#options and Regexp.new # NOENCODING: Integer # # Alias for Regexp.new # alias self.compile self.new # # Returns a new string that escapes any characters that have special meaning in # a regular expression: # # s = Regexp.escape('\*?{}.') # => "\\\\\\*\\?\\{\\}\\."
# # For any string `s`, this call returns a MatchData object: # # r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./ # r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\."> # def self.escape: (interned str) -> String # # With no argument, returns the value of `$~`, which is the result of the most # recent pattern match (see [Regexp global # variables](rdoc-ref:Regexp@Global+Variables)): # # /c(.)t/ =~ 'cat' # => 0 # Regexp.last_match # => #<MatchData "cat" 1:"a"> # /a/ =~ 'foo' # => nil # Regexp.last_match # => nil # # With non-negative integer argument `n`, returns the _n_th field in the # matchdata, if any, or nil if none: # # /c(.)t/ =~ 'cat' # => 0 # Regexp.last_match(0) # => "cat" # Regexp.last_match(1) # => "a" # Regexp.last_match(2) # => nil # # With negative integer argument `n`, counts backwards from the last field: # # Regexp.last_match(-1) # => "a" # # With string or symbol argument `name`, returns the string value for the named # capture, if any: # # /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ 'var = val' # Regexp.last_match # => #<MatchData "var = val" lhs:"var" rhs:"val"> # Regexp.last_match(:lhs) # => "var" # Regexp.last_match('rhs') # => "val" # Regexp.last_match('foo') # Raises IndexError. # def self.last_match: () -> MatchData? | (MatchData::capture capture) -> String? # # Returns `true` if matching against `re` can be done in linear time to the # input string. # # Regexp.linear_time?(/re/) # => true # # Note that this is a property of the ruby interpreter, not of the argument # regular expression. Identical regexp can or cannot run in linear time # depending on your ruby binary. Neither forward nor backward compatibility is # guaranteed about the return value of this method. Our current algorithm is # (*1) but this is subject to change in the future. Alternative implementations # can also behave differently. They might always return false for everything.
# # (*1): https://doi.org/10.1109/SP40001.2021.00032 # def self.linear_time?: (Regexp regex, ?nil, ?timeout: untyped) -> bool | (string regex, ?int | string | bool | nil options, ?timeout: untyped) -> bool # # Returns a new string that escapes any characters that have special meaning in # a regular expression: # # s = Regexp.escape('\*?{}.') # => "\\\\\\*\\?\\{\\}\\." # # For any string `s`, this call returns a MatchData object: # # r = Regexp.new(Regexp.escape(s)) # => /\\\\\\\*\\\?\\\{\\\}\\\./ # r.match(s) # => #<MatchData "\\\\\\*\\?\\{\\}\\."> # alias self.quote self.escape # # Returns `object` if it is a regexp: # # Regexp.try_convert(/re/) # => /re/ # # Otherwise if `object` responds to `:to_regexp`, calls `object.to_regexp` and # returns the result. # # Returns `nil` if `object` does not respond to `:to_regexp`. # # Regexp.try_convert('re') # => nil # # Raises an exception unless `object.to_regexp` returns a regexp. # def self.try_convert: (Regexp | _ToRegexp regexp_like) -> Regexp | (untyped other) -> Regexp? # # It returns the current default timeout interval for Regexp matching in seconds. # `nil` means no default timeout configuration. # def self.timeout: () -> Float? # # It sets the default timeout interval for Regexp matching in seconds. `nil` # means no default timeout configuration. This configuration is process-global. # If you want to set a timeout for each Regexp, use `timeout` keyword for # `Regexp.new`.
# # Regexp.timeout = 1 # /^a*b?a*$/ =~ "a" * 100000 + "x" #=> regexp match timeout (Regexp::TimeoutError) # def self.timeout=: [T < _ToF] (T timeout) -> T # # Returns a new regexp that is the union of the given patterns: # # r = Regexp.union(%w[cat dog]) # => /cat|dog/ # r.match('cat') # => #<MatchData "cat"> # r.match('dog') # => #<MatchData "dog"> # r.match('cog') # => nil # # For each pattern that is a string, `Regexp.new(Regexp.escape(pattern))` is used: # # Regexp.union('penzance') # => /penzance/ # Regexp.union('a+b*c') # => /a\+b\*c/ # Regexp.union('skiing', 'sledding') # => /skiing|sledding/ # Regexp.union(['skiing', 'sledding']) # => /skiing|sledding/ # # For each pattern that is a regexp, it is used as is, including its flags: # # Regexp.union(/foo/i, /bar/m, /baz/x) # # => /(?i-mx:foo)|(?m-ix:bar)|(?x-mi:baz)/ # Regexp.union([/foo/i, /bar/m, /baz/x]) # # => /(?i-mx:foo)|(?m-ix:bar)|(?x-mi:baz)/ # # With no arguments, returns `/(?!)/`: # # Regexp.union # => /(?!)/ # # If any regexp pattern contains captures, the behavior is unspecified.
# def self.union: (*Regexp | _ToRegexp | string patterns) -> Regexp | (array[Regexp | _ToRegexp | string] patterns) -> Regexp | (Symbol | [Symbol] symbol_pattern) -> Regexp # # With argument `string` given, returns a new regexp with the given string and # options: # # r = Regexp.new('foo') # => /foo/ # r.source # => "foo" # r.options # => 0 # # Optional argument `options` is one of the following: # # * A String of options: # # Regexp.new('foo', 'i') # => /foo/i # Regexp.new('foo', 'im') # => /foo/im # # * The bit-wise OR of one or more of the constants Regexp::EXTENDED, # Regexp::IGNORECASE, Regexp::MULTILINE, and Regexp::NOENCODING: # # Regexp.new('foo', Regexp::IGNORECASE) # => /foo/i # Regexp.new('foo', Regexp::EXTENDED) # => /foo/x # Regexp.new('foo', Regexp::MULTILINE) # => /foo/m # Regexp.new('foo', Regexp::NOENCODING) # => /foo/n # flags = Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE # Regexp.new('foo', flags) # => /foo/mix # # * `nil` or `false`, which is ignored. # * Any other truthy value, in which case the regexp will be case-insensitive. # # # If optional keyword argument `timeout` is given, its float value overrides the # timeout interval for the class, Regexp.timeout. If `nil` is passed as # `timeout`, it uses the timeout interval for the class, Regexp.timeout. # # With argument `regexp` given, returns a new regexp. The source, options, and # timeout are the same as `regexp`. `options` and `n_flag` arguments are # ineffective. The timeout can be overridden by `timeout` keyword. # # options = Regexp::MULTILINE # r = Regexp.new('foo', options, timeout: 1.1) # => /foo/m # r2 = Regexp.new(r) # => /foo/m # r2.timeout # => 1.1 # r3 = Regexp.new(r, timeout: 3.14) # => /foo/m # r3.timeout # => 3.14 # def initialize: (Regexp regexp, ?timeout: _ToF?) -> void | (string pattern, ?int | string | bool | nil options, ?timeout: _ToF?)
-> void def initialize_copy: (self object) -> self # # Returns `true` if `object` is another Regexp whose pattern, flags, and # encoding are the same as `self`, `false` otherwise: # # /foo/ == Regexp.new('foo') # => true # /foo/ == /foo/i # => false # /foo/ == Regexp.new('food') # => false # /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false # def ==: (untyped other) -> bool # # Returns `true` if `self` finds a match in `string`: # # /^[a-z]*$/ === 'HELLO' # => false # /^[A-Z]*$/ === 'HELLO' # => true # # This method is called in case statements: # # s = 'HELLO' # case s # when /\A[a-z]*\z/; print "Lower case\n" # when /\A[A-Z]*\z/; print "Upper case\n" # else print "Mixed case\n" # end # => "Upper case" # def ===: (untyped other) -> bool # # Returns the integer index (in characters) of the first match for `self` and # `string`, or `nil` if none; also sets the [Regexp global # variables](rdoc-ref:Regexp@Global+Variables): # # /at/ =~ 'input data' # => 7 # $~ # => #<MatchData "at"> # /ax/ =~ 'input data' # => nil # $~ # => nil # # Assigns named captures to local variables of the same names if and only if # `self`: # # * Is a regexp literal; see [Regexp # Literals](rdoc-ref:syntax/literals.rdoc@Regexp+Literals). # * Does not contain interpolations; see [Regexp # interpolation](rdoc-ref:Regexp@Interpolation+Mode). # * Is at the left of the expression.
# # # Example: # # /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ ' x = y ' # p lhs # => "x" # p rhs # => "y" # # Assigns `nil` if not matched: # # /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ ' x = ' # p lhs # => nil # p rhs # => nil # # Does not make local variable assignments if `self` is not a regexp literal: # # r = /(?<foo>\w+)\s*=\s*(?<bar>\w+)/ # r =~ ' x = y ' # p foo # Undefined local variable # p bar # Undefined local variable # # The assignment does not occur if the regexp is not at the left: # # ' x = y ' =~ /(?<foo>\w+)\s*=\s*(?<bar>\w+)/ # p foo, bar # Undefined local variables # # A regexp interpolation, `#{}`, also disables the assignment: # # r = /(?<foo>\w+)/ # /(?<foo>\w+)\s*=\s*#{r}/ =~ 'x = y' # p foo # Undefined local variable # def =~: (interned? string) -> Integer? | (nil) -> nil # # Returns `true` if the case-insensitivity flag in `self` is set, `false` # otherwise: # # /a/.casefold? # => false # /a/i.casefold? # => true # /(?i:a)/.casefold? # => false # def casefold?: () -> bool # # Returns the Encoding object that represents the encoding of `self`. # def encoding: () -> Encoding # # Returns `true` if `object` is another Regexp whose pattern, flags, and # encoding are the same as `self`, `false` otherwise: # # /foo/ == Regexp.new('foo') # => true # /foo/ == /foo/i # => false # /foo/ == Regexp.new('food') # => false # /foo/ == Regexp.new("abc".force_encoding("euc-jp")) # => false # alias eql? == # # Returns `false` if `self` is applicable to a string with any ASCII-compatible # encoding; otherwise returns `true`: # # r = /a/ # => /a/ # r.fixed_encoding? # => false # r.match?("\u{6666} a") # => true # r.match?("\xa1\xa2 a".force_encoding("euc-jp")) # => true # r.match?("abc".force_encoding("euc-jp")) # => true # # r = /a/u # => /a/ # r.fixed_encoding? # => true # r.match?("\u{6666} a") # => true # r.match?("\xa1\xa2".force_encoding("euc-jp")) # Raises exception. # r.match?("abc".force_encoding("euc-jp")) # => true # # r = /\u{6666}/ # => /\u{6666}/ # r.fixed_encoding?
# => true # r.encoding # => #<Encoding:UTF-8> # r.match?("\u{6666} a") # => true # r.match?("\xa1\xa2".force_encoding("euc-jp")) # Raises exception. # r.match?("abc".force_encoding("euc-jp")) # => false # def fixed_encoding?: () -> bool # # Returns the integer hash value for `self`. # # Related: Object#hash. # def hash: () -> Integer # # Returns a nicely-formatted string representation of `self`: # # /ab+c/ix.inspect # => "/ab+c/ix" # # Related: Regexp#to_s. # def inspect: () -> String # # With no block given, returns the MatchData object that describes the match, if # any, or `nil` if none; the search begins at the given character `offset` in # `string`: # # /abra/.match('abracadabra') # => #<MatchData "abra"> # /abra/.match('abracadabra', 4) # => #<MatchData "abra"> # /abra/.match('abracadabra', 8) # => nil # /abra/.match('abracadabra', 800) # => nil # # string = "\u{5d0 5d1 5e8 5d0}cadabra" # /abra/.match(string, 7) #=> #<MatchData "abra"> # /abra/.match(string, 8) #=> nil # /abra/.match(string.b, 8) #=> #<MatchData "abra"> # # With a block given, calls the block if and only if a match is found; returns # the block's value: # # /abra/.match('abracadabra') {|matchdata| p matchdata } # # => #<MatchData "abra"> # /abra/.match('abracadabra', 4) {|matchdata| p matchdata } # # => #<MatchData "abra"> # /abra/.match('abracadabra', 8) {|matchdata| p matchdata } # # => nil # /abra/.match('abracadabra', 8) {|matchdata| fail 'Cannot happen' } # # => nil # # Output (from the first two blocks above): # # #<MatchData "abra"> # #<MatchData "abra"> # # /(.)(.)(.)/.match("abc")[2] # => "b" # /(.)(.)/.match("abc", 1)[2] # => "c" # def match: (interned? str, ?int offset) -> MatchData? | [T] (interned? str, ?int offset) { (MatchData matchdata) -> T } -> T? | (nil, ?int offset) ?{ (MatchData matchdata) -> void } -> nil # # Returns `true` or `false` to indicate whether the regexp is matched or not # without updating $~ and other related variables. If the second parameter is # present, it specifies the position in the string to begin the search.
# # /R.../.match?("Ruby") # => true # /R.../.match?("Ruby", 1) # => false # /P.../.match?("Ruby") # => false # $& # => nil # def match?: (interned str, ?int offset) -> bool | (nil, ?int offset) -> false # # Returns a hash representing named captures of `self` (see [Named # Captures](rdoc-ref:Regexp@Named+Captures)): # # * Each key is the name of a named capture. # * Each value is an array of integer indexes for that named capture. # # # Examples: # # /(?<foo>.)(?<bar>.)/.named_captures # => {"foo"=>[1], "bar"=>[2]} # /(?<foo>.)(?<foo>.)/.named_captures # => {"foo"=>[1, 2]} # /(.)(.)/.named_captures # => {} # def named_captures: () -> Hash[String, Array[Integer]] # # Returns an array of names of captures (see [Named # Captures](rdoc-ref:Regexp@Named+Captures)): # # /(?<foo>.)(?<bar>.)(?<baz>.)/.names # => ["foo", "bar", "baz"] # /(?<foo>.)(?<foo>.)/.names # => ["foo"] # /(.)(.)/.names # => [] # def names: () -> Array[String] # # Returns an integer whose bits show the options set in `self`. # # The option bits are: # # Regexp::IGNORECASE # => 1 # Regexp::EXTENDED # => 2 # Regexp::MULTILINE # => 4 # # Examples: # # /foo/.options # => 0 # /foo/i.options # => 1 # /foo/x.options # => 2 # /foo/m.options # => 4 # /foo/mix.options # => 7 # # Note that additional bits may be set in the returned integer; these are # maintained internally in `self`, are ignored if passed to Regexp.new, and may # be ignored by the caller: # # Returns the set of bits corresponding to the options used when creating this # regexp (see Regexp::new for details). Note that additional bits may be set in # the returned options: these are used internally by the regular expression # code.
These extra bits are ignored if the options are passed to Regexp::new: # # r = /\xa1\xa2/e # => /\xa1\xa2/ # r.source # => "\\xa1\\xa2" # r.options # => 16 # Regexp.new(r.source, r.options) # => /\xa1\xa2/ # def options: () -> Integer # # Returns the original string of `self`: # # /ab+c/ix.source # => "ab+c" # # Regexp escape sequences are retained: # # /\x20\+/.source # => "\\x20\\+" # # Lexer escape characters are not retained: # # /\//.source # => "/" # def source: () -> String # # Returns a string showing the options and string of `self`: # # r0 = /ab+c/ix # s0 = r0.to_s # => "(?ix-m:ab+c)" # # The returned string may be used as an argument to Regexp.new, or as # interpolated text for a [Regexp # interpolation](rdoc-ref:Regexp@Interpolation+Mode): # # r1 = Regexp.new(s0) # => /(?ix-m:ab+c)/ # r2 = /#{s0}/ # => /(?ix-m:ab+c)/ # # Note that `r1` and `r2` are not equal to `r0` because their original strings # are different: # # r0 == r1 # => false # r0.source # => "ab+c" # r1.source # => "(?ix-m:ab+c)" # # Related: Regexp#inspect. # def to_s: () -> String # # It returns the timeout interval for Regexp matching in seconds. `nil` means no # default timeout configuration. # # This configuration is per-object. The global configuration set by # Regexp.timeout= is ignored if per-object configuration is set. # # re = Regexp.new("^a*b?a*$", timeout: 1) # re.timeout #=> 1.0 # re =~ "a" * 100000 + "x" #=> regexp match timeout (Regexp::TimeoutError) # %a{pure} def timeout: () -> Float? # # Equivalent to `rxp =~ $_`: # # $_ = "input data" # ~ /at/ # => 7 # def ~: () -> Integer? end