examples/general/SRL/lib/regex/character.rb in rley-0.6.00 vs examples/general/SRL/lib/regex/character.rb in rley-0.6.01

- old
+ new

@@ -1,199 +1,204 @@ # File: character.rb -require_relative 'atomic_expression' # Access the superclass +require_relative 'atomic_expression' # Access the superclass module Regex # This module is used as a namespace + # A regular expression that matches a specific character in a given character set + class Character < AtomicExpression + # Constant with all special 2-characters escape sequences + DigramSequences = { + "\\a" => 0x7, # alarm + "\\n" => 0xA, # newline + "\\r" => 0xD, # carriage return + "\\t" => 0x9, # tab + "\\e" => 0x1B, # escape + "\\f" => 0xC, # form feed + "\\v" => 0xB, # vertical feed + # Single octal digit literals + "\\0" => 0, + "\\1" => 1, + "\\2" => 2, + "\\3" => 3, + "\\4" => 4, + "\\5" => 5, + "\\6" => 6, + "\\7" => 7 + }.freeze -# A regular expression that matches a specific character in a given character set -class Character < AtomicExpression - # Constant with all special 2-characters escape sequences - DigramSequences = { - "\\a" => 0x7, # alarm - "\\n" => 0xA, # newline - "\\r" => 0xD, # carriage return - "\\t" => 0x9, # tab - "\\e" => 0x1B, # escape - "\\f" => 0xC, # form feed - "\\v" => 0xB, # vertical feed - # Single octal digit literals - "\\0" => 0, - "\\1" => 1, - "\\2" => 2, - "\\3" => 3, - "\\4" => 4, - "\\5" => 5, - "\\6" => 6, - "\\7" => 7 - } - - MetaChars = '\^$+?.' - - # The integer value that uniquely identifies the character. - attr_reader(:codepoint) - - # The initial text representation of the character (if any). - attr_reader(:lexeme) - - # Constructor. - # [aValue] Initialize the character with a either a String literal or a codepoint value. - # Examples: - # Initializing with codepoint value... - # RegAn::Character.new(0x3a3) # Represents: Σ (Unicode GREEK CAPITAL LETTER SIGMA) - # RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex) - # - # Initializing with a single character string - # RegAn::Character.new(?\u03a3) # Also represents: Σ - # RegAn::Character.new('Σ') # Obviously, represents a Σ - # - # Initializing with an escape sequence string - # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), - # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC) - # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex) - # Any other escaped character will be treated as a literal character - # RegAn::Character.new('\n') # Represents a newline - # RegAn::Character.new('\u03a3') # Represents a Σ - def initialize(aValue) + MetaChars = '\^$+?.'.freeze - case aValue - when String - if aValue.size == 1 - # Literal single character case... - @codepoint = self.class.char2codepoint(aValue) - else - # Should be an escape sequence... - @codepoint = self.class.esc2codepoint(aValue) - end - @lexeme = aValue - - when Integer - @codepoint = aValue - else - raise StandardError, "Cannot initialize a Character with a '#{aValue}'." - end - end - -public - # Convertion method that returns a character given a codepoint (integer) value. - # Example: - # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (The Unicode GREEK CAPITAL LETTER SIGMA) - def self.codepoint2char(aCodepoint) - return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256 - end - - # Convertion method that returns the codepoint for the given single character. - # Example: - # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3 - def self.char2codepoint(aChar) - return aChar.ord() - end - - # Convertion method that returns the codepoint for the given escape sequence (a String). - # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), - # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB) - # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex) - # Any other escaped character will be treated as a literal character - # Example: - # RegAn::Character::esc2codepoint('\n') # Returns: 0xd - def self.esc2codepoint(anEscapeSequence) - raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." unless anEscapeSequence[0] == "\\" - result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence) - - return result - end - - # Return the character as a String object - def char() - self.class.codepoint2char(@codepoint) - end - - # Returns true iff this Character and parameter 'another' represent the same character. - # [another] any Object. The way the equality is tested depends on the another's class - # Example: - # newOne = Character.new(?\u03a3) - # newOne == newOne # true. Identity - # newOne == Character.new(?\u03a3) # true. Both have same codepoint - # newOne == ?\u03a3 # true. The single character String match exactly the char attribute. - # newOne == 0x03a3 # true. The Integer is compared to the codepoint value. - # Will test equality with any Object that knows the to_s method - def ==(another) - result = case another - when Character - self.to_str == another.to_str - - when Integer - self.codepoint == another - - when String - (another.size > 1) ? false : self.to_str == another - - else - # Unknown type: try with a convertion - self == another.to_s() # Recursive call - end - - return result - end - - # Return a plain English description of the character - def explain() - return "the character '#{to_str()}'" - end - - protected - - # Conversion method re-definition. - # Purpose: Return the String representation of the expression. - # If the Character was initially from a text (the lexeme), then the lexeme is returned back. - # Otherwise the character corresponding to the codepoint is returned. - def text_repr() - if lexeme.nil? - result = char() - else - result = lexeme.dup() - end - - return result - end - -private - # Convertion method that returns a codepoint for the given two characters (digram) escape sequence. - # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), - # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB) - # Any other escape sequence will return the codepoint of the escaped character. - # [aDigram] A sequence of two characters that starts with a backslash. - def self.digram2codepoint(aDigram) - # Check that the digram is a special escape sequence - result = DigramSequences.fetch(aDigram, nil) - - # If it not a special sequence, then escaped character is considered literally (the backslash is 'dummy') - result = char2codepoint(aDigram[-1]) if result.nil? - return result - end + # The integer value that uniquely identifies the character. + attr_reader(:codepoint) - # Convertion method that returns a codepoint for the given complex escape sequence. - # [anEscapeSequence] A String with the format: - # \uXXXX where XXXX is a 4 hex digits integer value, - # \u{X...} X 1 or more hex digits - # \ooo (1..3 octal digits literal) - # \xXX (1..2 hex digits literal) - def self.esc_number2codepoint(anEscapeSequence) - # Next line requires Ruby >= 1.9 - unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence - raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." - else - #shorterSeq = anEscapeSequence[1..-1] # Remove the backslash - - # Octal literal case? - return octal.oct() if octal # shorterSeq =~ /[0-7]{1,3}/ - - # Extract the hexadecimal number - hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1') - return hexliteral.hex() - end - end + # The initial text representation of the character (if any). + attr_reader(:lexeme) -end # class + # Constructor. + # [aValue] Initialize the character with a either a String literal or a + # codepoint value. + # Examples: + # Initializing with codepoint value... + # RegAn::Character.new(0x3a3) # Represents: Σ + # (Unicode GREEK CAPITAL LETTER SIGMA) + # RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex) + # + # Initializing with a single character string + # RegAn::Character.new(?\u03a3) # Also represents: Σ + # RegAn::Character.new('Σ') # Obviously, represents a Σ + # + # Initializing with an escape sequence string + # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), + # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), + # \f (form feed, 0xC) + # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) + # \xXX (hex) + # Any other escaped character will be treated as a literal character + # RegAn::Character.new('\n') # Represents a newline + # RegAn::Character.new('\u03a3') # Represents a Σ + def initialize(aValue) + case aValue + when String + if aValue.size == 1 + # Literal single character case... + @codepoint = self.class.char2codepoint(aValue) + else + # Should be an escape sequence... + @codepoint = self.class.esc2codepoint(aValue) + end + @lexeme = aValue + when Integer + @codepoint = aValue + else + raise StandardError, "Cannot initialize a Character with a '#{aValue}'." + end + end + + # Convertion method that returns a character given a codepoint (integer) value. + # Example: + # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ ( + # The Unicode GREEK CAPITAL LETTER SIGMA) + def self.codepoint2char(aCodepoint) + return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256 + end + + # Convertion method that returns the codepoint for the given single character. + # Example: + # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3 + def self.char2codepoint(aChar) + return aChar.ord + end + + # Convertion method that returns the codepoint for the given escape + # sequence (a String). + # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), + # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, + # 0xC), \v (vertical feed, 0xB) + # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) + # \xXX (hex) + # Any other escaped character will be treated as a literal character + # Example: + # RegAn::Character::esc2codepoint('\n') # Returns: 0xd + def self.esc2codepoint(anEscapeSequence) + msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." + raise StandardError, msg unless anEscapeSequence[0] == "\\" + result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence) + + return result + end + + # Return the character as a String object + def char() + self.class.codepoint2char(@codepoint) + end + + # Returns true iff this Character and parameter 'another' represent the same character. + # [another] any Object. The way the equality is tested depends on the another's class + # Example: + # newOne = Character.new(?\u03a3) + # newOne == newOne # true. Identity + # newOne == Character.new(?\u03a3) # true. Both have same codepoint + # newOne == ?\u03a3 # true. The single character String match exactly the char attribute. + # newOne == 0x03a3 # true. The Integer is compared to the codepoint value. + # Will test equality with any Object that knows the to_s method + def ==(other) + result = case other + when Character + self.to_str == other.to_str + + when Integer + self.codepoint == other + + when String + other.size > 1 ? false : to_str == other + + else + # Unknown type: try with a convertion + self == other.to_s # Recursive call + end + + return result + end + + # Return a plain English description of the character + def explain() + return "the character '#{to_str}'" + end + + protected + + # Conversion method re-definition. + # Purpose: Return the String representation of the expression. + # If the Character was initially from a text (the lexeme), then the lexeme + # is returned back. + # Otherwise the character corresponding to the codepoint is returned. + def text_repr() + return char if lexeme.nil? + return lexeme.dup + end + + # Convertion method that returns a codepoint for the given two characters + # (digram) escape sequence. + # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA), + # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), + # \f (form feed, 0xC), \v (vertical feed, 0xB) + # Any other escape sequence will return the codepoint of the escaped + # character. + # [aDigram] A sequence of two characters that starts with a backslash. + def self.digram2codepoint(aDigram) + # Check that the digram is a special escape sequence + result = DigramSequences.fetch(aDigram, nil) + + # If it not a special sequence, then escaped character is + # considered literally (the backslash is 'dummy') + result = char2codepoint(aDigram[-1]) if result.nil? + return result + end + + private_class_method :digram2codepoint + + # Convertion method that returns a codepoint for the given complex + # escape sequence. + # [anEscapeSequence] A String with the format: + # \uXXXX where XXXX is a 4 hex digits integer value, + # \u{X...} X 1 or more hex digits + # \ooo (1..3 octal digits literal) + # \xXX (1..2 hex digits literal) + def self.esc_number2codepoint(anEscapeSequence) + unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence + raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." + else + # Octal literal case? + return octal.oct if octal # shorterSeq =~ /[0-7]{1,3}/ + + # Extract the hexadecimal number + hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1') + return hexliteral.hex + end + end + + private_class_method :esc_number2codepoint + end # class end # module -# End of file \ No newline at end of file +# End of file