examples/general/SRL/lib/regex/character.rb in rley-0.6.00 vs examples/general/SRL/lib/regex/character.rb in rley-0.6.01
- old
+ new
@@ -1,199 +1,204 @@
# File: character.rb
-require_relative 'atomic_expression' # Access the superclass
+require_relative 'atomic_expression' # Access the superclass
module Regex # This module is used as a namespace
+ # A regular expression that matches a specific character in a given character set
+ class Character < AtomicExpression
+ # Constant with all special 2-characters escape sequences.
+ # Each key is a backslash followed by one character; each value is the
+ # codepoint that the sequence stands for.
+ DigramSequences = {
+ "\\a" => 0x7, # alarm
+ "\\n" => 0xA, # newline
+ "\\r" => 0xD, # carriage return
+ "\\t" => 0x9, # tab
+ "\\e" => 0x1B, # escape
+ "\\f" => 0xC, # form feed
+ "\\v" => 0xB, # vertical tab
+ # Single octal digit literals
+ "\\0" => 0,
+ "\\1" => 1,
+ "\\2" => 2,
+ "\\3" => 3,
+ "\\4" => 4,
+ "\\5" => 5,
+ "\\6" => 6,
+ "\\7" => 7
+ }.freeze
-# A regular expression that matches a specific character in a given character set
-class Character < AtomicExpression
- # Constant with all special 2-characters escape sequences
- DigramSequences = {
- "\\a" => 0x7, # alarm
- "\\n" => 0xA, # newline
- "\\r" => 0xD, # carriage return
- "\\t" => 0x9, # tab
- "\\e" => 0x1B, # escape
- "\\f" => 0xC, # form feed
- "\\v" => 0xB, # vertical feed
- # Single octal digit literals
- "\\0" => 0,
- "\\1" => 1,
- "\\2" => 2,
- "\\3" => 3,
- "\\4" => 4,
- "\\5" => 5,
- "\\6" => 6,
- "\\7" => 7
- }
-
- MetaChars = '\^$+?.'
-
- # The integer value that uniquely identifies the character.
- attr_reader(:codepoint)
-
- # The initial text representation of the character (if any).
- attr_reader(:lexeme)
-
- # Constructor.
- # [aValue] Initialize the character with a either a String literal or a codepoint value.
- # Examples:
- # Initializing with codepoint value...
- # RegAn::Character.new(0x3a3) # Represents: Σ (Unicode GREEK CAPITAL LETTER SIGMA)
- # RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
- #
- # Initializing with a single character string
- # RegAn::Character.new(?\u03a3) # Also represents: Σ
- # RegAn::Character.new('Σ') # Obviously, represents a Σ
- #
- # Initializing with an escape sequence string
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC)
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
- # Any other escaped character will be treated as a literal character
- # RegAn::Character.new('\n') # Represents a newline
- # RegAn::Character.new('\u03a3') # Represents a Σ
- def initialize(aValue)
+ # Frozen String holding the six characters: \ ^ $ + ? .
+ # NOTE(review): presumably the regex metacharacters that must be escaped;
+ # the constant is not referenced in the visible part of this file -- confirm.
+ MetaChars = '\^$+?.'.freeze
- case aValue
- when String
- if aValue.size == 1
- # Literal single character case...
- @codepoint = self.class.char2codepoint(aValue)
- else
- # Should be an escape sequence...
- @codepoint = self.class.esc2codepoint(aValue)
- end
- @lexeme = aValue
-
- when Integer
- @codepoint = aValue
- else
- raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
- end
- end
-
-public
- # Convertion method that returns a character given a codepoint (integer) value.
- # Example:
- # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (The Unicode GREEK CAPITAL LETTER SIGMA)
- def self.codepoint2char(aCodepoint)
- return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
- end
-
- # Convertion method that returns the codepoint for the given single character.
- # Example:
- # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
- def self.char2codepoint(aChar)
- return aChar.ord()
- end
-
- # Convertion method that returns the codepoint for the given escape sequence (a String).
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
- # Any other escaped character will be treated as a literal character
- # Example:
- # RegAn::Character::esc2codepoint('\n') # Returns: 0xd
- def self.esc2codepoint(anEscapeSequence)
- raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." unless anEscapeSequence[0] == "\\"
- result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
-
- return result
- end
-
- # Return the character as a String object
- def char()
- self.class.codepoint2char(@codepoint)
- end
-
- # Returns true iff this Character and parameter 'another' represent the same character.
- # [another] any Object. The way the equality is tested depends on the another's class
- # Example:
- # newOne = Character.new(?\u03a3)
- # newOne == newOne # true. Identity
- # newOne == Character.new(?\u03a3) # true. Both have same codepoint
- # newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
- # newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
- # Will test equality with any Object that knows the to_s method
- def ==(another)
- result = case another
- when Character
- self.to_str == another.to_str
-
- when Integer
- self.codepoint == another
-
- when String
- (another.size > 1) ? false : self.to_str == another
-
- else
- # Unknown type: try with a convertion
- self == another.to_s() # Recursive call
- end
-
- return result
- end
-
- # Return a plain English description of the character
- def explain()
- return "the character '#{to_str()}'"
- end
-
- protected
-
- # Conversion method re-definition.
- # Purpose: Return the String representation of the expression.
- # If the Character was initially from a text (the lexeme), then the lexeme is returned back.
- # Otherwise the character corresponding to the codepoint is returned.
- def text_repr()
- if lexeme.nil?
- result = char()
- else
- result = lexeme.dup()
- end
-
- return result
- end
-
-private
- # Convertion method that returns a codepoint for the given two characters (digram) escape sequence.
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
- # Any other escape sequence will return the codepoint of the escaped character.
- # [aDigram] A sequence of two characters that starts with a backslash.
- def self.digram2codepoint(aDigram)
- # Check that the digram is a special escape sequence
- result = DigramSequences.fetch(aDigram, nil)
-
- # If it not a special sequence, then escaped character is considered literally (the backslash is 'dummy')
- result = char2codepoint(aDigram[-1]) if result.nil?
- return result
- end
+ # The integer value that uniquely identifies the character.
+ # Set by the constructor: taken as-is from an Integer argument, or
+ # computed from a String argument.
+ attr_reader(:codepoint)
- # Convertion method that returns a codepoint for the given complex escape sequence.
- # [anEscapeSequence] A String with the format:
- # \uXXXX where XXXX is a 4 hex digits integer value,
- # \u{X...} X 1 or more hex digits
- # \ooo (1..3 octal digits literal)
- # \xXX (1..2 hex digits literal)
- def self.esc_number2codepoint(anEscapeSequence)
- # Next line requires Ruby >= 1.9
- unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
- raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
- else
- #shorterSeq = anEscapeSequence[1..-1] # Remove the backslash
-
- # Octal literal case?
- return octal.oct() if octal # shorterSeq =~ /[0-7]{1,3}/
-
- # Extract the hexadecimal number
- hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
- return hexliteral.hex()
- end
- end
+ # The initial text representation of the character (if any).
+ # Only assigned when the constructor receives a String; nil otherwise.
+ attr_reader(:lexeme)
-end # class
+ # Constructor.
+ # [aValue] Either a String (one literal character, or an escape sequence
+ # that begins with a backslash) or an Integer codepoint.
+ # A String argument is also kept as the lexeme.
+ # Raises a StandardError for any other kind of argument.
+ # Examples:
+ # Initializing with a codepoint value...
+ # Regex::Character.new(0x3a3) # Represents: Σ
+ # (Unicode GREEK CAPITAL LETTER SIGMA)
+ # Regex::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
+ #
+ # Initializing with a single character string
+ # Regex::Character.new(?\u03a3) # Also represents: Σ
+ # Regex::Character.new('Σ') # Obviously, represents a Σ
+ #
+ # Initializing with an escape sequence string
+ # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+ # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
+ # \f (form feed, 0xC), \v (vertical tab, 0xB)
+ # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
+ # \xXX (hex)
+ # Any other escaped character will be treated as a literal character
+ # Regex::Character.new('\n') # Represents a newline
+ # Regex::Character.new('\u03a3') # Represents a Σ
+ def initialize(aValue)
+   case aValue
+   when Integer
+     # Already a codepoint: no lexeme is recorded
+     @codepoint = aValue
+   when String
+     converter = self.class
+     # One character => literal; anything else should be an escape sequence
+     @codepoint = aValue.size == 1 ? converter.char2codepoint(aValue) : converter.esc2codepoint(aValue)
+     @lexeme = aValue
+   else
+     raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
+   end
+ end
+
+ # Conversion method that returns a character given a codepoint (integer) value.
+ # [aCodepoint] The Integer codepoint to convert.
+ # Returns a one-character String.
+ # Example:
+ # Regex::Character.codepoint2char(0x3a3) # Returns: Σ
+ # (The Unicode GREEK CAPITAL LETTER SIGMA)
+ def self.codepoint2char(aCodepoint)
+   # Remark: Integer#chr without an encoding argument fails above 255
+   [aCodepoint].pack('U')
+ end
+
+ # Conversion method that returns the codepoint for the given single character.
+ # [aChar] A String; the codepoint of its first character is returned.
+ # Example:
+ # Regex::Character.char2codepoint('Σ') # Returns: 0x3a3
+ def self.char2codepoint(aChar)
+   aChar.ord
+ end
+
+ # Conversion method that returns the codepoint for the given escape
+ # sequence (a String).
+ # Two-character sequences are delegated to digram2codepoint, longer ones
+ # to esc_number2codepoint.
+ # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+ # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
+ # 0xC), \v (vertical tab, 0xB)
+ # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
+ # \xXX (hex)
+ # Any other escaped character will be treated as a literal character
+ # [anEscapeSequence] A String that must begin with a backslash.
+ # Raises a StandardError when the leading backslash is missing.
+ # Example:
+ # Regex::Character.esc2codepoint('\n') # Returns: 0xa
+ def self.esc2codepoint(anEscapeSequence)
+   unless anEscapeSequence[0] == "\\"
+     # The backslash is doubled on purpose: inside a double-quoted String
+     # a lone "\)" collapses to ")" and the backslash would vanish.
+     raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\\)."
+   end
+
+   if anEscapeSequence.length == 2
+     digram2codepoint(anEscapeSequence)
+   else
+     esc_number2codepoint(anEscapeSequence)
+   end
+ end
+
+ # Return the character as a String object.
+ # The String is rebuilt from the codepoint, so it is always the character
+ # itself, never an escape sequence.
+ def char
+   self.class.codepoint2char(@codepoint)
+ end
+
+ # Returns true iff this Character and parameter 'other' represent the same character.
+ # [other] any Object. The way the equality is tested depends on other's class:
+ # - Character: both must have the same codepoint.
+ # - Integer: compared to the codepoint value.
+ # - String: must be a single-character String equal to the char attribute.
+ # - any other Object: converted with to_s, then compared as a String.
+ # Example:
+ # newOne = Character.new(?\u03a3)
+ # newOne == newOne # true. Identity
+ # newOne == Character.new(?\u03a3) # true. Both have same codepoint
+ # newOne == Character.new(0x03a3) # true. Both have same codepoint
+ # newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
+ # newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
+ # Will test equality with any Object that knows the to_s method
+ def ==(other)
+   case other
+   when Character
+     # Compare codepoints, not text representations: a Character built
+     # from an escape sequence keeps that sequence as its lexeme, yet it
+     # must equal the same character built from a codepoint or a literal.
+     codepoint == other.codepoint
+   when Integer
+     codepoint == other
+   when String
+     # Only a one-character String can match; compare it to the actual
+     # character rather than to the (possibly escaped) lexeme.
+     other.size == 1 && char == other
+   else
+     # Unknown type: try with a conversion
+     self == other.to_s # Recursive call
+   end
+ end
+
+ # Return a plain English description of the character.
+ # The description embeds the value returned by to_str between single quotes.
+ def explain
+   "the character '#{to_str}'"
+ end
+
+ protected
+
+ # Conversion method re-definition.
+ # Purpose: Return the String representation of the expression.
+ # When the Character was created from a text (the lexeme), a copy of that
+ # lexeme is returned, so callers cannot alter the stored one.
+ # Otherwise the character corresponding to the codepoint is returned.
+ def text_repr
+   lexeme.nil? ? char : lexeme.dup
+ end
+
+ # Conversion method that returns a codepoint for the given two characters
+ # (digram) escape sequence.
+ # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+ # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
+ # \f (form feed, 0xC), \v (vertical tab, 0xB) and the single octal digits
+ # \0 .. \7.
+ # Any other escape sequence will return the codepoint of the escaped
+ # character.
+ # [aDigram] A sequence of two characters that starts with a backslash.
+ def self.digram2codepoint(aDigram)
+   DigramSequences.fetch(aDigram) do
+     # Not a special sequence: the escaped character is taken literally
+     # (the backslash is 'dummy')
+     char2codepoint(aDigram[-1])
+   end
+ end
+
+ private_class_method :digram2codepoint
+
+ # Conversion method that returns a codepoint for the given complex
+ # escape sequence.
+ # [anEscapeSequence] A String with the format:
+ # \uXXXX where XXXX is a hex digits integer value,
+ # \u{X...} X 1 or more hex digits (braces must be balanced)
+ # \ooo (1..3 octal digits literal)
+ # \xXX (hex digits literal)
+ # The whole String must be the escape sequence: embedded or trailing
+ # newlines are rejected.
+ # Raises a StandardError when the sequence has none of these formats.
+ def self.esc_number2codepoint(anEscapeSequence)
+   # \A and \z anchor the whole String; ^ and $ would only anchor a line
+   # and let multi-line input slip through.
+   pattern = /\A\\(?:[uxX](?:\{(?<braced>\h+)\}|(?<bare>\h+))|(?<octal>[0-7]{1,3}))\z/
+   found = pattern.match(anEscapeSequence)
+   raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." unless found
+
+   # Octal literal case?
+   return found[:octal].oct if found[:octal]
+
+   # Otherwise it is a hexadecimal number, with or without braces
+   (found[:braced] || found[:bare]).hex
+ end
+
+ private_class_method :esc_number2codepoint
+ end # class
end # module
-# End of file
\ No newline at end of file
+# End of file