character.rb in rley-0.6.01

- old
+ new
@@ -1,199 +1,204 @@
 # File: character.rb
 
-require_relative 'atomic_expression'	# Access the superclass
+require_relative 'atomic_expression' # Access the superclass
 
 module Regex # This module is used as a namespace
+  # A regular expression that matches a specific character in a given character set
+  class Character < AtomicExpression
+    # Lookup table for the special two-character escape sequences.
+    # Each key is a two-character String (a backslash followed by one
+    # character); each value is the codepoint (Integer) the sequence stands for.
+    # The table is frozen. It is consulted by digram2codepoint.
+    DigramSequences = {
+      "\\a" => 0x7, # alarm (bell)
+      "\\n" => 0xA, # newline (line feed)
+      "\\r" => 0xD, # carriage return
+      "\\t" => 0x9, # (horizontal) tab
+      "\\e" => 0x1B, # escape
+      "\\f" => 0xC, # form feed
+      "\\v" => 0xB, # vertical tab
+      # Single octal digit literals: the value is the digit itself
+      "\\0" => 0,
+      "\\1" => 1,
+      "\\2" => 2,
+      "\\3" => 3,
+      "\\4" => 4,
+      "\\5" => 5, 
+      "\\6" => 6, 
+      "\\7" => 7  
+    }.freeze
 
-# A regular expression that matches a specific character in a given character set
-class Character < AtomicExpression
-	# Constant with all special 2-characters escape sequences
-	DigramSequences = {
-		"\\a" => 0x7, # alarm
-		"\\n" => 0xA, # newline
-		"\\r" => 0xD, # carriage return
-		"\\t" => 0x9, # tab	
-		"\\e" => 0x1B, # escape
-		"\\f" => 0xC, # form feed
-		"\\v" => 0xB, # vertical feed
-		# Single octal digit literals
-		"\\0" => 0,
-		"\\1" => 1,
-		"\\2" => 2,
-		"\\3" => 3,
-		"\\4" => 4,
-		"\\5" => 5,	
-		"\\6" => 6,	
-		"\\7" => 7			
-	}
-  
-  MetaChars = '\^$+?.'
-	
-	# The integer value that uniquely identifies the character. 
-	attr_reader(:codepoint)
-	
-	# The initial text representation of the character (if any).
-	attr_reader(:lexeme)
-	
-	# Constructor.
-	# [aValue] Initialize the character with a either a String literal or a codepoint value.
-	# Examples:
-	# Initializing with codepoint value...
-	# RegAn::Character.new(0x3a3)	# Represents: Σ (Unicode GREEK CAPITAL LETTER SIGMA)
-	# RegAn::Character.new(931)		# Also represents: Σ (931 dec == 3a3 hex)
-	#
-	# Initializing with a single character string
-	# RegAn::Character.new(?\u03a3) # Also represents: Σ
-	# RegAn::Character.new('Σ')		# Obviously, represents a Σ
-	#
-	# Initializing with an escape sequence string
-	# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
-	#	\r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC)
-	#	\uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)	\xXX (hex)
-	# Any other escaped character will be treated as a literal character
-	# RegAn::Character.new('\n')		# Represents a newline
-	# RegAn::Character.new('\u03a3')	# Represents a Σ
-	def initialize(aValue)
+    MetaChars = '\^$+?.'.freeze
 
-		case aValue
-			when String
-				if aValue.size == 1
-					# Literal single character case...
-					@codepoint = self.class.char2codepoint(aValue)
-				else
-					# Should be an escape sequence...
-					@codepoint = self.class.esc2codepoint(aValue)
-				end
-				@lexeme = aValue
-				
-			when Integer
-				@codepoint = aValue
-			else
-				raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
-		end
-	end
-	
-public
-	# Convertion method that returns a character given a codepoint (integer) value.
-	# Example:
-	# RegAn::Character::codepoint2char(0x3a3)	# Returns: Σ (The Unicode GREEK CAPITAL LETTER SIGMA)
-	def self.codepoint2char(aCodepoint)
-		return [aCodepoint].pack('U')	# Remark: chr() fails with codepoints > 256
-	end
-	
-	# Convertion method that returns the codepoint for the given single character.
-	# Example:
-	# RegAn::Character::char2codepoint('Σ')	# Returns: 0x3a3	
-	def self.char2codepoint(aChar)
-		return aChar.ord()		
-	end
-	
-	# Convertion method that returns the codepoint for the given escape sequence (a String).
-	# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
-	#	\r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
-	#	\uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)	\xXX (hex)
-	# Any other escaped character will be treated as a literal character	
-	# Example:
-	# RegAn::Character::esc2codepoint('\n')	# Returns: 0xd	
-	def self.esc2codepoint(anEscapeSequence)
-		raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." unless anEscapeSequence[0] == "\\"
-		result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
-		
-		return result
-	end
-	
-	# Return the character as a String object
-	def char()
-		self.class.codepoint2char(@codepoint)
-	end
-	
-	# Returns true iff this Character and parameter 'another' represent the same character.
-	# [another] any Object. The way the equality is tested depends on the another's class
-	# Example:
-	# newOne = Character.new(?\u03a3)
-	# newOne == newOne	# true. Identity
-	# newOne == Character.new(?\u03a3)	# true. Both have same codepoint
-	# newOne == ?\u03a3	# true. The single character String match exactly the char attribute.
-	# newOne == 0x03a3	# true. The Integer is compared to the codepoint value.
-	# Will test equality with any Object that knows the to_s method
-	def ==(another)
-		result = case another
-			when Character
-				self.to_str == another.to_str
-				
-			when Integer
-				self.codepoint == another
-				
-			when String
-				(another.size > 1) ? false : self.to_str == another
-				
-			else
-				# Unknown type: try with a convertion
-				self == another.to_s()	# Recursive call
-		end
-		
-		return result
-	end
-	
-	# Return a plain English description of the character
-	def explain()
-		return "the character '#{to_str()}'"
-	end
-  
-  protected
-  
-  # Conversion method re-definition.
-	# Purpose: Return the String representation of the expression.
-	# If the Character was initially from a text (the lexeme), then the lexeme is returned back.
-	# Otherwise the character corresponding to the codepoint is returned.
-	def text_repr()
-		if lexeme.nil?
-			result = char()
-		else
-			result = lexeme.dup()
-		end
-		
-		return result
-	end
-	
-private
-	# Convertion method that returns a codepoint for the given two characters (digram) escape sequence.
-	# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
-	#	\r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
-	# Any other escape sequence will return the codepoint of the escaped character.
-	# [aDigram]	A sequence of two characters that starts with a backslash.
-	def self.digram2codepoint(aDigram)
-		# Check that the digram is a special escape sequence
-		result = DigramSequences.fetch(aDigram, nil)
-		
-		# If it not a special sequence, then escaped character is considered literally (the backslash is 'dummy')
-		result = char2codepoint(aDigram[-1]) if result.nil?
-		return result
-	end
+    # The integer value that uniquely identifies the character.
+    attr_reader(:codepoint)
 
-	# Convertion method that returns a codepoint for the given complex escape sequence.	
-	# [anEscapeSequence] A String with the format:
-	# \uXXXX where XXXX is a 4 hex digits integer value,
-	# \u{X...} X 1 or more hex digits
-	# \ooo (1..3 octal digits literal)
-	# \xXX (1..2 hex digits literal)
-	def self.esc_number2codepoint(anEscapeSequence)
-		# Next line requires Ruby >= 1.9
-		unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
-			raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." 
-		else
-			#shorterSeq = anEscapeSequence[1..-1]	# Remove the backslash
-		
-		# Octal literal case?
-			return octal.oct() if octal # shorterSeq =~ /[0-7]{1,3}/
-		
-			# Extract the hexadecimal number
-			hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
-			return hexliteral.hex()
-		end
-	end
+    # The initial text representation of the character (if any).
+    attr_reader(:lexeme)
 
-end # class
+    # Constructor.
+    # [aValue] Initialize the character with a either a String literal or a 
+    # codepoint value.
+    # Examples:
+    # Initializing with codepoint value...
+    # RegAn::Character.new(0x3a3) # Represents: Σ 
+    # (Unicode GREEK CAPITAL LETTER SIGMA)
+    # RegAn::Character.new(931)   # Also represents: Σ (931 dec == 3a3 hex)
+    #
+    # Initializing with a single character string
+    # RegAn::Character.new(?\u03a3) # Also represents: Σ
+    # RegAn::Character.new('Σ')   # Obviously, represents a Σ
+    #
+    # Initializing with an escape sequence string
+    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), 
+    # \f (form feed, 0xC)
+    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
+    # \xXX (hex)
+    # Any other escaped character will be treated as a literal character
+    # RegAn::Character.new('\n')    # Represents a newline
+    # RegAn::Character.new('\u03a3')  # Represents a Σ
+    def initialize(aValue)
+      case aValue
+        when String
+          if aValue.size == 1
+            # Literal single character case...
+            @codepoint = self.class.char2codepoint(aValue)
+          else
+            # Should be an escape sequence...
+            @codepoint = self.class.esc2codepoint(aValue)
+          end
+          @lexeme = aValue
 
+        when Integer
+          @codepoint = aValue
+        else
+          raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
+      end
+    end
+
+    # Convertion method that returns a character given a codepoint (integer) value.
+    # Example:
+    # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (
+    # The Unicode GREEK CAPITAL LETTER SIGMA)
+    def self.codepoint2char(aCodepoint)
+      return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
+    end
+
+    # Convertion method that returns the codepoint for the given single character.
+    # Example:
+    # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
+    def self.char2codepoint(aChar)
+      return aChar.ord
+    end
+
+    # Convertion method that returns the codepoint for the given escape 
+    # sequence (a String).
+    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 
+    # 0xC), \v (vertical feed, 0xB)
+    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
+    # \xXX (hex)
+    # Any other escaped character will be treated as a literal character
+    # Example:
+    # RegAn::Character::esc2codepoint('\n') # Returns: 0xd
+    def self.esc2codepoint(anEscapeSequence)
+      msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)."
+      raise StandardError, msg unless anEscapeSequence[0] == "\\"
+      result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
+
+      return result
+    end
+
+    # Return the character as a String object
+    def char()
+      self.class.codepoint2char(@codepoint)
+    end
+
+    # Returns true iff this Character and parameter 'another' represent the same character.
+    # [another] any Object. The way the equality is tested depends on the another's class
+    # Example:
+    # newOne = Character.new(?\u03a3)
+    # newOne == newOne  # true. Identity
+    # newOne == Character.new(?\u03a3)  # true. Both have same codepoint
+    # newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
+    # newOne == 0x03a3  # true. The Integer is compared to the codepoint value.
+    # Will test equality with any Object that knows the to_s method
+    def ==(other)
+      result = case other
+        when Character
+          self.to_str == other.to_str
+
+        when Integer
+          self.codepoint == other
+
+        when String
+          other.size > 1 ? false : to_str == other
+
+        else
+          # Unknown type: try with a convertion
+          self == other.to_s # Recursive call
+      end
+
+      return result
+    end
+
+    # Return a plain English description of the character
+    def explain()
+      return "the character '#{to_str}'"
+    end
+
+    protected
+
+    # Conversion method re-definition.
+    # Purpose: Return the String representation of the expression.
+    # If the Character was initially from a text (the lexeme), then the lexeme
+    # is returned back.
+    # Otherwise the character corresponding to the codepoint is returned.
+    def text_repr()
+      return char if lexeme.nil?
+      return lexeme.dup
+    end
+
+    # Convertion method that returns a codepoint for the given two characters
+    # (digram) escape sequence.
+    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
+    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
+    # \f (form feed, 0xC), \v (vertical feed, 0xB)
+    # Any other escape sequence will return the codepoint of the escaped
+    # character.
+    # [aDigram] A sequence of two characters that starts with a backslash.
+    def self.digram2codepoint(aDigram)
+      # Check that the digram is a special escape sequence
+      result = DigramSequences.fetch(aDigram, nil)
+
+      # If it not a special sequence, then escaped character is
+      # considered literally (the backslash is 'dummy')
+      result = char2codepoint(aDigram[-1]) if result.nil?
+      return result
+    end
+
+    private_class_method :digram2codepoint
+
+    # Convertion method that returns a codepoint for the given complex
+    # escape sequence.
+    # [anEscapeSequence] A String with the format:
+    # \uXXXX where XXXX is a 4 hex digits integer value,
+    # \u{X...} X 1 or more hex digits
+    # \ooo (1..3 octal digits literal)
+    # \xXX (1..2 hex digits literal)
+    def self.esc_number2codepoint(anEscapeSequence)
+      unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
+        raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." 
+      else
+      # Octal literal case?
+        return octal.oct if octal # shorterSeq =~ /[0-7]{1,3}/
+      
+        # Extract the hexadecimal number
+        hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
+        return hexliteral.hex
+      end
+    end
+
+    private_class_method :esc_number2codepoint
+  end # class
 end # module
 
-# End of file
\ No newline at end of file
+# End of file