require 'strscan'

# Parser for ASCIIMath expressions.
#
# The syntax for ASCIIMath in EBNF style notation is
#
# expr = ( simp ( fraction | sub | super ) )+
# simp = constant | paren_expr | unary_expr | binary_expr | text
# fraction = '/' simp
# super = '^' simp
# sub = '_' simp super?
# paren_expr = lparen expr rparen
# lparen = '(' | '[' | '{' | '(:' | '{:'
# rparen = ')' | ']' | '}' | ':)' | ':}'
# unary_expr = unary_op simp
# unary_op = 'sqrt' | 'text'
# binary_expr = binary_op simp simp
# binary_op = 'frac' | 'root' | 'stackrel'
# text = '"' [^"]* '"'
# constant = number | symbol | identifier
# number = '-'? [0-9]+ ( '.' [0-9]+ )?
# symbol = /* any string in the symbol table */
# identifier = [A-z]
#
# ASCIIMath is parsed left to right without any form of operator precedence.
# When parsing the 'constant' the parser will try to find the longest matching string in the symbol
# table starting at the current position of the parser. If no matching string can be found the
# character at the current position of the parser is interpreted as an identifier instead.
module AsciiMath
  # Internal: Splits an ASCIIMath expression into a sequence of tokens.
  # Each token is represented as a Hash containing the keys :value and :type.
  # The :value key is used to store the text associated with each token.
  # The :type key indicates the semantics of the token. The value for :type will be one
  # of the following symbols:
  #
  # - :identifier a symbolic name or a bit of text without any further semantics
  # - :text a bit of arbitrary text
  # - :number a number
  # - :operator a mathematical operator symbol
  # - :unary a unary operator (e.g., sqrt, text, ...)
  # - :font a unary font command (e.g., bb, cc, ...)
  # - :infix an infix operator (e.g, /, _, ^, ...)
  # - :binary a binary operator (e.g., frac, root, ...)
  # - :accent an accent character
  # - :eof indicates no more tokens are available
  #
  # Each token type may also have an :underover modifier. When present and set to true
  # sub- and superscript expressions associated with the token will be rendered as
  # under- and overscript (below and above) rather than as sub- or superscript.
  #
  # :accent tokens additionally have a :position value which is set to either :over or :under.
  # This determines if the accent should be rendered over or under the expression to which
  # it applies.
  class Tokenizer
    WHITESPACE = /^\s+/
    NUMBER = /-?[0-9]+(?:\.[0-9]+)?/
    # Zero or more non-quote characters between double quotes, so that the
    # empty text "" is accepted as the grammar above specifies.
    TEXT = /"[^"]*"/

    # Public: Initializes an ASCIIMath tokenizer.
    #
    # string  - The ASCIIMath expression to tokenize
    # symbols - The symbol table to use while tokenizing
    def initialize(string, symbols)
      @string = StringScanner.new(string)
      @symbols = symbols
      # Length of the longest symbol; falls back to 1 for an empty table so
      # the generated regexp stays bounded.
      lookahead = @symbols.keys.map { |k| k.length }.max || 1
      # A symbol candidate consists of non-whitespace, non-digit characters.
      # A backslash followed by whitespace is accepted as well so that
      # symbols such as '\ ' can be matched.
      @symbol_regexp = /((?:\\\s|[^\s0-9]){1,#{lookahead}})/
      @push_back = nil
    end

    # Public: Read the next token from the ASCIIMath expression and move the tokenizer
    # ahead by one token.
    #
    # Returns the next token as a Hash
    def next_token
      if @push_back
        t = @push_back
        @push_back = nil
        return t
      end

      @string.scan(WHITESPACE)

      return {:value => nil, :type => :eof} if @string.eos?

      case @string.peek(1)
      when '"'
        # An unterminated quote cannot be read as text; treat the quote
        # character as an ordinary symbol/identifier instead of returning nil.
        read_text || read_symbol
      when '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        # A lone '-' is not a number; fall back to the symbol table.
        read_number || read_symbol
      else
        read_symbol
      end
    end

    # Public: Pushes the given token back to the tokenizer. A subsequent call to next_token
    # will return the given token rather than generating a new one. At most one
    # token can be pushed back. :eof tokens are never pushed back.
    #
    # token - The token to push back
    def push_back(token)
      @push_back = token unless token[:type] == :eof
    end

    private

    # Private: Reads a text token from the input string
    #
    # Returns the text token or nil if a text token could not be matched at
    # the current position
    def read_text
      read_value(TEXT) do |text|
        # Strip the surrounding double quotes.
        {:value => text[1..-2], :type => :text}
      end
    end

    # Private: Reads a number token from the input string
    #
    # Returns the number token or nil if a number token could not be matched at
    # the current position
    def read_number
      read_value(NUMBER) do |number|
        {:value => number, :type => :number}
      end
    end

    # Private: Returns the size of the given String in bytes. Falls back to
    # String#length when String#bytesize is not available.
    if String.method_defined?(:bytesize)
      def bytesize(s)
        s.bytesize
      end
    else
      def bytesize(s)
        s.length
      end
    end

    # Private: Reads a symbol token from the input string. This method first creates
    # a String from the input String starting from the current position with a length
    # that matches that of the longest key in the symbol table. It then looks up that
    # substring in the symbol table. If the substring is present in the symbol table, the
    # associated value is returned and the position is moved ahead by the length of the
    # substring. Otherwise this method chops one character off the end of the substring
    # and repeats the symbol lookup. This continues until a single character is left.
    # If that character can still not be found in the symbol table, then an identifier
    # token is returned whose value is the remaining single character string.
    #
    # Returns the token that was read or nil if a token could not be matched at
    # the current position
    def read_symbol
      position = @string.pos
      read_value(@symbol_regexp) do |s|
        until s.length == 1 || @symbols.include?(s)
          s.chop!
        end
        # The scanner position is expressed in bytes, not characters.
        @string.pos = position + bytesize(s)
        @symbols[s] || {:value => s, :type => :identifier}
      end
    end

    # Private: Reads a String from the input String that matches the given RegExp
    #
    # regexp - a RegExp that will be used to match the token
    # block  - the matched token will be passed to the block
    #
    # Returns the value returned by the block, or nil if the RegExp did not
    # match at the current position
    def read_value(regexp)
      s = @string.scan(regexp)
      s ? yield(s) : s
    end
  end

  # Internal: Parses an ASCIIMath expression into a nested structure of
  # Hashes and Arrays using the tokens produced by the Tokenizer.
  class Parser
    SYMBOLS = {
      # Operation symbols
      '+' => {:value => '+', :type => :operator},
      '-' => {:value => '−', :type => :operator},
      '*' => {:value => '⋅', :type => :operator},
      '**' => {:value => '⋆', :type => :operator},
      '//' => {:value => '/', :type => :operator},
      '\\\\' => {:value => '\\', :type => :operator},
      'xx' => {:value => '×', :type => :operator},
      '-:' => {:value => '÷', :type => :operator},
      '@' => {:value => '⚬', :type => :operator},
      'o+' => {:value => '⊕', :type => :operator},
      'ox' => {:value => '⊗', :type => :operator},
      'o.' => {:value => '⊙', :type => :operator},
      'sum' => {:value => '∑', :type => :operator, :underover => true},
      'prod' => {:value => '∏', :type => :operator, :underover => true},
      '^^' => {:value => '∧', :type => :operator},
      '^^^' => {:value => '⋀', :type => :operator, :underover => true},
      'vv' => {:value => '∨', :type => :operator},
      'vvv' => {:value => '⋁', :type => :operator, :underover => true},
      'nn' => {:value => '∩', :type => :operator},
      'nnn' => {:value => '⋂', :type => :operator, :underover => true},
      'uu' => {:value => '∪', :type => :operator},
      'uuu' => {:value => '⋃', :type => :operator, :underover => true},

      # Relation symbols
      '=' => {:value => '=', :type => :operator},
      '!=' => {:value => '≠', :type => :operator},
      ':=' => {:value => ':=', :type => :operator},
      '<' => {:value => '<', :type => :operator},
      'lt' => {:value => '<', :type => :operator},
      '>' => {:value => '>', :type => :operator},
      'gt' => {:value => '>', :type => :operator},
      '<=' => {:value => '≤', :type => :operator},
      'lt=' => {:value => '≤', :type => :operator},
      '>=' => {:value => '≥', :type => :operator},
      'geq' => {:value => '≥', :type => :operator},
      '-<' => {:value => '≺', :type => :operator},
      '-lt' => {:value => '≺', :type => :operator},
      '>-' => {:value => '≻', :type => :operator},
      '-<=' => {:value => '⪯', :type => :operator},
      '>-=' => {:value => '⪰', :type => :operator},
      'in' => {:value => '∈', :type => :operator},
      '!in' => {:value => '∉', :type => :operator},
      'sub' => {:value => '⊂', :type => :operator},
      'sup' => {:value => '⊃', :type => :operator},
      'sube' => {:value => '⊆', :type => :operator},
      'supe' => {:value => '⊇', :type => :operator},
      '-=' => {:value => '≡', :type => :operator},
      '~=' => {:value => '≅', :type => :operator},
      '~~' => {:value => '≈', :type => :operator},
      'prop' => {:value => '∝', :type => :operator},

      # Logical symbols
      'and' => {:value => 'and', :type => :text},
      'or' => {:value => 'or', :type => :text},
      'not' => {:value => '¬', :type => :operator},
      '=>' => {:value => '⇒', :type => :operator},
      'if' => {:value => 'if', :type => :operator},
      '<=>' => {:value => '⇔', :type => :operator},
      'AA' => {:value => '∀', :type => :operator},
      'EE' => {:value => '∃', :type => :operator},
      '_|_' => {:value => '⊥', :type => :operator},
      'TT' => {:value => '⊤', :type => :operator},
      '|--' => {:value => '⊢', :type => :operator},
      '|==' => {:value => '⊨', :type => :operator},

      # Grouping brackets
      '(' => {:value => '(', :type => :lparen},
      ')' => {:value => ')', :type => :rparen},
      '[' => {:value => '[', :type => :lparen},
      ']' => {:value => ']', :type => :rparen},
      '{' => {:value => '{', :type => :lparen},
      '}' => {:value => '}', :type => :rparen},
      '|' => {:value => '|', :type => :lrparen},
      '||' => {:value => '||', :type => :lrparen},
      '(:' => {:value => '〈', :type => :lparen},
      ':)' => {:value => '〉', :type => :rparen},
      '<<' => {:value => '〈', :type => :lparen},
      '>>' => {:value => '〉', :type => :rparen},
      '{:' => {:value => nil, :type => :lparen},
      ':}' => {:value => nil, :type => :rparen},

      # Miscellaneous symbols
      'int' => {:value => '∫', :type => :operator},
      'dx' => {:value => 'dx', :type => :identifier},
      'dy' => {:value => 'dy', :type => :identifier},
      'dz' => {:value => 'dz', :type => :identifier},
      'dt' => {:value => 'dt', :type => :identifier},
      'oint' => {:value => '∮', :type => :operator},
      'del' => {:value => '∂', :type => :operator},
      'grad' => {:value => '∇', :type => :operator},
      '+-' => {:value => '±', :type => :operator},
      'O/' => {:value => '∅', :type => :operator},
      'oo' => {:value => '∞', :type => :operator},
      'aleph' => {:value => 'ℵ', :type => :operator},
      '...' => {:value => '...', :type => :operator},
      ':.' => {:value => '∴', :type => :operator},
      '/_' => {:value => '∠', :type => :operator},
      '\\ ' => {:value => ' ', :type => :operator},
      # Double quotes are required here: \u escapes are not interpreted in
      # single-quoted strings.
      'quad' => {:value => "\u00A0\u00A0", :type => :operator},
      'qquad' => {:value => "\u00A0\u00A0\u00A0\u00A0", :type => :operator},
      'cdots' => {:value => '⋯', :type => :operator},
      'vdots' => {:value => '⋮', :type => :operator},
      'ddots' => {:value => '⋱', :type => :operator},
      'diamond' => {:value => '⋄', :type => :operator},
      'square' => {:value => '□', :type => :operator},
      '|__' => {:value => '⌊', :type => :operator},
      '__|' => {:value => '⌋', :type => :operator},
      '|~' => {:value => '⌈', :type => :operator},
      '~|' => {:value => '⌉', :type => :operator},
      'CC' => {:value => 'ℂ', :type => :operator},
      'NN' => {:value => 'ℕ', :type => :operator},
      'QQ' => {:value => 'ℚ', :type => :operator},
      'RR' => {:value => 'ℝ', :type => :operator},
      'ZZ' => {:value => 'ℤ', :type => :operator},
      'f' => {:value => 'f', :type => :identifier},
      'g' => {:value => 'g', :type => :identifier},

      # Standard functions
      'lim' => {:value => 'lim', :type => :operator, :underover => true},
      'Lim' => {:value => 'Lim', :type => :operator, :underover => true},
      'sin' => {:value => 'sin', :type => :operator},
      'cos' => {:value => 'cos', :type => :operator},
      'tan' => {:value => 'tan', :type => :operator},
      'sinh' => {:value => 'sinh', :type => :operator},
      'cosh' => {:value => 'cosh', :type => :operator},
      'tanh' => {:value => 'tanh', :type => :operator},
      'cot' => {:value => 'cot', :type => :operator},
      'sec' => {:value => 'sec', :type => :operator},
      'csc' => {:value => 'csc', :type => :operator},
      'log' => {:value => 'log', :type => :operator},
      'ln' => {:value => 'ln', :type => :operator},
      'det' => {:value => 'det', :type => :operator},
      'dim' => {:value => 'dim', :type => :operator},
      'mod' => {:value => 'mod', :type => :operator},
      'gcd' => {:value => 'gcd', :type => :operator},
      'lcm' => {:value => 'lcm', :type => :operator},
      'lub' => {:value => 'lub', :type => :operator},
      'glb' => {:value => 'glb', :type => :operator},
      'min' => {:value => 'min', :type => :operator, :underover => true},
      'max' => {:value => 'max', :type => :operator, :underover => true},

      # Accents
      'hat' => {:value => '^', :type => :accent, :position => :over},
      'bar' => {:value => '¯', :type => :accent, :position => :over},
      'ul' => {:value => '_', :type => :accent, :position => :under},
      'vec' => {:value => '→', :type => :accent, :position => :over},
      'dot' => {:value => '.', :type => :accent, :position => :over},
      'ddot' => {:value => '..', :type => :accent, :position => :over},

      # Arrows
      'uarr' => {:value => '↑', :type => :operator},
      'darr' => {:value => '↓', :type => :operator},
      'rarr' => {:value => '→', :type => :operator},
      '->' => {:value => '→', :type => :operator},
      '>->' => {:value => '↣', :type => :operator},
      '->>' => {:value => '↠', :type => :operator},
      '>->>' => {:value => '⤖', :type => :operator},
      '|->' => {:value => '↦', :type => :operator},
      'larr' => {:value => '←', :type => :operator},
      'harr' => {:value => '↔', :type => :operator},
      'rArr' => {:value => '⇒', :type => :operator},
      'lArr' => {:value => '⇐', :type => :operator},
      'hArr' => {:value => '⇔', :type => :operator},

      # Other
      'sqrt' => {:value => :sqrt, :type => :unary},
      'text' => {:value => :text, :type => :unary},
      'bb' => {:value => :bold, :type => :font},
      'bbb' => {:value => :double_struck, :type => :font},
      'ii' => {:value => :italic, :type => :font},
      'bii' => {:value => :bold_italic, :type => :font},
      'cc' => {:value => :script, :type => :font},
      'bcc' => {:value => :bold_script, :type => :font},
      'tt' => {:value => :monospace, :type => :font},
      'fr' => {:value => :fraktur, :type => :font},
      'bfr' => {:value => :bold_fraktur, :type => :font},
      'sf' => {:value => :sans_serif, :type => :font},
      'bsf' => {:value => :bold_sans_serif, :type => :font},
      'sfi' => {:value => :sans_serif_italic, :type => :font},
      'sfbi' => {:value => :sans_serif_bold_italic, :type => :font},
      'frac' => {:value => :frac, :type => :binary},
      'root' => {:value => :root, :type => :binary},
      'stackrel' => {:value => :over, :type => :binary},
      '/' => {:value => :frac, :type => :infix},
      '_' => {:value => :sub, :type => :infix},
      '^' => {:value => :sup, :type => :infix},

      # Greek letters
      'alpha' => {:value => 'α', :type => :identifier},
      'Alpha' => {:value => 'Α', :type => :identifier},
      'beta' => {:value => 'β', :type => :identifier},
      'Beta' => {:value => 'Β', :type => :identifier},
      'gamma' => {:value => 'γ', :type => :identifier},
      'Gamma' => {:value => 'Γ', :type => :operator},
      'delta' => {:value => 'δ', :type => :identifier},
      'Delta' => {:value => 'Δ', :type => :operator},
      'epsilon' => {:value => 'ε', :type => :identifier},
      'Epsilon' => {:value => 'Ε', :type => :identifier},
      'varepsilon' => {:value => 'ɛ', :type => :identifier},
      'zeta' => {:value => 'ζ', :type => :identifier},
      'Zeta' => {:value => 'Ζ', :type => :identifier},
      'eta' => {:value => 'η', :type => :identifier},
      'Eta' => {:value => 'Η', :type => :identifier},
      'theta' => {:value => 'θ', :type => :identifier},
      'Theta' => {:value => 'Θ', :type => :operator},
      'vartheta' => {:value => 'ϑ', :type => :identifier},
      'iota' => {:value => 'ι', :type => :identifier},
      'Iota' => {:value => 'Ι', :type => :identifier},
      'kappa' => {:value => 'κ', :type => :identifier},
      'Kappa' => {:value => 'Κ', :type => :identifier},
      'lambda' => {:value => 'λ', :type => :identifier},
      'Lambda' => {:value => 'Λ', :type => :operator},
      'mu' => {:value => 'μ', :type => :identifier},
      'Mu' => {:value => 'Μ', :type => :identifier},
      'nu' => {:value => 'ν', :type => :identifier},
      'Nu' => {:value => 'Ν', :type => :identifier},
      'xi' => {:value => 'ξ', :type => :identifier},
      'Xi' => {:value => 'Ξ', :type => :operator},
      'omicron' => {:value => 'ο', :type => :identifier},
      'Omicron' => {:value => 'Ο', :type => :identifier},
      'pi' => {:value => 'π', :type => :identifier},
      'Pi' => {:value => 'Π', :type => :operator},
      'rho' => {:value => 'ρ', :type => :identifier},
      'Rho' => {:value => 'Ρ', :type => :identifier},
      'sigma' => {:value => 'σ', :type => :identifier},
      'Sigma' => {:value => 'Σ', :type => :operator},
      'tau' => {:value => 'τ', :type => :identifier},
      'Tau' => {:value => 'Τ', :type => :identifier},
      'upsilon' => {:value => 'υ', :type => :identifier},
      'Upsilon' => {:value => 'Υ', :type => :identifier},
      'phi' => {:value => 'φ', :type => :identifier},
      'Phi' => {:value => 'Φ', :type => :identifier},
      'varphi' => {:value => 'ϕ', :type => :identifier},
      # U+03C7 GREEK SMALL LETTER CHI / U+03A7 GREEK CAPITAL LETTER CHI
      'chi' => {:value => "\u03C7", :type => :identifier},
      'Chi' => {:value => "\u03A7", :type => :identifier},
      'psi' => {:value => 'ψ', :type => :identifier},
      'Psi' => {:value => 'Ψ', :type => :identifier},
      'omega' => {:value => 'ω', :type => :identifier},
      'Omega' => {:value => 'Ω', :type => :operator},
    }

    # Public: Parses the given ASCIIMath expression.
    #
    # input - The ASCIIMath expression String to parse
    #
    # Returns an Expression wrapping the input and its parsed representation
    def parse(input)
      Expression.new(
        input,
        parse_expression(Tokenizer.new(input, SYMBOLS), 0)
      )
    end

    private

    # Private: Parses a sequence of simple expressions, combining them with
    # infix operators (/, _, ^) where present.
    #
    # tok   - The Tokenizer to read tokens from
    # depth - The current paren nesting depth; 0 at the top level
    #
    # Returns an Array of parsed expression Hashes
    def parse_expression(tok, depth)
      e = []

      while (s1 = parse_simple_expression(tok, depth))
        t1 = tok.next_token

        if t1[:type] == :infix
          s2 = parse_simple_expression(tok, depth)
          t2 = tok.next_token
          if t1[:value] == :sub && t2[:value] == :sup
            # 'a_b^c': a subscript immediately followed by a superscript.
            s3 = parse_simple_expression(tok, depth)
            operator = s1[:underover] ? :underover : :subsup
            e << {:type => :ternary, :operator => operator, :s1 => s1, :s2 => s2, :s3 => s3}
          else
            operator = s1[:underover] ? (t1[:value] == :sub ? :under : :over) : t1[:value]
            e << {:type => :binary, :operator => operator, :s1 => s1, :s2 => s2}
            tok.push_back(t2)
            break if closes_group?(t2, depth)
          end
        elsif t1[:type] == :eof
          e << s1
          break
        else
          e << s1
          tok.push_back(t1)
          break if closes_group?(t1, depth)
        end
      end

      e
    end

    # Private: Checks whether the given token ends the paren expression that
    # is currently being parsed. Only applies inside parens (depth > 0).
    def closes_group?(token, depth)
      depth > 0 && (token[:type] == :lrparen || token[:type] == :rparen)
    end

    # Private: Parses a single simple expression: a paren expression, an
    # accent/unary/font/binary operator with its operand(s), or a constant.
    #
    # tok   - The Tokenizer to read tokens from
    # depth - The current paren nesting depth
    #
    # Returns the parsed expression Hash or nil when no more tokens are available
    def parse_simple_expression(tok, depth)
      t1 = tok.next_token

      case t1[:type]
      when :lparen, :lrparen
        t2 = tok.next_token
        case t2[:type]
        when :rparen, :lrparen
          # Empty paren expression, e.g. '()'.
          {:type => :paren, :e => nil, :lparen => t1[:value], :rparen => t2[:value]}
        else
          tok.push_back(t2)

          e = parse_expression(tok, depth + 1)

          t2 = tok.next_token
          case t2[:type]
          when :rparen, :lrparen
            convert_to_matrix({:type => :paren, :e => e, :lparen => t1[:value], :rparen => t2[:value]})
          else
            # Unclosed paren expression: no :rparen is recorded.
            tok.push_back(t2)
            {:type => :paren, :e => e, :lparen => t1[:value]}
          end
        end
      when :accent
        s = parse_simple_expression(tok, depth)
        {:type => :binary, :s1 => s, :s2 => {:type => :operator, :c => t1[:value]}, :operator => t1[:position]}
      when :unary, :font
        s = parse_simple_expression(tok, depth)
        {:type => t1[:type], :s => s, :operator => t1[:value]}
      when :binary
        s1 = parse_simple_expression(tok, depth)
        s2 = parse_simple_expression(tok, depth)
        {:type => :binary, :s1 => s1, :s2 => s2, :operator => t1[:value]}
      when :eof
        nil
      else
        {:type => t1[:type], :c => t1[:value], :underover => t1[:underover]}
      end
    end

    # Private: Converts a paren expression to a matrix expression if it has
    # the shape of a matrix (see matrix?).
    #
    # Returns a :matrix Hash, or the given expression unchanged
    def convert_to_matrix(expression)
      return expression unless matrix?(expression)

      # Rows and columns sit at the even indices; odd indices hold the commas.
      rows = even_indexed(expression[:e]).map { |row| even_indexed(row[:e]) }

      {:type => :matrix, :rows => rows, :lparen => expression[:lparen], :rparen => expression[:rparen]}
    end

    # Private: Checks whether the given paren expression consists of two or
    # more comma separated rows, all wrapped in () or all wrapped in [], that
    # each hold the same number of comma separated columns.
    def matrix?(expression)
      return false unless expression.is_a?(Hash) && expression[:type] == :paren
      return false unless expression[:e].is_a?(Array)

      rows, separators = expression[:e].partition.with_index { |_obj, i| i.even? }

      rows.length > 1 &&
        rows.length > separators.length &&
        separators.all? { |item| comma?(item) } &&
        (rows.all? { |item| enclosed_in?(item, '(', ')') } || rows.all? { |item| enclosed_in?(item, '[', ']') }) &&
        # matrix_cols? verifies that each row body is an Array, so it must
        # run before the lengths are compared (empty parens have a nil body).
        rows.all? { |item| matrix_cols?(item[:e]) } &&
        rows.all? { |item| item[:e].length == rows[0][:e].length }
    end

    # Private: Checks whether the given row body is an Array of expressions
    # separated by commas.
    def matrix_cols?(expression)
      return false unless expression.is_a?(Array)

      cols, separators = expression.partition.with_index { |_obj, i| i.even? }

      cols.all? { |item| !comma?(item) } && separators.all? { |item| comma?(item) }
    end

    # Private: Checks whether the given parsed item is a ',' identifier.
    def comma?(item)
      item[:type] == :identifier && item[:c] == ','
    end

    # Private: Checks whether the given parsed item is a paren expression
    # with exactly the given opening and closing brackets.
    def enclosed_in?(item, lparen, rparen)
      item[:type] == :paren && item[:lparen] == lparen && item[:rparen] == rparen
    end

    # Private: Returns the elements at the even indices of the given Array.
    def even_indexed(array)
      array.each_slice(2).map { |pair| pair[0] }
    end
  end

  # Public: The result of parsing an ASCIIMath expression. Holds both the
  # original source text and its parsed representation.
  class Expression
    # asciimath         - The original ASCIIMath source String
    # parsed_expression - The parsed representation produced by the Parser
    def initialize(asciimath, parsed_expression)
      @asciimath = asciimath
      @parsed_expression = parsed_expression
    end

    # Public: Returns the original ASCIIMath source String
    def to_s
      @asciimath
    end
  end

  # Public: Parses the given ASCIIMath expression String.
  #
  # Returns an Expression
  def self.parse(asciimath)
    Parser.new.parse(asciimath)
  end
end