#--
# BibTeX-Ruby
# Copyright (C) 2010-2011 Sylvester Keil
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#++

require 'forwardable'
require 'strscan'

module BibTeX

  #
  # The BibTeX::Lexer handles the lexical analysis of BibTeX bibliographies.
  #
  # The lexer is a small state machine driven by a StringScanner. Tokens are
  # two-element arrays of the form [symbol, value] that are collected on the
  # parse stack (see #stack, #push and #next_token).
  #
  class Lexer
    extend Forwardable

    attr_reader :options, :stack, :scanner
    attr_accessor :mode

    # Lexer#data returns the string currently held by the scanner.
    def_delegator :@scanner, :string, :data

    @defaults = {
      :include => [:errors],
      :strict => true,
      :allow_missing_keys => false,
      :strip => true
    }.freeze

    # Patterns Cache (#37: MacRuby does not cache regular expressions)
    @patterns = {
      :space => /[\s]+/o,
      :lbrace => /\s*\{/o,
      :rbrace => /\s*\}\s*/o,
      :braces => /\{|\}/o,
      :eq => /\s*=\s*/o,
      :comma => /\s*,\s*/o,
      :number => /\d+/o,
      :name => /[[:alpha:]\d\/:_!$\?\.%&\*-]+/io,
      :quote => /\s*"/o,
      :unquote => /[\{\}"]/o,
      :sharp => /\s*#\s*/o,
      :object => /@/o,
      :period => /./o,
      :strict_next => /@[\t ]*/o,
      :next => /(^|\n)[\t ]*@[\t ]*/o,
      :entry => /[a-z\d:_!\.$%&*-]+/io,
      :string => /string/io,
      :comment => /comment\b/io,
      :preamble => /preamble\b/io,
      :key => /[[:alpha:]\d \/:_!$\?\.%&\*-]+,/io,
      :optional_key => /[[:alpha:]\d \/:_!$\?\.%&\*-]*,/io
    }.freeze

    # Maps the lexer's current mode to the parse routine handling it
    # (parse_bibtex, parse_meta, parse_literal or parse_content). Unknown
    # modes fall back to :meta.
    MODE = Hash.new(:meta).merge({
      :bibtex => :bibtex,
      :entry => :bibtex,
      :string => :bibtex,
      :preamble => :bibtex,
      :comment => :bibtex,
      :meta => :meta,
      :literal => :literal,
      :content => :content
    }).freeze

    class << self
      attr_reader :defaults, :patterns
    end

    #
    # Creates a new instance. Possible options and their respective
    # default values are:
    #
    # - :include => [:errors] A list that may contain :meta_content, and
    #   :errors; depending on whether or not these are present, the respective
    #   tokens are included in the parse tree.
    # - :strict => true In strict mode objects can start anywhere; therefore
    #   the `@' symbol is not possible except inside literals or @comment
    #   objects; for a more lenient lexer set to false and objects are
    #   expected to start after a new line (leading white space is permitted).
    # - :allow_missing_keys => false When enabled, the key following the
    #   opening brace of an entry may be empty.
    # - :strip => true When enabled, newlines (and the white space following
    #   them) are replaced by a single space in string literals and braced
    #   content, except inside @comment objects.
    #
    def initialize(options = {})
      @options = Lexer.defaults.merge(options)
      reset
    end

    # Resets the parse stack, brace level, mode and scanner position.
    # Returns the Lexer.
    def reset
      @stack, @brace_level, @mode, @active_object = [], 0, :meta, nil
      @scanner.reset if @scanner

      # cache options for speed
      @include_meta_content = @options[:include].include?(:meta_content)
      @include_errors = @options[:include].include?(:errors)

      self
    end

    # Sets the source for the lexical analysis and resets the internal state.
    def data=(data)
      @scanner = StringScanner.new(data)
      reset
    end

    # Returns the token symbols currently on the parse stack.
    def symbols
      @stack.map(&:first)
    end

    # Returns the next token from the parse stack.
    def next_token
      @stack.shift
    end

    # Returns true if the lexer is currently parsing a BibTeX object.
    def bibtex_mode?
      MODE[@mode] == :bibtex
    end

    # Defines meta_mode?, literal_mode? and content_mode?.
    [:meta, :literal, :content].each do |m|
      define_method("#{m}_mode?") { @mode == m }
    end

    # Returns true if the lexer is currently parsing the given object type.
    def active?(object)
      @active_object == object
    end

    # Returns true if the lexer is currently in strict mode.
    def strict?
      !!@options[:strict]
    end

    # Returns true if entries are allowed to have an empty key.
    def allow_missing_keys?
      !!@options[:allow_missing_keys]
    end

    # Returns true if line breaks should be stripped from the value that
    # is currently being pushed (never the case inside @comment objects).
    def strip_line_breaks?
      !!options[:strip] && !active?(:comment)
    end

    # Pushes a value onto the parse stack. Returns the Lexer.
    #
    # Consecutive :CONTENT or :STRING_LITERAL tokens are merged into a
    # single token; :ERROR and :META_CONTENT tokens are only kept if they
    # were requested through the :include option. Pushing an :ERROR always
    # leaves the current object.
    def push(value)
      case value[0]
      when :CONTENT, :STRING_LITERAL
        value[1].gsub!(/\n\s*/, ' ') if strip_line_breaks?

        if !@stack.empty? && value[0] == @stack[-1][0]
          @stack[-1][1] << value[1]
        else
          @stack.push(value)
        end
      when :ERROR
        @stack.push(value) if @include_errors
        leave_object
      when :META_CONTENT
        @stack.push(value) if @include_meta_content
      else
        @stack.push(value)
      end

      self
    end

    # Start the lexical analysis. If no string is given, the source that was
    # previously set is analysed again. Returns the Lexer.
    #
    # Raises an ArgumentError if there is no source to analyse.
    def analyse(string = nil)
      raise(ArgumentError, 'Lexer: failed to start analysis: no source given!') unless string || @scanner

      self.data = string || @scanner.string

      until @scanner.eos?
        send("parse_#{MODE[@mode]}")
      end

      push([false, '$end'])
    end

    private

    # Scans a single token inside of a BibTeX object. The order of the
    # branches matters: the first pattern matching at the current position
    # wins.
    def parse_bibtex
      case
      when @scanner.scan(Lexer.patterns[:lbrace])
        open_brace
      when @scanner.scan(Lexer.patterns[:rbrace])
        @brace_level -= 1
        push([:RBRACE, '}'])
        return leave_object if @brace_level == 0
        return error_unbalanced_braces if @brace_level < 0
      when @scanner.scan(Lexer.patterns[:eq])
        push([:EQ, '='])
      when @scanner.scan(Lexer.patterns[:comma])
        push([:COMMA, ','])
      when @scanner.scan(Lexer.patterns[:number])
        push([:NUMBER, @scanner.matched])
      when @scanner.scan(Lexer.patterns[:name])
        push([:NAME, @scanner.matched.rstrip])
      when @scanner.scan(Lexer.patterns[:quote])
        @mode = :literal
      when @scanner.scan(Lexer.patterns[:sharp])
        push([:SHARP, '#'])
      when @scanner.scan(Lexer.patterns[:object])
        enter_object
      when @scanner.scan(Lexer.patterns[:space])
        # skip
      when @scanner.scan(Lexer.patterns[:period])
        error_unexpected_token
      end
    end

    # Consumes everything up to the start of the next object as meta content;
    # if there is no further object the rest of the input is meta content.
    def parse_meta
      match = @scanner.scan_until(Lexer.patterns[strict? ? :strict_next : :next])

      if @scanner.matched
        # The pattern may consume blanks following the `@'; cut at the `@'
        # itself so that neither the symbol nor the blanks end up in the
        # meta content. The last `@' in match is always the object's one.
        push([:META_CONTENT, match[0, match.rindex('@')]])
        enter_object
      else
        push([:META_CONTENT, @scanner.rest])
        @scanner.terminate
      end
    end

    # Consumes braced content up to the next brace and keeps track of the
    # brace level.
    def parse_content
      match = @scanner.scan_until(Lexer.patterns[:braces])

      case @scanner.matched
      when '{'
        @brace_level += 1
        push([:CONTENT, match])
      when '}'
        @brace_level -= 1

        case
        when @brace_level == 0
          # closing brace of the object itself
          push([:CONTENT, match.chop])
          push([:RBRACE, '}'])
          leave_object
        when @brace_level == 1 && !active?(:comment)
          # closing brace of a braced value inside the object
          push([:CONTENT, match.chop])
          push([:RBRACE, '}'])
          @mode = :bibtex
        when @brace_level < 0
          push([:CONTENT, match.chop])
          error_unbalanced_braces
        else
          # nested brace: part of the content
          push([:CONTENT, match])
        end
      else
        push([:CONTENT, @scanner.rest])
        @scanner.terminate
        error_unterminated_content
      end
    end

    # Consumes a quoted string literal up to the next brace or quote and
    # keeps track of the brace level; a quote only terminates the literal
    # at brace level one.
    def parse_literal
      match = @scanner.scan_until(Lexer.patterns[:unquote])

      case @scanner.matched
      when '{'
        @brace_level += 1
        push([:STRING_LITERAL, match])
      when '}'
        @brace_level -= 1

        if @brace_level < 1
          push([:STRING_LITERAL, match.chop])
          error_unbalanced_braces
        else
          push([:STRING_LITERAL, match])
        end
      when '"'
        if @brace_level == 1
          push([:STRING_LITERAL, match.chop])
          @mode = :bibtex
        else
          push([:STRING_LITERAL, match])
        end
      else
        push([:STRING_LITERAL, @scanner.rest])
        @scanner.terminate
        error_unterminated_string
      end
    end

    # Registers an opening brace that has just been scanned: increments the
    # brace level, pushes the token and switches to content mode for nested
    # braces or for the body of a @comment object.
    def open_brace
      @brace_level += 1
      push([:LBRACE, '{'])
      @mode = :content if @brace_level > 1 || (@brace_level == 1 && active?(:comment))
    end

    # Called when the lexer encounters a new BibTeX object.
    def enter_object
      @brace_level = 0
      push [:AT, '@']

      case
      when @scanner.scan(Lexer.patterns[:string])
        @mode = @active_object = :string
        push [:STRING, @scanner.matched]
      when @scanner.scan(Lexer.patterns[:preamble])
        @mode = @active_object = :preamble
        push [:PREAMBLE, @scanner.matched]
      when @scanner.scan(Lexer.patterns[:comment])
        @mode = @active_object = :comment
        push [:COMMENT, @scanner.matched]
      when @scanner.scan(Lexer.patterns[:entry])
        @mode = @active_object = :entry
        push [:NAME, @scanner.matched]

        # try to parse the key right away
        if @scanner.scan(Lexer.patterns[:lbrace])
          open_brace

          if @scanner.scan(Lexer.patterns[allow_missing_keys? ? :optional_key : :key])
            # the pattern includes the trailing comma
            push [:KEY, @scanner.matched.chop.strip]
          end
        end
      else
        error_unexpected_object
      end
    end

    # Called when parser leaves a BibTeX object.
    def leave_object
      @mode, @active_object, @brace_level = :meta, nil, 0
    end

    def error_unbalanced_braces
      BibTeX.log.warn("Lexer: unbalanced braces at #{@scanner.pos}; brace level #{@brace_level}; mode #{@mode.inspect}.")
      backtrace [:E_UNBALANCED, @scanner.matched]
    end

    def error_unterminated_string
      BibTeX.log.warn("Lexer: unterminated string at #{@scanner.pos}; brace level #{@brace_level}; mode #{@mode.inspect}.")
      backtrace [:E_UNTERMINATED_STRING, @scanner.matched]
    end

    def error_unterminated_content
      BibTeX.log.warn("Lexer: unterminated content at #{@scanner.pos}; brace level #{@brace_level}; mode #{@mode.inspect}.")
      backtrace [:E_UNTERMINATED_CONTENT, @scanner.matched]
    end

    def error_unexpected_token
      BibTeX.log.warn("Lexer: unexpected token `#{@scanner.matched}' at #{@scanner.pos}; brace level #{@brace_level}; mode #{@mode.inspect}.")
      backtrace [:E_UNEXPECTED_TOKEN, @scanner.matched]
    end

    def error_unexpected_object
      BibTeX.log.warn("Lexer: unexpected object at #{@scanner.pos}; brace level #{@brace_level}; mode #{@mode.inspect}.")
      backtrace [:E_UNEXPECTED_OBJECT, '@']
    end

    # Pops the tokens of the current object off the parse stack (back to and
    # including the most recent :AT or :META_CONTENT token, or the whole
    # stack if there is none), appends the error and pushes the collected
    # tokens as a single :ERROR token.
    def backtrace(error)
      bt = []
      bt.unshift(@stack.pop) until @stack.empty? || (!bt.empty? && [:AT, :META_CONTENT].include?(bt[0][0]))
      bt << error
      push [:ERROR, bt]
    end

  end

end