lib/ebnf/ll1/scanner.rb in ebnf-0.3.9 vs lib/ebnf/ll1/scanner.rb in ebnf-1.0.0

- old
+ new

@@ -1,5 +1,6 @@ +# coding: utf-8 require 'strscan' unless defined?(StringScanner) module EBNF::LL1 ## # Overload StringScanner with file operations @@ -7,12 +8,12 @@ # * Reloads scanner as required until EOF. # * Loads to a high-water and reloads when remaining size reaches a low-water. # # FIXME: Only implements the subset required by the Lexer for now. class Scanner < StringScanner - HIGH_WATER = 10240 - LOW_WATER = 2048 # Hopefully large enough to deal with long multi-line comments + HIGH_WATER = 512 * 1024 # Hopefully large enough to deal with long multi-line comments + LOW_WATER = 4 * 1024 ## # @return [IO, StringIO] attr_reader :input @@ -23,18 +24,18 @@ # @param [Hash{Symbol => Object}] options # @option options[Integer] :high_water (HIGH_WATER) # @option options[Integer] :low_water (LOW_WATER) # @return [Scanner] def initialize(input, options = {}) - @options = options.merge(:high_water => HIGH_WATER, :low_water => LOW_WATER) + @options = options.merge(high_water: HIGH_WATER, low_water: LOW_WATER) if input.respond_to?(:read) @input = input super("") feed_me else - super(input.to_s) + super(encode_utf8 input.to_s) end end ## # Returns the "rest" of the line, or the next line if at EOL (i.e. everything after the scan pointer). @@ -93,24 +94,34 @@ # @return [String] def scan(pattern) feed_me encode_utf8 super end - - private - # Maintain low-water mark - def feed_me - if rest_size < @options[:low_water] && @input && !@input.eof? - # Read up to high-water mark ensuring we're at an end of line + + ## + # Ensures that the input buffer is full to the high water mark, or end of file. Useful when matching tokens that may be longer than the low water mark + def ensure_buffer_full + # Read up to high-water mark ensuring we're at an end of line + if @input && !@input.eof? diff = @options[:high_water] - rest_size string = encode_utf8(@input.read(diff)) string << encode_utf8(@input.gets) unless @input.eof? self << string if string end end + private + # Maintain low-water mark + def feed_me + ensure_buffer_full if rest_size < @options[:low_water] + end + # Perform UTF-8 encoding of input def encode_utf8(string) - string.respond_to?(:force_encoding) ? string.force_encoding(Encoding::UTF_8) : string + if string && string.encoding != Encoding::UTF_8 + string = string.dup if string.frozen? + string.force_encoding(Encoding::UTF_8) + end + string end end end \ No newline at end of file