parser.rb in rack-3.0.0.beta1

- old
+ new

@@ -1,27 +1,55 @@
 # frozen_string_literal: true
 
 require 'strscan'
 
+require_relative '../utils'
+
 module Rack
   module Multipart
     class MultipartPartLimitError < Errno::EMFILE; end
-    class MultipartTotalPartLimitError < StandardError; end
 
-    class Parser
-      (require_relative '../core_ext/regexp'; using ::Rack::RegexpExtensions) if RUBY_VERSION < '2.4'
+    # Use specific error class when parsing multipart request
+    # that ends early.
+    class EmptyContentError < ::EOFError; end
 
+    # Base class for multipart exceptions that do not subclass from
+    # other exception classes for backwards compatibility.
+    class Error < StandardError; end
+
+    EOL = "\r\n"
+    MULTIPART = %r|\Amultipart/.*boundary=\"?([^\";,]+)\"?|ni
+    TOKEN = /[^\s()<>,;:\\"\/\[\]?=]+/
+    CONDISP = /Content-Disposition:\s*#{TOKEN}\s*/i
+    VALUE = /"(?:\\"|[^"])*"|#{TOKEN}/
+    BROKEN = /^#{CONDISP}.*;\s*filename=(#{VALUE})/i
+    MULTIPART_CONTENT_TYPE = /Content-Type: (.*)#{EOL}/ni
+    MULTIPART_CONTENT_DISPOSITION = /Content-Disposition:.*;\s*name=(#{VALUE})/ni
+    MULTIPART_CONTENT_ID = /Content-ID:\s*([^#{EOL}]*)/ni
+    # Updated definitions from RFC 2231
+    ATTRIBUTE_CHAR = %r{[^ \t\v\n\r)(><@,;:\\"/\[\]?='*%]}
+    ATTRIBUTE = /#{ATTRIBUTE_CHAR}+/
+    SECTION = /\*[0-9]+/
+    REGULAR_PARAMETER_NAME = /#{ATTRIBUTE}#{SECTION}?/
+    REGULAR_PARAMETER = /(#{REGULAR_PARAMETER_NAME})=(#{VALUE})/
+    EXTENDED_OTHER_NAME = /#{ATTRIBUTE}\*[1-9][0-9]*\*/
+    EXTENDED_OTHER_VALUE = /%[0-9a-fA-F]{2}|#{ATTRIBUTE_CHAR}/
+    EXTENDED_OTHER_PARAMETER = /(#{EXTENDED_OTHER_NAME})=(#{EXTENDED_OTHER_VALUE}*)/
+    EXTENDED_INITIAL_NAME = /#{ATTRIBUTE}(?:\*0)?\*/
+    EXTENDED_INITIAL_VALUE = /[a-zA-Z0-9\-]*'[a-zA-Z0-9\-]*'#{EXTENDED_OTHER_VALUE}*/
+    EXTENDED_INITIAL_PARAMETER = /(#{EXTENDED_INITIAL_NAME})=(#{EXTENDED_INITIAL_VALUE})/
+    EXTENDED_PARAMETER = /#{EXTENDED_INITIAL_PARAMETER}|#{EXTENDED_OTHER_PARAMETER}/
+    DISPPARM = /;\s*(?:#{REGULAR_PARAMETER}|#{EXTENDED_PARAMETER})\s*/
+    RFC2183 = /^#{CONDISP}(#{DISPPARM})+$/i
+
+    class Parser
       BUFSIZE = 1_048_576
       TEXT_PLAIN = "text/plain"
       TEMPFILE_FACTORY = lambda { |filename, content_type|
-        extension = ::File.extname(filename.gsub("\0", '%00'))[0, 129]
-
-        Tempfile.new(["RackMultipart", extension])
+        Tempfile.new(["RackMultipart", ::File.extname(filename.gsub("\0", '%00'))])
       }
 
-      BOUNDARY_REGEX = /\A([^\n]*(?:\n|\Z))/
-
       class BoundedIO # :nodoc:
         def initialize(io, content_length)
           @io             = io
           @content_length = content_length
           @cursor = 0
@@ -39,20 +67,16 @@
                 end
 
           if str
             @cursor += str.bytesize
           else
-            # Raise an error for mismatching Content-Length and actual contents
+            # Raise an error for mismatching content-length and actual contents
             raise EOFError, "bad content body"
           end
 
           str
         end
-
-        def rewind
-          @io.rewind
-        end
       end
 
       MultipartInfo = Struct.new :params, :tmp_files
       EMPTY         = MultipartInfo.new(nil, [])
 
@@ -67,22 +91,21 @@
         return EMPTY if 0 == content_length
 
         boundary = parse_boundary content_type
         return EMPTY unless boundary
 
+        if boundary.length > 70
+          # RFC 1521 Section 7.2.1 imposes a 70 character maximum for the boundary.
+          # Most clients use no more than 55 characters.
+          raise Error, "multipart boundary size too large (#{boundary.length} characters)"
+        end
+
         io = BoundedIO.new(io, content_length) if content_length
-        outbuf = String.new
 
         parser = new(boundary, tmpfile, bufsize, qp)
-        parser.on_read io.read(bufsize, outbuf)
+        parser.parse(io)
 
-        loop do
-          break if parser.state == :DONE
-          parser.on_read io.read(bufsize, outbuf)
-        end
-
-        io.rewind
         parser.result
       end
 
       class Collector
         class MimePart < Struct.new(:body, :head, :filename, :content_type, :name)
@@ -141,11 +164,11 @@
             klass = BufferPart
           end
 
           @mime_parts[mime_index] = klass.new(body, head, filename, content_type, name)
 
-          check_part_limits
+          check_open_files
         end
 
         def on_mime_body(mime_index, content)
           @mime_parts[mime_index].body << content
         end
@@ -153,97 +176,104 @@
         def on_mime_finish(mime_index)
         end
 
         private
 
-        def check_part_limits
-          file_limit = Utils.multipart_file_limit
-          part_limit = Utils.multipart_total_part_limit
-
-          if file_limit && file_limit > 0
-            if @open_files >= file_limit
+        def check_open_files
+          if Utils.multipart_part_limit > 0
+            if @open_files >= Utils.multipart_part_limit
               @mime_parts.each(&:close)
               raise MultipartPartLimitError, 'Maximum file multiparts in content reached'
             end
           end
-
-          if part_limit && part_limit > 0
-            if @mime_parts.size >= part_limit
-              @mime_parts.each(&:close)
-              raise MultipartTotalPartLimitError, 'Maximum total multiparts in content reached'
-            end
-          end
         end
       end
 
       attr_reader :state
 
       def initialize(boundary, tempfile, bufsize, query_parser)
         @query_parser   = query_parser
         @params         = query_parser.make_params
-        @boundary       = "--#{boundary}"
         @bufsize        = bufsize
 
-        @full_boundary = @boundary
-        @end_boundary = @boundary + '--'
         @state = :FAST_FORWARD
         @mime_index = 0
         @collector = Collector.new tempfile
 
         @sbuf = StringScanner.new("".dup)
-        @body_regex = /(?:#{EOL})?#{Regexp.quote(@boundary)}(?:#{EOL}|--)/m
-        @end_boundary_size = boundary.bytesize + 6 # (-- at start, -- at finish, EOL at end)
-        @rx_max_size = EOL.size + @boundary.bytesize + [EOL.size, '--'.size].max
+        @body_regex = /(?:#{EOL}|\A)--#{Regexp.quote(boundary)}(?:#{EOL}|--)/m
+        @rx_max_size = boundary.bytesize + 6 # (\r\n-- at start, either \r\n or -- at finish)
         @head_regex = /(.*?#{EOL})#{EOL}/m
       end
 
-      def on_read(content)
-        handle_empty_content!(content)
-        @sbuf.concat content
-        run_parser
+      def parse(io)
+        outbuf = String.new
+        read_data(io, outbuf)
+
+        loop do
+          status =
+            case @state
+            when :FAST_FORWARD
+              handle_fast_forward
+            when :CONSUME_TOKEN
+              handle_consume_token
+            when :MIME_HEAD
+              handle_mime_head
+            when :MIME_BODY
+              handle_mime_body
+            else # when :DONE
+              return
+            end
+
+          read_data(io, outbuf) if status == :want_read
+        end
       end
 
       def result
         @collector.each do |part|
           part.get_data do |data|
             tag_multipart_encoding(part.filename, part.content_type, part.name, data)
-            @query_parser.normalize_params(@params, part.name, data, @query_parser.param_depth_limit)
+            @query_parser.normalize_params(@params, part.name, data)
           end
         end
         MultipartInfo.new @params.to_params_hash, @collector.find_all(&:file?).map(&:body)
       end
 
       private
 
-      def run_parser
-        loop do
-          case @state
-          when :FAST_FORWARD
-            break if handle_fast_forward == :want_read
-          when :CONSUME_TOKEN
-            break if handle_consume_token == :want_read
-          when :MIME_HEAD
-            break if handle_mime_head == :want_read
-          when :MIME_BODY
-            break if handle_mime_body == :want_read
-          when :DONE
-            break
-          end
-        end
+      def dequote(str) # From WEBrick::HTTPUtils
+        ret = (/\A"(.*)"\Z/ =~ str) ? $1 : str.dup
+        ret.gsub!(/\\(.)/, "\\1")
+        ret
       end
 
-      def handle_fast_forward
-        tok = consume_boundary
+      def read_data(io, outbuf)
+        content = io.read(@bufsize, outbuf)
+        handle_empty_content!(content)
+        @sbuf.concat(content)
+      end
 
-        if tok == :END_BOUNDARY && @sbuf.pos == @end_boundary_size && @sbuf.eos?
-          # stop parsing a buffer if a buffer is only an end boundary.
-          @state = :DONE
-        elsif tok
-          @state = :MIME_HEAD
-        else
-          raise EOFError, "bad content body" if @sbuf.rest_size >= @bufsize
-          :want_read
+      # This handles the initial parser state.  We read until we find the starting
+      # boundary, then we can transition to the next state. If we find the ending
+      # boundary, this is an invalid multipart upload, but keep scanning for opening
+      # boundary in that case. If no boundary found, we need to keep reading data
+      # and retry. It's highly unlikely the initial read will not consume the
+      # boundary.  The client would have to deliberately craft a response
+      # with the opening boundary beyond the buffer size for that to happen.
+      def handle_fast_forward
+        while true
+          case consume_boundary
+          when :BOUNDARY
+            # found opening boundary, transition to next state
+            @state = :MIME_HEAD
+            return
+          when :END_BOUNDARY
+            # invalid multipart upload, but retry for opening boundary
+          else
+            # no boundary found, keep reading data
+            return :want_read
+          end
         end
       end
 
       def handle_consume_token
         tok = consume_boundary
@@ -258,11 +288,11 @@
       def handle_mime_head
         if @sbuf.scan_until(@head_regex)
           head = @sbuf[1]
           content_type = head[MULTIPART_CONTENT_TYPE, 1]
           if name = head[MULTIPART_CONTENT_DISPOSITION, 1]
-            name = Rack::Auth::Digest::Params::dequote(name)
+            name = dequote(name)
           else
             name = head[MULTIPART_CONTENT_ID, 1]
           end
 
           filename = get_filename(head)
@@ -295,32 +325,33 @@
           end
           :want_read
         end
       end
 
-      def full_boundary; @full_boundary; end
-
+      # Scan until the we find the start or end of the boundary.
+      # If we find it, return the appropriate symbol for the start or
+      # end of the boundary.  If we don't find the start or end of the
+      # boundary, clear the buffer and return nil.
       def consume_boundary
-        while read_buffer = @sbuf.scan_until(BOUNDARY_REGEX)
-          case read_buffer.strip
-          when full_boundary then return :BOUNDARY
-          when @end_boundary then return :END_BOUNDARY
-          end
-          return if @sbuf.eos?
+        if read_buffer = @sbuf.scan_until(@body_regex)
+          read_buffer.end_with?(EOL) ? :BOUNDARY : :END_BOUNDARY
+        else
+          @sbuf.terminate
+          nil
         end
       end
 
       def get_filename(head)
         filename = nil
         case head
         when RFC2183
           params = Hash[*head.scan(DISPPARM).flat_map(&:compact)]
 
-          if filename = params['filename']
-            filename = $1 if filename =~ /^"(.*)"$/
-          elsif filename = params['filename*']
+          if filename = params['filename*']
             encoding, _, filename = filename.split("'", 3)
+          elsif filename = params['filename']
+            filename = $1 if filename =~ /^"(.*)"$/
           end
         when BROKEN
           filename = $1
           filename = $1 if filename =~ /^"(.*)"$/
         end
@@ -343,10 +374,11 @@
 
         filename
       end
 
       CHARSET = "charset"
+      deprecate_constant :CHARSET
 
       def tag_multipart_encoding(filename, content_type, name, body)
         name = name.to_s
         encoding = Encoding::UTF_8
 
@@ -363,21 +395,27 @@
             rest.each do |param|
               k, v = param.split('=', 2)
               k.strip!
               v.strip!
               v = v[1..-2] if v.start_with?('"') && v.end_with?('"')
-              encoding = Encoding.find v if k == CHARSET
+              if k == "charset"
+                encoding = begin
+                  Encoding.find v
+                rescue ArgumentError
+                  Encoding::BINARY
+                end
+              end
             end
           end
         end
 
         name.force_encoding(encoding)
         body.force_encoding(encoding)
       end
 
       def handle_empty_content!(content)
         if content.nil? || content.empty?
-          raise EOFError
+          raise EmptyContentError
         end
       end
     end
   end
 end