parser.rb in smarter_csv-1.13.0

- old
+ new

@@ -5,10 +5,12 @@
     protected
 
     ###
     ### Thin wrapper around C-extension
     ###
+    ### NOTE: we are no longer passing in header_size (the parameter is still accepted and defaults to nil)
+    ###
     def parse(line, options, header_size = nil)
       # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
 
       if options[:acceleration] && has_acceleration
         # :nocov:
@@ -29,62 +31,86 @@
     # parses a single line: either a CSV header line or a body line
     # - quoting rules compared to RFC-4180 are somewhat relaxed
     # - we are not assuming that quotes inside a field need to be doubled
     # - we are not assuming that all fields need to be quoted (0 is even)
     # - works with multi-char col_sep
-    # - if header_size is given, only up to header_size fields are parsed
     #
-    # We use header_size for parsing the body lines to make sure we always match the number of headers
-    # in case there are trailing col_sep characters in line
+    # NOTE: we are no longer passing in header_size
     #
-    # Our convention is that empty fields are returned as empty strings, not as nil.
+    # - if header_size was given, only up to header_size fields are parsed
     #
+    #     We used header_size for parsing the body lines to make sure we always match the number of headers
+    #     in case there are trailing col_sep characters in line
     #
-    # the purpose of the max_size parameter is to handle a corner case where
-    # CSV lines contain more fields than the header.
-    # In which case the remaining fields in the line are ignored
+    #     the purpose of the header_size parameter was to handle a corner case where
+    #     CSV lines contain more fields than the header, in which case the remaining fields in the line were ignored
     #
+    # Our convention is that empty fields are returned as empty strings, not as nil.
+
     def parse_csv_line_ruby(line, options, header_size = nil)
-      return [] if line.nil?
+      return [[], 0] if line.nil?
 
       line_size = line.size
       col_sep = options[:col_sep]
       col_sep_size = col_sep.size
       quote = options[:quote_char]
-      quote_count = 0
       elements = []
       start = 0
       i = 0
 
-      previous_char = ''
+      backslash_count = 0
+      in_quotes = false
+
       while i < line_size
-        if line[i...i+col_sep_size] == col_sep && quote_count.even?
+        # Check if the current position matches the column separator and we're not inside quotes
+        if line[i...i+col_sep_size] == col_sep && !in_quotes
           break if !header_size.nil? && elements.size >= header_size
 
           elements << cleanup_quotes(line[start...i], quote)
-          previous_char = line[i]
-          i += col_sep.size
+          i += col_sep_size
           start = i
+          backslash_count = 0 # Reset backslash count at the start of a new field
         else
-          quote_count += 1 if line[i] == quote && previous_char != '\\'
-          previous_char = line[i]
+          if line[i] == '\\'
+            backslash_count += 1
+          else
+            if line[i] == quote
+              if backslash_count % 2 == 0
+                # Even number of backslashes means quote is not escaped
+                in_quotes = !in_quotes
+              end
+              # Else, quote is escaped; do nothing
+            end
+            backslash_count = 0 # Reset after any character other than backslash
+          end
           i += 1
         end
       end
-      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
+
+      # Check for unclosed quotes at the end of the line
+      if in_quotes
+        raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
+      end
+
+      # Process the remaining field
+      if header_size.nil? || elements.size < header_size
+        elements << cleanup_quotes(line[start..-1], quote)
+      end
+
       [elements, elements.size]
     end
 
     def cleanup_quotes(field, quote)
       return field if field.nil?
 
-      # return if field !~ /#{quote}/ # this check can probably eliminated
-
+      # Remove surrounding quotes if present
       if field.start_with?(quote) && field.end_with?(quote)
-        field.delete_prefix!(quote)
-        field.delete_suffix!(quote)
+        field = field[1..-2]
       end
-      field.gsub!("#{quote}#{quote}", quote)
+
+      # Collapse each doubled quote character into a single quote character
+      field.gsub!("#{quote * 2}", quote)
+
       field
     end
   end
 end