lib/smarter_csv/parser.rb in smarter_csv-1.12.1 vs lib/smarter_csv/parser.rb in smarter_csv-1.13.0
- old
+ new
@@ -5,10 +5,12 @@
protected
###
### Thin wrapper around C-extension
###
+ ### NOTE: we are no longer passing-in header_size
+ ###
def parse(line, options, header_size = nil)
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
if options[:acceleration] && has_acceleration
# :nocov:
@@ -29,62 +31,86 @@
# parses a single line: either a CSV header and body line
# - quoting rules compared to RFC-4180 are somewhat relaxed
# - we are not assuming that quotes inside a fields need to be doubled
# - we are not assuming that all fields need to be quoted (0 is even)
# - works with multi-char col_sep
- # - if header_size is given, only up to header_size fields are parsed
#
- # We use header_size for parsing the body lines to make sure we always match the number of headers
- # in case there are trailing col_sep characters in line
+ # NOTE: we are no longer passing-in header_size
#
- # Our convention is that empty fields are returned as empty strings, not as nil.
+ # - if header_size was given, only up to header_size fields are parsed
#
+ # We used header_size for parsing the body lines to make sure we always match the number of headers
+ # in case there are trailing col_sep characters in line
#
- # the purpose of the max_size parameter is to handle a corner case where
- # CSV lines contain more fields than the header.
- # In which case the remaining fields in the line are ignored
+ # the purpose of the max_size parameter was to handle a corner case where
+ # CSV lines contain more fields than the header. In which case the remaining fields in the line were ignored
#
+ # Our convention is that empty fields are returned as empty strings, not as nil.
+
def parse_csv_line_ruby(line, options, header_size = nil)
- return [] if line.nil?
+ return [[], 0] if line.nil?
line_size = line.size
col_sep = options[:col_sep]
col_sep_size = col_sep.size
quote = options[:quote_char]
- quote_count = 0
elements = []
start = 0
i = 0
- previous_char = ''
+ backslash_count = 0
+ in_quotes = false
+
while i < line_size
- if line[i...i+col_sep_size] == col_sep && quote_count.even?
+ # Check if the current position matches the column separator and we're not inside quotes
+ if line[i...i+col_sep_size] == col_sep && !in_quotes
break if !header_size.nil? && elements.size >= header_size
elements << cleanup_quotes(line[start...i], quote)
- previous_char = line[i]
- i += col_sep.size
+ i += col_sep_size
start = i
+ backslash_count = 0 # Reset backslash count at the start of a new field
else
- quote_count += 1 if line[i] == quote && previous_char != '\\'
- previous_char = line[i]
+ if line[i] == '\\'
+ backslash_count += 1
+ else
+ if line[i] == quote
+ if backslash_count % 2 == 0
+ # Even number of backslashes means quote is not escaped
+ in_quotes = !in_quotes
+ end
+ # Else, quote is escaped; do nothing
+ end
+ backslash_count = 0 # Reset after any character other than backslash
+ end
i += 1
end
end
- elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
+
+ # Check for unclosed quotes at the end of the line
+ if in_quotes
+ raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
+ end
+
+ # Process the remaining field
+ if header_size.nil? || elements.size < header_size
+ elements << cleanup_quotes(line[start..-1], quote)
+ end
+
[elements, elements.size]
end
def cleanup_quotes(field, quote)
return field if field.nil?
- # return if field !~ /#{quote}/ # this check can probably eliminated
-
+ # Remove surrounding quotes if present
if field.start_with?(quote) && field.end_with?(quote)
- field.delete_prefix!(quote)
- field.delete_suffix!(quote)
+ field = field[1..-2]
end
- field.gsub!("#{quote}#{quote}", quote)
+
+ # Replace double quotes with a single quote
+ field.gsub!("#{quote * 2}", quote)
+
field
end
end
end