lib/smarter_csv/parser.rb in smarter_csv-1.12.1 vs lib/smarter_csv/parser.rb in smarter_csv-1.13.0

- old
+ new

# Parses a single CSV line (header or body line) into its fields, in pure Ruby.
#
# Quoting rules are relaxed compared to RFC 4180:
# - fields do not have to be quoted
# - a quote character preceded by an odd number of backslashes counts as
#   escaped and neither opens nor closes a quoted section
# - col_sep may be longer than one character
#
# Empty fields are returned as empty strings, not as nil.
#
# NOTE: callers no longer pass header_size; the parameter is kept so that
# existing call sites continue to work.
#
# @param line [String, nil] the line to split
# @param options [Hash] reads :col_sep and :quote_char
# @param header_size [Integer, nil] when given, at most this many fields are
#   returned and the remainder of the line is ignored
# @return [Array] a two-element array: the list of fields, and the field count
# @raise [ArgumentError] if options[:col_sep] is empty
# @raise [MalformedCSV] if a quoted section is still open at the end of the line
def parse_csv_line_ruby(line, options, header_size = nil)
  return [[], 0] if line.nil?

  col_sep = options[:col_sep]
  col_sep_size = col_sep.size
  # An empty separator matches at every index without advancing the cursor,
  # which would make the loop below spin forever.
  raise ArgumentError, "col_sep must not be empty" if col_sep_size.zero?

  quote = options[:quote_char]
  line_size = line.size
  elements = []
  start = 0
  i = 0
  backslash_count = 0
  in_quotes = false

  while i < line_size
    # Test in_quotes first so no substring is allocated while inside quotes.
    if !in_quotes && line[i, col_sep_size] == col_sep
      break if header_size && elements.size >= header_size

      elements << cleanup_quotes(line[start...i], quote)
      i += col_sep_size
      start = i
      backslash_count = 0 # a new field starts with no pending backslashes
    else
      char = line[i]
      if char == '\\'
        backslash_count += 1
      else
        # An even number of preceding backslashes means the quote is not escaped.
        in_quotes = !in_quotes if char == quote && backslash_count.even?
        backslash_count = 0 # any non-backslash character ends the run
      end
      i += 1
    end
  end

  raise MalformedCSV, "Unclosed quoted field detected in line: #{line}" if in_quotes

  # Whatever follows the last separator is the final field.
  elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size

  [elements, elements.size]
end

# Strips one pair of surrounding quotes from a field and collapses doubled
# quotes into a single quote. The argument is never modified in place.
#
# @param field [String, nil] raw field text
# @param quote [String, nil] the quote character; nil or empty disables all processing
# @return [String, nil] the cleaned field, or the argument itself when there is nothing to do
def cleanup_quotes(field, quote)
  return field if field.nil? || quote.nil? || quote.empty?

  if field.start_with?(quote) && field.end_with?(quote)
    # A field that is just the quote itself yields an empty string here.
    field = field[quote.size...(field.size - quote.size)]
  end

  field.gsub(quote * 2, quote)
end