# frozen_string_literal: true

module SmarterCSV
  module Parser
    protected

    ###
    ### Thin wrapper around C-extension
    ###
    ### NOTE: we are no longer passing-in header_size
    ###
    # Splits one CSV line into its fields.
    #
    # Uses parse_csv_line_c when options[:acceleration] is truthy and
    # has_acceleration reports true; otherwise falls back to parse_csv_line_ruby.
    #
    # @param line [String, nil] the raw line to split
    # @param options [Hash] reads :acceleration, :col_sep and :quote_char
    # @param header_size [Integer, nil] forwarded unchanged to the selected parser
    # @return [Array(Array, Integer)] the parsed fields and their count
    # @raise [MalformedCSV] from the Ruby parser when a quoted field is never closed
    def parse(line, options, header_size = nil)
      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

      if options[:acceleration] && has_acceleration
        # :nocov:
        quote_char = options[:quote_char]
        # Plain substring test: interpolating quote_char into a Regexp would treat
        # characters such as "*", "." or "|" as regex syntax (raising RegexpError
        # or matching every line).
        has_quotes = !line.nil? && line.include?(quote_char)
        elements = parse_csv_line_c(line, options[:col_sep], quote_char, header_size)
        # Quote clean-up is only needed when the line contains a quote character at all.
        elements.map! { |x| cleanup_quotes(x, quote_char) } if has_quotes
        [elements, elements.size]
        # :nocov:
      else
        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
        parse_csv_line_ruby(line, options, header_size)
      end
    end

    # ------------------------------------------------------------------
    # Ruby equivalent of the C-extension for parse_line
    #
    # Parses a single line: either a CSV header or a body line.
    # - quoting rules compared to RFC-4180 are somewhat relaxed
    # - we are not assuming that quotes inside a field need to be doubled
    # - we are not assuming that all fields need to be quoted
    # - a quote character preceded by an odd number of backslashes is treated
    #   as escaped and does not open or close a quoted section
    # - works with multi-char col_sep
    #
    # NOTE: we are no longer passing-in header_size
    #
    # - if header_size was given, only up to header_size fields are parsed
    #
    # We used header_size for parsing the body lines to make sure we always match
    # the number of headers in case there are trailing col_sep characters in line.
    #
    # The purpose of the header_size parameter was to handle a corner case where
    # CSV lines contain more fields than the header, in which case the remaining
    # fields in the line were ignored.
    #
    # Our convention is that empty fields are returned as empty strings, not as nil.
    #
    # @param line [String, nil] the raw line; nil yields [[], 0]
    # @param options [Hash] reads :col_sep and :quote_char
    # @param header_size [Integer, nil] maximum number of fields to return, or nil for no limit
    # @return [Array(Array<String>, Integer)] the parsed fields and their count
    # @raise [MalformedCSV] when the line ends while still inside a quoted section
    def parse_csv_line_ruby(line, options, header_size = nil)
      return [[], 0] if line.nil?

      line_size = line.size
      col_sep = options[:col_sep]
      col_sep_size = col_sep.size
      quote = options[:quote_char]
      elements = []
      start = 0 # index where the current field begins
      i = 0
      backslash_count = 0 # length of the run of backslashes directly before position i
      in_quotes = false

      while i < line_size
        # A separator only ends a field when we are outside quotes. Checking
        # in_quotes first avoids slicing the line while inside a quoted section.
        if !in_quotes && line[i, col_sep_size] == col_sep
          break if !header_size.nil? && elements.size >= header_size

          elements << cleanup_quotes(line[start...i], quote)
          i += col_sep_size
          start = i
          backslash_count = 0 # Reset backslash count at the start of a new field
        else
          char = line[i]
          if char == '\\'
            backslash_count += 1
          else
            # An even number of preceding backslashes means the quote is not escaped.
            in_quotes = !in_quotes if char == quote && backslash_count.even?
            backslash_count = 0 # Reset after any character other than backslash
          end
          i += 1
        end
      end

      # Check for unclosed quotes at the end of the line
      raise MalformedCSV, "Unclosed quoted field detected in line: #{line}" if in_quotes

      # Process the remaining field (skipped once header_size fields were collected)
      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size

      [elements, elements.size]
    end

    # Removes one pair of enclosing quote characters from a field and collapses
    # doubled quote characters into single ones.
    #
    # The argument is never modified in place, so frozen strings are accepted.
    # When nothing needs to change, the field is returned as is.
    #
    # @param field [String, nil] raw field text; nil is returned unchanged
    # @param quote [String] the quote character
    # @return [String, nil] the cleaned-up field
    def cleanup_quotes(field, quote)
      return field if field.nil?

      # Remove surrounding quotes if present (slicing builds a new string)
      field = field[1..-2] if field.start_with?(quote) && field.end_with?(quote)

      # Replace double quotes with a single quote
      doubled = quote * 2
      field.include?(doubled) ? field.gsub(doubled, quote) : field
    end
  end
end