lib/io_streams/line/reader.rb in iostreams-0.16.2 vs lib/io_streams/line/reader.rb in iostreams-0.17.0
- old
+ new
@@ -1,9 +1,9 @@
module IOStreams
module Line
class Reader
- attr_reader :delimiter, :buffer_size, :line_count
+ attr_reader :delimiter, :buffer_size, :line_number
# Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
MAX_BLOCKS_MULTIPLIER = 100
LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze
@@ -18,11 +18,11 @@
end
# Create a delimited stream reader from the supplied input stream.
#
# Lines returned will be in the encoding of the input stream.
- # To change the encoding of retruned lines, use IOStreams::Encode::Reader.
+ # To change the encoding of returned lines, use IOStreams::Encode::Reader.
#
# Parameters
# input_stream
# The input stream that implements #read
#
@@ -43,18 +43,19 @@
# - Handle embedded line feeds when reading csv files.
# - Skip Comment lines. RegExp?
# - Skip "empty" / "blank" lines. RegExp?
# - Extract header line(s) / first non-comment, non-blank line
# - Embedded newline support, RegExp? or Proc?
- def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
- @input_stream = input_stream
- @buffer_size = buffer_size
+ def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil)
+ @embedded_within = embedded_within
+ @input_stream = input_stream
+ @buffer_size = buffer_size
# More efficient read buffering only supported when the input stream `#read` method supports it.
@use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
- @line_count = 0
+ @line_number = 0
@eof = false
@read_cache_buffer = nil
@buffer = nil
read_block
@@ -71,47 +72,66 @@
# Iterate over every line in the file/stream passing each line to supplied block in turn.
# Returns [Integer] the number of lines read from the file/stream.
# Note:
# * The line delimiter is _not_ returned.
def each
+ line_count = 0
until eof?
line = readline
- yield(line) unless line.nil?
+ unless line.nil?
+ yield(line)
+ line_count += 1
+ end
end
line_count
end
+ # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
+ # The embedded_within argument is set in IOStreams::LineReader
def readline
+ line = _readline
+ if line && @embedded_within
+ initial_line_number = @line_number
+ while line.count(@embedded_within).odd?
+ raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
+ line << @delimiter
+ line << _readline
+ end
+ end
+ line
+ end
+
+ # Returns whether the end of file has been reached for this stream
+ def eof?
+ @eof && (@buffer.nil? || @buffer.empty?)
+ end
+
+ private
+
+ def _readline
return if eof?
# Keep reading until it finds the delimiter
while (index = @buffer.index(@delimiter)).nil? && read_block
end
# Delimiter found?
if index
- data = @buffer.slice(0, index)
- @buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
- @line_count += 1
+ data = @buffer.slice(0, index)
+ @buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
+ @line_number += 1
elsif @eof && @buffer.empty?
data = nil
@buffer = nil
else
# Last line without delimiter
- data = @buffer
- @buffer = nil
- @line_count += 1
+ data = @buffer
+ @buffer = nil
+ @line_number += 1
end
data
end
-
- # Returns whether the end of file has been reached for this stream
- def eof?
- @eof && (@buffer.nil? || @buffer.empty?)
- end
-
- private
# Returns [Integer] the number of characters read into the internal buffer
# Returns 0 on EOF
def read_block
return false if @eof