lib/io_streams/line/reader.rb in iostreams-0.15.0 vs lib/io_streams/line/reader.rb in iostreams-0.16.0
- old
+ new
@@ -1,125 +1,181 @@
module IOStreams
module Line
class Reader
- attr_reader :delimiter, :buffer_size, :encoding, :strip_non_printable
+ attr_reader :delimiter, :buffer_size, :line_count
+ # Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
+ MAX_BLOCKS_MULTIPLIER = 100
+
+ LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze
+
# Read a line at a time from a file or stream.
#
# When a file name (String) is supplied the file is opened with
# IOStreams::File::Reader and a line reader for it is yielded to the block.
# Any other object is treated as an already open stream and wrapped directly.
# `args` are passed through to `new`.
def self.open(file_name_or_io, **args)
return yield(new(file_name_or_io, **args)) unless file_name_or_io.is_a?(String)

IOStreams::File::Reader.open(file_name_or_io) { |io| yield new(io, **args) }
end
- # Create a delimited UTF8 stream reader from the supplied input streams
+ # Create a delimited stream reader from the supplied input stream.
#
- # The input stream should be binary with no text conversions performed
- # since `strip_non_printable` will be applied to the binary stream before
- # converting to UTF-8
+ # Lines returned will be in the encoding of the input stream.
+ # To change the encoding of returned lines, use IOStreams::Encode::Reader.
#
# Parameters
# input_stream
# The input stream that implements #read
#
# delimiter: [String]
# Line / Record delimiter to use to break the stream up into records
- # Any string to break the stream up by
- # The records when saved will not include this delimiter
+ # Any string to break the stream up by.
+ # This delimiter is removed from each line when `#each` or `#readline` is called.
# Default: nil
# Automatically detect line endings and break up by line
# Searches for the first "\r\n", "\n" or "\r" and then uses that as the
- # delimiter for all subsequent records
+ # delimiter for all subsequent records.
#
# buffer_size: [Integer]
- # Maximum size of the buffer into which to read the stream into for
- # processing.
- # Must be large enough to hold the entire first line and its delimiter(s)
+ # Size of blocks to read from the input stream at a time.
# Default: 65536 ( 64K )
#
- # strip_non_printable: [true|false]
- # Strip all non-printable characters read from the file
- # Default: false
- #
- # encoding:
- # Force encoding to this encoding for all data being read
- # Default: UTF8_ENCODING
- # Set to nil to disable encoding
- #
# TODO:
+ # - Handle embedded line feeds when reading csv files.
# - Skip Comment lines. RegExp?
# - Skip "empty" / "blank" lines. RegExp?
# - Extract header line(s) / first non-comment, non-blank line
# - Embedded newline support, RegExp? or Proc?
- def initialize(input_stream, delimiter: nil, buffer_size: 65536, encoding: UTF8_ENCODING, strip_non_printable: false)
- @input_stream = input_stream
- @delimiter = delimiter
- @buffer_size = buffer_size
- @encoding = encoding
- @strip_non_printable = strip_non_printable
+ def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
+ @input_stream = input_stream
+ @buffer_size = buffer_size
- @delimiter.encode(UTF8_ENCODING) if @delimiter && @encoding
- @buffer = ''
+ # More efficient read buffering only supported when the input stream `#read` method supports it.
+ @use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
+
+ @line_count = 0
+ @eof = false
+ @read_cache_buffer = nil
+ @buffer = nil
+
+ read_block
+ # Auto-detect windows/linux line endings if not supplied. \n or \r\n
+ @delimiter = delimiter || auto_detect_line_endings
+
+ unless eof?
+ # Change the delimiters encoding to match that of the input stream
+ @delimiter = @delimiter.encode(@buffer.encoding)
+ @delimiter_size = @delimiter.size
+ end
end
# Iterate over every line in the file/stream passing each line to supplied block in turn.
# Returns [Integer] the number of lines read from the file/stream.
- def each(&block)
- partial = nil
- loop do
- if read_chunk == 0
- block.call(partial) if partial
- return
- end
+ # Note:
+ # * The line delimiter is _not_ returned.
+ def each
+ yield(readline) until eof?
+ line_count
+ end
- self.delimiter ||= detect_delimiter
- end_index ||= (delimiter.size + 1) * -1
+ def readline
+ return if eof?
- @buffer.each_line(delimiter) do |line|
- if line.end_with?(delimiter)
- # Strip off delimiter
- block.call(line[0..end_index])
- partial = nil
- else
- partial = line
- end
- end
- @buffer = partial.nil? ? '' : partial
+ # Keep reading until it finds the delimiter
+ while (index = @buffer.index(@delimiter)).nil? && read_block
end
+
+ # Delimiter found?
+ if index
+ data = @buffer.slice(0, index)
+ @buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
+ elsif @eof && @buffer.empty?
+ data = nil
+ @buffer = nil
+ else
+ # Last line without delimiter
+ data = @buffer
+ @buffer = nil
+ end
+
+ @line_count += 1
+ data
end
+ # Returns whether the end of file has been reached for this stream
+ def eof?
+ @eof && (@buffer.nil? || @buffer.empty?)
+ end
+
private
- attr_reader :buffer
- attr_writer :delimiter
+ # Returns [Integer] the number of characters read into the internal buffer
+ # Returns 0 on EOF
+ def read_block
+ return false if @eof
- NOT_PRINTABLE = Regexp.compile(/[^[:print:]|\r|\n]/)
+ block =
+ if @read_cache_buffer
+ begin
+ @input_stream.read(@buffer_size, @read_cache_buffer)
+ rescue ArgumentError
+ # Handle arity of -1 when just 0..1
+ @read_cache_buffer = nil
+ @input_stream.read(@buffer_size)
+ end
+ else
+ @input_stream.read(@buffer_size)
+ end
- # Returns [Integer] the number of bytes read into the internal buffer
- # Returns 0 on EOF
- def read_chunk
- # TODO: read into existing buffer
- chunk = @input_stream.read(@buffer_size)
# EOF reached?
- return 0 unless chunk
+ if block.nil?
+ @eof = true
+ return false
+ elsif block.size < @buffer_size
+ @eof = true
+ end
- # Strip out non-printable characters before converting to UTF-8
- chunk.gsub!(NOT_PRINTABLE, '') if @strip_non_printable
+ if @buffer
+ @buffer << block
+ else
+ # Take on the encoding from the input stream
+ @buffer = block.dup
+ # Take on the encoding from the first block that was read.
+ @read_cache_buffer = ''.encode(block.encoding) if @use_read_cache_buffer
+ end
- @buffer << (@encoding ? chunk.force_encoding(@encoding) : chunk)
- chunk.size
+ if @buffer.size > MAX_BLOCKS_MULTIPLIER * @buffer_size
+ raise(
+ Errors::DelimiterNotFound,
+ "Delimiter: #{@delimiter.inspect} not found after reading #{@buffer.size} bytes."
+ )
+ end
+
+ true
end
- # Auto detect text line delimiter
- def detect_delimiter
- if @buffer =~ /\r\n|\n\r|\n|\r/
- $&
- elsif @buffer.size <= @buffer_size
- # Handle one line files that are smaller than the buffer size
- "\n"
+ # Auto-detect windows/linux line endings: \n, \r or \r\n
+ def auto_detect_line_endings
+ return "\n" if @buffer.nil? && !read_block
+
+ # Could be "\r\n" broken in half by the block size
+ read_block if @buffer[-1] == "\r"
+
+ # Delimiter takes on the encoding from @buffer
+ delimiter = @buffer.slice(LINEFEED_REGEXP)
+ return delimiter if delimiter
+
+ while read_block
+ # Could be "\r\n" broken in half by the block size
+ read_block if @buffer[-1] == "\r"
+
+ # Delimiter takes on the encoding from @buffer
+ delimiter = @buffer.slice(LINEFEED_REGEXP)
+ return delimiter if delimiter
end
- end
+ # One line files with no delimiter
+ "\n"
+ end
end
end
end