lib/io_streams/line/reader.rb in iostreams-0.15.0 vs lib/io_streams/line/reader.rb in iostreams-0.16.0

- old
+ new

@@ -1,125 +1,181 @@ module IOStreams module Line class Reader - attr_reader :delimiter, :buffer_size, :encoding, :strip_non_printable + attr_reader :delimiter, :buffer_size, :line_count + # Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read. + MAX_BLOCKS_MULTIPLIER = 100 + + LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze + # Read a line at a time from a file or stream def self.open(file_name_or_io, **args) if file_name_or_io.is_a?(String) IOStreams::File::Reader.open(file_name_or_io) { |io| yield new(io, **args) } else yield new(file_name_or_io, **args) end end - # Create a delimited UTF8 stream reader from the supplied input streams + # Create a delimited stream reader from the supplied input stream. # - # The input stream should be binary with no text conversions performed - # since `strip_non_printable` will be applied to the binary stream before - # converting to UTF-8 + # Lines returned will be in the encoding of the input stream. + # To change the encoding of returned lines, use IOStreams::Encode::Reader. # # Parameters # input_stream # The input stream that implements #read # # delimiter: [String] # Line / Record delimiter to use to break the stream up into records - # Any string to break the stream up by - # The records when saved will not include this delimiter + # Any string to break the stream up by. + # This delimiter is removed from each line when `#each` or `#readline` is called. # Default: nil # Automatically detect line endings and break up by line # Searches for the first "\r\n" or "\n" and then uses that as the - # delimiter for all subsequent records + # delimiter for all subsequent records. # # buffer_size: [Integer] - # Maximum size of the buffer into which to read the stream into for - # processing. - # Must be large enough to hold the entire first line and its delimiter(s) + # Size of blocks to read from the input stream at a time. 
# Default: 65536 ( 64K ) # - # strip_non_printable: [true|false] - # Strip all non-printable characters read from the file - # Default: false - # - # encoding: - # Force encoding to this encoding for all data being read - # Default: UTF8_ENCODING - # Set to nil to disable encoding - # # TODO: + # - Handle embedded line feeds when reading csv files. # - Skip Comment lines. RegExp? # - Skip "empty" / "blank" lines. RegExp? # - Extract header line(s) / first non-comment, non-blank line # - Embedded newline support, RegExp? or Proc? - def initialize(input_stream, delimiter: nil, buffer_size: 65536, encoding: UTF8_ENCODING, strip_non_printable: false) - @input_stream = input_stream - @delimiter = delimiter - @buffer_size = buffer_size - @encoding = encoding - @strip_non_printable = strip_non_printable + def initialize(input_stream, delimiter: nil, buffer_size: 65_536) + @input_stream = input_stream + @buffer_size = buffer_size - @delimiter.encode(UTF8_ENCODING) if @delimiter && @encoding - @buffer = '' + # More efficient read buffering only supported when the input stream `#read` method supports it. + @use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1) + + @line_count = 0 + @eof = false + @read_cache_buffer = nil + @buffer = nil + + read_block + # Auto-detect windows/linux line endings if not supplied. \n or \r\n + @delimiter = delimiter || auto_detect_line_endings + + unless eof? + # Change the delimiters encoding to match that of the input stream + @delimiter = @delimiter.encode(@buffer.encoding) + @delimiter_size = @delimiter.size + end end # Iterate over every line in the file/stream passing each line to supplied block in turn. # Returns [Integer] the number of lines read from the file/stream. - def each(&block) - partial = nil - loop do - if read_chunk == 0 - block.call(partial) if partial - return - end + # Note: + # * The line delimiter is _not_ returned. + def each + yield(readline) until eof? 
+ line_count + end - self.delimiter ||= detect_delimiter - end_index ||= (delimiter.size + 1) * -1 + def readline + return if eof? - @buffer.each_line(delimiter) do |line| - if line.end_with?(delimiter) - # Strip off delimiter - block.call(line[0..end_index]) - partial = nil - else - partial = line - end - end - @buffer = partial.nil? ? '' : partial + # Keep reading until it finds the delimiter + while (index = @buffer.index(@delimiter)).nil? && read_block end + + # Delimiter found? + if index + data = @buffer.slice(0, index) + @buffer = @buffer.slice(index + @delimiter_size, @buffer.size) + elsif @eof && @buffer.empty? + data = nil + @buffer = nil + else + # Last line without delimiter + data = @buffer + @buffer = nil + end + + @line_count += 1 + data end + # Returns whether the end of file has been reached for this stream + def eof? + @eof && (@buffer.nil? || @buffer.empty?) + end + private - attr_reader :buffer - attr_writer :delimiter + # Returns [Integer] the number of characters read into the internal buffer + # Returns 0 on EOF + def read_block + return false if @eof - NOT_PRINTABLE = Regexp.compile(/[^[:print:]|\r|\n]/) + block = + if @read_cache_buffer + begin + @input_stream.read(@buffer_size, @read_cache_buffer) + rescue ArgumentError + # Handle arity of -1 when just 0..1 + @read_cache_buffer = nil + @input_stream.read(@buffer_size) + end + else + @input_stream.read(@buffer_size) + end - # Returns [Integer] the number of bytes read into the internal buffer - # Returns 0 on EOF - def read_chunk - # TODO: read into existing buffer - chunk = @input_stream.read(@buffer_size) # EOF reached? - return 0 unless chunk + if block.nil? 
+ @eof = true + return false + elsif block.size < @buffer_size + @eof = true + end - # Strip out non-printable characters before converting to UTF-8 - chunk.gsub!(NOT_PRINTABLE, '') if @strip_non_printable + if @buffer + @buffer << block + else + # Take on the encoding from the input stream + @buffer = block.dup + # Take on the encoding from the first block that was read. + @read_cache_buffer = ''.encode(block.encoding) if @use_read_cache_buffer + end - @buffer << (@encoding ? chunk.force_encoding(@encoding) : chunk) - chunk.size + if @buffer.size > MAX_BLOCKS_MULTIPLIER * @buffer_size + raise( + Errors::DelimiterNotFound, + "Delimiter: #{@delimiter.inspect} not found after reading #{@buffer.size} bytes." + ) + end + + true end - # Auto detect text line delimiter - def detect_delimiter - if @buffer =~ /\r\n|\n\r|\n|\r/ - $& - elsif @buffer.size <= @buffer_size - # Handle one line files that are smaller than the buffer size - "\n" + # Auto-detect windows/linux line endings: \n, \r or \r\n + def auto_detect_line_endings + return "\n" if @buffer.nil? && !read_block + + # Could be "\r\n" broken in half by the block size + read_block if @buffer[-1] == "\r" + + # Delimiter takes on the encoding from @buffer + delimiter = @buffer.slice(LINEFEED_REGEXP) + return delimiter if delimiter + + while read_block + # Could be "\r\n" broken in half by the block size + read_block if @buffer[-1] == "\r" + + # Delimiter takes on the encoding from @buffer + delimiter = @buffer.slice(LINEFEED_REGEXP) + return delimiter if delimiter end - end + # One line files with no delimiter + "\n" + end end end end