lib/stream_lines/reading/stream.rb in stream_lines-0.3.1 vs lib/stream_lines/reading/stream.rb in stream_lines-0.4.0

- old
+ new

@@ -10,13 +10,14 @@
       include Enumerable
       include HTTParty

       raise_on 400..599

-      def initialize(url)
+      def initialize(url, encoding: Encoding.default_external)
         @url = url
-        @buffer = StringIO.new
+        @encoding = encoding
+        @buffer = String.new(encoding: @encoding)
       end

       def each(&block)
         stream_lines(&block)
       rescue HTTParty::Error => e
@@ -31,24 +32,38 @@
         self.class.get(url, stream_body: true) do |chunk|
           lines = extract_lines(chunk)
           lines.each { |line| block.call(line) }
         end

-        @buffer.rewind
-        block.call(@buffer.read) if @buffer.size.positive?
+        block.call(@buffer) if @buffer.size.positive?
       end

       def extract_lines(chunk)
-        lines = chunk.split($INPUT_RECORD_SEPARATOR, -1)
+        encoded_chunk = @buffer + chunk.to_s.dup.force_encoding(@encoding)
+        lines = split_lines(encoded_chunk)
+        @buffer = String.new(encoding: @encoding)
+        @buffer << lines.pop.to_s

-        if lines.length > 1
-          @buffer.rewind
-          lines.first.prepend(@buffer.read)
-          @buffer = StringIO.new
-        end
-
-        @buffer << lines.pop
         lines
+      end
+
+      def split_lines(encoded_chunk)
+        encoded_chunk.split($INPUT_RECORD_SEPARATOR, -1)
+      rescue ArgumentError => e
+        raise e unless /invalid byte sequence/.match?(e.message)
+
+        # NOTE: (jdlubrano)
+        # The last byte in the chunk is most likely a part of a multibyte
+        # character that, on its own, is an invalid byte sequence. So, we
+        # want to split the lines containing all valid bytes and make the
+        # trailing bytes the last line. The last line eventually gets added
+        # to the buffer, prepended to the next chunk, and, hopefully, restores
+        # a valid byte sequence.
+        last_newline_index = encoded_chunk.rindex($INPUT_RECORD_SEPARATOR)
+        return [encoded_chunk] if last_newline_index.nil?
+
+        valid_lines = encoded_chunk[0...last_newline_index].split($INPUT_RECORD_SEPARATOR, -1)
+        valid_lines + [encoded_chunk[(last_newline_index + 1)..-1]].compact
       end
     end
   end
 end