lib/stream_lines/reading/stream.rb in stream_lines-0.3.1 vs lib/stream_lines/reading/stream.rb in stream_lines-0.4.0
- old
+ new
@@ -10,13 +10,14 @@
include Enumerable
include HTTParty
raise_on 400..599
- def initialize(url)
+ def initialize(url, encoding: Encoding.default_external)
@url = url
- @buffer = StringIO.new
+ @encoding = encoding
+ @buffer = String.new(encoding: @encoding)
end
def each(&block)
stream_lines(&block)
rescue HTTParty::Error => e
@@ -31,24 +32,38 @@
self.class.get(url, stream_body: true) do |chunk|
lines = extract_lines(chunk)
lines.each { |line| block.call(line) }
end
- @buffer.rewind
- block.call(@buffer.read) if @buffer.size.positive?
+ block.call(@buffer) if @buffer.size.positive?
end
def extract_lines(chunk)
- lines = chunk.split($INPUT_RECORD_SEPARATOR, -1)
+ encoded_chunk = @buffer + chunk.to_s.dup.force_encoding(@encoding)
+ lines = split_lines(encoded_chunk)
+ @buffer = String.new(encoding: @encoding)
+ @buffer << lines.pop.to_s
- if lines.length > 1
- @buffer.rewind
- lines.first.prepend(@buffer.read)
- @buffer = StringIO.new
- end
-
- @buffer << lines.pop
lines
+ end
+
+ def split_lines(encoded_chunk)
+ encoded_chunk.split($INPUT_RECORD_SEPARATOR, -1)
+ rescue ArgumentError => e
+ raise e unless /invalid byte sequence/.match?(e.message)
+
+ # NOTE: (jdlubrano)
+ # The trailing bytes of the chunk are most likely part of a multibyte
+ # character cut off at the chunk boundary, which on their own form an
+ # invalid byte sequence. So, we split everything before the last line
+ # separator into lines and make the remainder the last line. The last line
+ # eventually gets added to the buffer, prepended to the next chunk, and,
+ # hopefully, restores a valid byte sequence.
+ last_newline_index = encoded_chunk.rindex($INPUT_RECORD_SEPARATOR)
+ return [encoded_chunk] if last_newline_index.nil?
+
+ valid_lines = encoded_chunk[0...last_newline_index].split($INPUT_RECORD_SEPARATOR, -1)
+ valid_lines + [encoded_chunk[(last_newline_index + 1)..-1]].compact
end
end
end
end