stream.rb in stream_lines-0.4.0

- old
+ new

@@ -10,13 +10,14 @@
       include Enumerable
       include HTTParty
 
       raise_on 400..599
 
-      def initialize(url)
+      def initialize(url, encoding: Encoding.default_external)
         @url = url
-        @buffer = StringIO.new
+        @encoding = encoding
+        @buffer = String.new(encoding: @encoding)
       end
 
       def each(&block)
         stream_lines(&block)
       rescue HTTParty::Error => e
@@ -31,24 +32,38 @@
         self.class.get(url, stream_body: true) do |chunk|
           lines = extract_lines(chunk)
           lines.each { |line| block.call(line) }
         end
 
-        @buffer.rewind
-        block.call(@buffer.read) if @buffer.size.positive?
+        block.call(@buffer) if @buffer.size.positive?
       end
 
       def extract_lines(chunk)
-        lines = chunk.split($INPUT_RECORD_SEPARATOR, -1)
+        encoded_chunk = @buffer + chunk.to_s.dup.force_encoding(@encoding)
+        lines = split_lines(encoded_chunk)
+        @buffer = String.new(encoding: @encoding)
+        @buffer << lines.pop.to_s
 
-        if lines.length > 1
-          @buffer.rewind
-          lines.first.prepend(@buffer.read)
-          @buffer = StringIO.new
-        end
-
-        @buffer << lines.pop
         lines
+      end
+
+      def split_lines(encoded_chunk)
+        encoded_chunk.split($INPUT_RECORD_SEPARATOR, -1)
+      rescue ArgumentError => e
+        raise e unless /invalid byte sequence/.match?(e.message)
+
+        # NOTE: (jdlubrano)
+        # The last byte in the chunk is most likely a part of a multibyte
+        # character that, on its own, is an invalid byte sequence.  So, we
+        # want to split the lines containing all valid bytes and make the
+        # trailing bytes the last line.  The last line eventually gets added
+        # to the buffer, prepended to the next chunk, and, hopefully, restores
+        # a valid byte sequence.
+        last_newline_index = encoded_chunk.rindex($INPUT_RECORD_SEPARATOR)
+        return [encoded_chunk] if last_newline_index.nil?
+
+        valid_lines = encoded_chunk[0...last_newline_index].split($INPUT_RECORD_SEPARATOR, -1)
+        valid_lines + [encoded_chunk[(last_newline_index + 1)..-1]].compact
       end
     end
   end
 end