care.rb in format_parser-0.8.0

- old
+ new

@@ -2,53 +2,83 @@
 # possibly remote IO to parsers that tend to read (and skip)
 # in very small increments. This way, with a remote source that
 # is only available via HTTP, for example, we can have less
 # fetches and have them return more data for one fetch
 class Care
+  # Defines the size of a page in bytes that the Care will prefetch
   DEFAULT_PAGE_SIZE = 128 * 1024
 
+  # Wraps any given IO with Care caching superpowers. Supports the subset
+  # of IO declared in IOConstraint.
   class IOWrapper
+    # Creates a new IOWrapper around the given source IO
+    #
+    # @param io[#seek, #pos, #size] the IO to wrap
+    # @param page_size[Integer] the size of the cache page to use for this wrapper
     def initialize(io, page_size: DEFAULT_PAGE_SIZE)
       # Page cache built with the requested page size; #read fetches data through it
       @cache = Cache.new(page_size)
       # The wrapped source IO
       @io = io
       # The wrapper keeps its own position, starting at offset 0
       @pos = 0
     end
 
+    # Returns the size of the resource contained in the IO
+    #
+    # @return [Integer]
     def size
       # Delegated straight to the wrapped IO on every call; the value is not memoized here
       @io.size
     end
 
+    # Seeks the IO to the given absolute offset from the start of the file/resource
+    #
+    # @param to[Integer] offset in the IO
+    # @return [Integer]
     def seek(to)
       # Only the wrapper's own position is updated; the wrapped IO is not touched here.
       # The assignment is the last expression, so `to` is what gets returned.
       @pos = to
     end
 
+    # Returns the current position/offset within the IO
+    #
+    # @return [Integer]
     def pos
       # The position tracked by this wrapper (set by #seek, advanced by #read)
       @pos
     end
 
+    # Returns at most `n_bytes` of data from the IO or less if less data was available
+    # before the EOF was hit
+    #
+    # @param n_bytes[Integer]
+    # @return [String, nil] the content read from the IO or `nil` if no data was available
     def read(n_bytes)
       # A zero-length read yields an empty String, as hardcoded for all Ruby IO objects
       return '' if n_bytes == 0
       # Negative lengths are rejected, also as per Ruby IO objects
       raise ArgumentError, "negative length #{n_bytes} given" if n_bytes < 0
       # Fetch through the page cache, starting at the wrapper's current position
       chunk = @cache.byteslice(@io, @pos, n_bytes)
       # No data available: return nil and leave the position untouched
       return if !chunk || chunk.empty?
       # Advance by what was actually obtained, which may be less than n_bytes
       @pos += chunk.bytesize
       chunk
     end
 
+    # Clears all the cached pages explicitly to help GC
+    #
+    # @return void
     def clear
       # Delegates to the page cache; @io and @pos are left as they are
       @cache.clear
     end
 
+    # Clears all the cached pages explicitly to help GC, and
+    # calls `#close` on the source IO if the IO responds to `#close`
+    #
+    # @return void
     def close
       # Drop the cached pages first so they can be garbage collected
       clear
       # Sources that do not respond to #close are left alone
       return unless @io.respond_to?(:close)
       @io.close
     end
   end
 
   # Stores cached pages of data from the given IO as strings.
   # Pages are sized to be `page_size` or less (for the last page).
   class Cache
+    # Initializes a new cache pages container with pages of given size
     def initialize(page_size = DEFAULT_PAGE_SIZE)
       @page_size = page_size.to_i
       raise ArgumentError, 'The page size must be a positive Integer' unless @page_size > 0
       @pages = {}
       @lowest_known_empty_page = nil
@@ -57,10 +87,16 @@
     # Returns the maximum possible byte string that can be
     # recovered from the given `io` at the given offset.
     # If the IO has been exhausted, `nil` will be returned
     # instead. Will use the cached pages where available,
     # or fetch pages where necessary
+    #
+    # @param io[#seek, #read] the IO to read data from
+    # @param at[Integer] at which offset we have to read
+    # @param n_bytes[Integer] how many bytes we want to read/cache
+    # @return [String, nil] the content read from the IO or `nil` if no data was available
+    # @raise [ArgumentError]
     def byteslice(io, at, n_bytes)
       if n_bytes < 1
         raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}"
       end
       if at < 0
@@ -95,25 +131,33 @@
       # Returning an empty string from read() is very confusing for the caller,
       # and no builtins do this - if we are at EOF we should return nil
       slice if slice && !slice.empty?
     end
 
+    # Clears the page cache of all strings with data
+    #
+    # @return void
     def clear
       # Empties the page Hash in place; @lowest_known_empty_page is not reset here
       @pages.clear
     end
 
+    # Hydrates a page at the certain index or returns the contents of
+    # that page if it is already in the cache
+    #
+    # @param io[IO] the IO to read from
+    # @param page_i[Integer] which page (zero-based) to hydrate and return
     def hydrate_page(io, page_i)
       # Pages at or beyond the lowest page known to be empty cannot hold any
       # content, so skip reading from the underlying IO for them
       first_empty = @lowest_known_empty_page
       return if first_empty && page_i >= first_empty
       # Serve the page from the cache when it is already there...
       cached_page = @pages[page_i]
       return cached_page if cached_page
       # ...otherwise read it and store the result under its page index
       @pages[page_i] = read_page(io, page_i)
     end
 
+    # We provide an overridden implementation of #inspect to avoid
+    # printing the actual contents of the cached pages
     def inspect
-      # To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
-
       # Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
       oid_str = (object_id << 1).to_s(16).rjust(16, '0')
 
       ivars = instance_variables
       ivars.delete(:@pages)
@@ -122,13 +166,18 @@
       end.join(' ')
       synthetic_vars = 'num_hydrated_pages=%d' % @pages.length
       '#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
     end
 
+    # Reads the requested page from the given IO
+    #
+    # @param io[IO] the IO to read from
+    # @param page_i[Integer] which page (zero-based) to read
     def read_page(io, page_i)
+      FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
+
       io.seek(page_i * @page_size)
       read_result = io.read(@page_size)
-
       if read_result.nil?
         # If the read went past the end of the IO the read result will be nil,
         # so we know our IO is exhausted here
         if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i
           @lowest_known_empty_page = page_i