lib/care.rb in format_parser-0.7.0 vs lib/care.rb in format_parser-0.8.0

- old
+ new

@@ -2,53 +2,83 @@ # possibly remote IO to parsers that tend to read (and skip) # in very small increments. This way, with a remote source that # is only available via HTTP, for example, we can have less # fetches and have them return more data for one fetch class Care + # Defines the size of a page in bytes that the Care will prefetch DEFAULT_PAGE_SIZE = 128 * 1024 + # Wraps any given IO with Care caching superpowers. Supports the subset + # of IO declared in IOConstraint. class IOWrapper + # Creates a new IOWrapper around the given source IO + # + # @param io[#seek, #pos, #size] the IO to wrap + # @param page_size[Integer] the size of the cache page to use for this wrapper def initialize(io, page_size: DEFAULT_PAGE_SIZE) @cache = Cache.new(page_size) @io = io @pos = 0 end + # Returns the size of the resource contained in the IO + # + # @return Integer def size @io.size end + # Seeks the IO to the given absolute offset from the start of the file/resource + # + # @param to[Integer] offset in the IO + # @return Integer def seek(to) @pos = to end + # Returns the current position/offset within the IO + # + # @return Integer def pos @pos end + # Returns at most `n_bytes` of data from the IO or less if less data was available + # before the EOF was hit + # + # @param n_bytes[Integer] + # @return [String, nil] the content read from the IO or `nil` if no data was available def read(n_bytes) return '' if n_bytes == 0 # As hardcoded for all Ruby IO objects raise ArgumentError, "negative length #{n_bytes} given" if n_bytes < 0 # also as per Ruby IO objects read = @cache.byteslice(@io, @pos, n_bytes) return unless read && !read.empty? @pos += read.bytesize read end + # Clears all the cached pages explicitly to help GC + # + # @return void def clear @cache.clear end + # Clears all the cached pages explicitly to help GC, and + # calls `#close` on the source IO if the IO responds to `#close` + # + # @return void def close clear @io.close if @io.respond_to?(:close) end end # Stores cached pages of data from the given IO as strings. # Pages are sized to be `page_size` or less (for the last page). class Cache + # Initializes a new cache pages container with pages of given size def initialize(page_size = DEFAULT_PAGE_SIZE) @page_size = page_size.to_i raise ArgumentError, 'The page size must be a positive Integer' unless @page_size > 0 @pages = {} @lowest_known_empty_page = nil @@ -57,10 +87,16 @@ # Returns the maximum possible byte string that can be # recovered from the given `io` at the given offset. # If the IO has been exhausted, `nil` will be returned # instead. Will use the cached pages where available, # or fetch pages where necessary + # + # @param io[#seek, #read] the IO to read data from + # @param at[Integer] at which offset we have to read + # @param n_bytes[Integer] how many bytes we want to read/cache + # @return [String, nil] the content read from the IO or `nil` if no data was available + # @raise ArgumentError def byteslice(io, at, n_bytes) if n_bytes < 1 raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}" end if at < 0 @@ -95,25 +131,33 @@ # Returning an empty string from read() is very confusing for the caller, # and no builtins do this - if we are at EOF we should return nil slice if slice && !slice.empty? end + # Clears the page cache of all strings with data + # + # @return void def clear @pages.clear end + # Hydrates a page at the certain index or returns the contents of + # that page if it is already in the cache + # + # @param io[IO] the IO to read from + # @param page_i[Integer] which page (zero-based) to hydrate and return def hydrate_page(io, page_i) # Avoid trying to read the page if we know there is no content to fill it # in the underlying IO return if @lowest_known_empty_page && page_i >= @lowest_known_empty_page @pages[page_i] ||= read_page(io, page_i) end + # We provide an overridden implementation of #inspect to avoid + # printing the actual contents of the cached pages def inspect - # To avoid page _contents_ in the inspect outputs we need to implement our own inspect. - # Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886 oid_str = (object_id << 1).to_s(16).rjust(16, '0') ivars = instance_variables ivars.delete(:@pages) @@ -122,13 +166,18 @@ end.join(' ') synthetic_vars = 'num_hydrated_pages=%d' % @pages.length '#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str] end + # Reads the requested page from the given IO + # + # @param io[IO] the IO to read from + # @param page_i[Integer] which page (zero-based) to read def read_page(io, page_i) + FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1) + io.seek(page_i * @page_size) read_result = io.read(@page_size) - if read_result.nil? # If the read went past the end of the IO the read result will be nil, # so we know our IO is exhausted here if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i @lowest_known_empty_page = page_i