lib/perobs/BTreeBlob.rb in perobs-2.2.0 vs lib/perobs/BTreeBlob.rb in perobs-2.3.0

- old
+ new

@@ -23,27 +23,33 @@ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +require 'zlib' module PEROBS # This class manages the usage of the data blobs in the corresponding # HashedBlobsDB object. class BTreeBlob + # Magic number used for index files. + PEROBS_MAGIC = 0xB78EEDB + # For performance reasons we use an Array for the entries instead of a # Hash. These constants specify the Array index for the corresponding # value. ID = 0 # Number of bytes BYTES = 1 # Start Address START = 2 # Mark/Unmarked flag MARKED = 3 + # CRC Checksum of the data blobA + CRC = 4 # Create a new BTreeBlob object. # @param dir [String] Fully qualified directory name # @param btreedb [BTreeDB] Reference to the DB that owns this blob def initialize(dir, btreedb) @@ -66,11 +72,12 @@ split_blob # Insert the passed object into the newly created BTree node. @btreedb.put_raw_object(raw, id) else bytes = raw.bytesize - start_address = reserve_bytes(id, bytes) + crc32 = Zlib.crc32(raw, 0) + start_address = reserve_bytes(id, bytes, crc32) if write_to_blobs_file(raw, start_address) != bytes raise RuntimeError, 'Object length does not match written bytes' end write_index end @@ -78,26 +85,20 @@ # Read the entry for the given ID and return it as bytes. # @param id [Fixnum or Bignum] ID # @return [String] sequence of bytes or nil if ID is unknown def read_object(id) - return nil unless (bytes_and_start = find(id)) - read_from_blobs_file(*bytes_and_start) + return nil unless (index_entry = find(id)) + read_from_blobs_file(index_entry) end - # Find the data for the object with given id. # @param id [Fixnum or Bignum] Object ID - # @return [Array] Returns an Array with two Fixnum entries. The first is - # the number of bytes and the second is the starting offset in the - # blob storage file. + # @return [Array] Returns an Array that represents the index entry for the + # given object. def find(id) - if (entry = @entries_by_id[id]) - return [ entry[BYTES], entry[START] ] - end - - nil + @entries_by_id[id] end # Clear the mark on all entries in the index. def clear_marks @entries.each { |e| e[MARKED] = 0 } @@ -212,27 +213,34 @@ "Cannot write blobs file #{@blobs_file_name}: #{e.message}" end end # Read _bytes_ bytes from the file starting at offset _address_. - # @param bytes [Fixnum] number of bytes to read - # @param address [Fixnum] offset in the file - def read_from_blobs_file(bytes, address) + # @param entry [Array] Index entry for the object + # @return [String] Raw bytes of the blob. + def read_from_blobs_file(entry) begin - File.read(@blobs_file_name, bytes, address) + raw = File.read(@blobs_file_name, entry[BYTES], entry[START]) rescue => e raise IOError, "Cannot read blobs file #{@blobs_file_name}: #{e.message}" end + if Zlib.crc32(raw, 0) != entry[CRC] + raise RuntimeError, + "BTreeBlob for object #{entry[ID]} has been corrupted: " + + "Checksum mismatch" + end + + raw end # Reserve the bytes needed for the specified number of bytes with the # given ID. # @param id [Fixnum or Bignum] ID of the entry # @param bytes [Fixnum] number of bytes for this entry # @return [Fixnum] the start address of the reserved blob - def reserve_bytes(id, bytes) + def reserve_bytes(id, bytes, crc32) # index of first blob after the last seen entry end_of_last_entry = 0 # blob index of best fit segment best_fit_start = nil # best fir segment size in bytes @@ -270,11 +278,11 @@ # Create a new entry and insert it. The order must match the above # defined constants! # Object reads can trigger creation of new objects. As the marking # process triggers reads as well, all newly created objects are always # marked to prevent them from being collected right after creation. - entry = [ id, bytes, best_fit_start || end_of_last_entry, 1 ] + entry = [ id, bytes, best_fit_start || end_of_last_entry, 1, crc32 ] @entries.insert(best_fit_index, entry) @entries_by_id[id] = entry entry[START] end @@ -283,21 +291,61 @@ # The entries are stored in two data structures to provide the fastest # access mechanism for each situation. The Array @entries stores them in # a plan Array. @entries_by_id stores them hashed by their ID. @entries = [] @entries_by_id = {} + entry_bytes = 29 + entry_format = 'QQQCL' + restore_crc = false if File.exists?(@index_file_name) begin File.open(@index_file_name, 'rb') do |f| - # The index is a binary format. Each entry has exactly 25 bytes. + # Since version 2.3.0, all index files start with a header. + # Earlier versions did not yet have this header. The header is 24 + # bytes long. The 2nd set of 8 bytes must be 0 to distinguish the + # header from regular entries. The first 8 bytes are a magic + # number and the 3rd 8 bytes mark the schema version. We are + # currently at version 1. + if f.size >= 24 + header = f.read(24).unpack('QQQ') + if header[0] != PEROBS_MAGIC && header[1] != 0 + # These are the settings for the pre 2.3.0 entry format. + entry_bytes = 25 + entry_format = 'QQQC' + restore_crc = true + # Rewind to start as we have an older version index file that + # has no header. + f.seek(0) + end + end + + # The index is a binary format. Each entry has exactly 29 bytes. + # Version 2.2.0 and earlier did not have the CRC field. To ensure + # backwards compatibility with older databases, we reconstruct the + # CRC for older index files and convert it to the new format on + # the next index write. + # # Bytes # 0 - 7 : 64 bits, little endian : ID # 8 - 15 : 64 bits, little endian : Entry length in bytes # 16 - 23 : 64 bits, little endian : Start address in data file # 24 : 8 bits : 0 if unmarked, 1 if marked - while (bytes = f.read(25)) - @entries << (e = bytes.unpack('QQQC')) + # 25 - 29 : 32 bits, CRC32 checksum of the data blob + while (bytes = f.read(entry_bytes)) + e = bytes.unpack(entry_format) + if restore_crc + # If the index file was written with version <= 2.2.0 we have + # to compute the CRC from the data blob. + begin + raw = File.read(@blobs_file_name, e[BYTES], e[START]) + rescue => e + raise IOError, + "Cannot read blobs file #{@blobs_file_name}: #{e.message}" + end + e[CRC] = Zlib.crc32(raw) + end + @entries << e @entries_by_id[e[ID]] = e end end rescue => e raise RuntimeError, @@ -308,12 +356,13 @@ def write_index begin File.open(@index_file_name, 'wb') do |f| # See read_index for data format documentation. + f.write([ PEROBS_MAGIC, 0, 1].pack('QQQ')) @entries.each do |entry| - f.write(entry.pack('QQQC')) + f.write(entry.pack('QQQCL')) end end rescue => e raise RuntimeError, "Cannot write BTreeBlob index file #{@index_file_name}: " + @@ -327,10 +376,10 @@ # Read all entries from the blob and re-store them into the DB. We've # already created the new BTree node, so these entries will be # distributed into new leaf blobs of this new node. @entries.each do |entry| - raw = read_from_blobs_file(entry[BYTES], entry[START]) + raw = read_from_blobs_file(entry) @btreedb.put_raw_object(raw, entry[ID]) end # Once the entries are re-stored, we can delete the old blob files. File.delete(@index_file_name + '.bak')