rcstorable.c

#
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#include "ruby.h"

// Raw byte type used for the serialized input buffer.
typedef unsigned char uchar;

// Forward declarations. thaw() is the only function declared without
// `static`; everything else is a file-local helper defined further down.
VALUE thaw(VALUE, VALUE);
static VALUE read_object();
static VALUE read_boolean();
static uint32_t read_32_bit_integer();
static uint32_t read_compact_size();
static void read_n_hash_pairs(VALUE, uint32_t);
static void read_n_array_entries(VALUE, uint32_t);
static VALUE read_string(bool);
static void read_magic_numbers();
static void check_pointer(uchar*);

// Type tags: the single byte read_object() consumes before each value.
// The notes describe how this file handles each tag, not the Storable
// specification itself.
enum perl_types
{
  PT_HASH_KEY   = 1,  // not handled by read_object() in this file
  PT_ARRAY      = 2,  // 32-bit element count, then that many objects
  PT_HASH       = 3,  // 32-bit pair count, then (value, key) pairs
  PT_VECTOR     = 4,  // marker that is skipped; the next object is returned
  PT_UNDEF      = 5,  // decoded as nil
  PT_BOOLEAN    = 8,  // one byte: 128 decodes as false, anything else as true
  PT_INT32      = 9,  // four bytes, most significant first
  PT_STRING     = 10, // one-byte length, then that many bytes
  PT_STRING_ALT = 23  // handled exactly like PT_STRING
};

// Parser state shared by thaw() and every read_* helper. Because these are
// file-scope variables that thaw() overwrites on each call, a nested or
// concurrent thaw() would clobber the state of the one already running.
//   serialized     - read cursor: the next unread byte of the input
//   serialized_end - one past the last byte of the input
static uchar *serialized;
static uchar *serialized_end;
#

Given a Perl Storable frozen blob, decode it into a Ruby data structure.

// Decode a Perl Storable frozen blob into Ruby objects.
//
// self - receiver of the singleton method (unused).
// str  - a Ruby String holding the frozen data, or nil.
//
// Returns nil when str is nil, otherwise whatever read_object() decodes
// after the two header bytes have been skipped. Check_Type raises TypeError
// when str is not a String; the read helpers raise RangeError
// ("malformed data") when the input ends early.
VALUE
thaw(VALUE self, VALUE str)
{
  if (str == Qnil) return Qnil; // Do something logical with nil.

  Check_Type(str, T_STRING);
  extern uchar *serialized, *serialized_end;

  // RSTRING_PTR yields a char *, but the parser works on unsigned bytes.
  // Convert explicitly: char * and unsigned char * are not assignment
  // compatible without a cast.
  serialized = (uchar *)RSTRING_PTR(str);
  serialized_end = serialized + RSTRING_LEN(str);

  read_magic_numbers();

  return read_object();
}
#

Malformed strings can theoretically cause segfaults. Segfaults are bad. We’ll check pretty much everything we do against the pre-computed end-of-string.

// Guard against reading past the end of the input.
//
// ptr - address of the LAST byte the caller is about to read.
//
// Raises RangeError ("malformed data") unless ptr lies strictly before
// serialized_end. serialized_end points one past the final input byte, so
// a pointer equal to it must be rejected as well: callers dereference the
// address they pass in, and accepting it would read beyond the data.
static void
check_pointer(uchar *ptr)
{
  extern uchar *serialized_end;
  if (ptr >= serialized_end) {
    rb_raise(rb_eRangeError, "malformed data");
  }
}
#

Certain test cases start with \005\006; others don’t. For now the first two bytes are skipped unconditionally without being inspected. This will need to be fixed eventually.

// Skip the header at the front of the blob. The bytes are not inspected;
// the cursor simply moves past them once check_pointer() has accepted the
// address of the last header byte.
static void
read_magic_numbers()
{
  extern uchar *serialized;
  enum { HEADER_LENGTH = 2 };

  check_pointer(serialized + (HEADER_LENGTH - 1));
  serialized += HEADER_LENGTH;
}
#

Figure out what type of object is at the front of serialized, read it in, potentially recursively creating several other sub-objects in the process, and return it.

// Decode the object at the front of `serialized`.
//
// Consumes one type-tag byte and then whatever payload that tag implies,
// recursing for containers. Returns the decoded Ruby object; tags this
// function does not recognise decode to nil. Raises RangeError (via
// check_pointer) when the input ends before the object is complete.
static VALUE
read_object(void)
{
  extern uchar *serialized;
  check_pointer(serialized);
  uint32_t type = *serialized++;
  uint32_t size = 0;

  VALUE object = Qnil;

  switch (type) {
  case PT_UNDEF:
    object = Qnil;
    break;
  case PT_HASH:
    object = rb_hash_new();
    size = read_32_bit_integer();
    read_n_hash_pairs(object, size);
    break;
  case PT_INT32:
    // The four payload bytes are a signed 32-bit quantity, so reinterpret
    // them as int32_t (otherwise negative numbers come out as large
    // positive ones). INT2NUM range-checks, unlike INT2FIX, which silently
    // corrupts values that do not fit in a Fixnum on 32-bit builds.
    object = INT2NUM((int32_t)read_32_bit_integer());
    break;
  case PT_ARRAY:
    object = rb_ary_new();
    size = read_32_bit_integer();
    read_n_array_entries(object, size);
    break;
  case PT_BOOLEAN:
    object = read_boolean();
    break;
  case PT_STRING:
  case PT_STRING_ALT:
    object = read_string(false);
    break;
  case PT_VECTOR:
    object = read_object(); // This is a marker we can just ignore...
    break;
  default:
    // Unrecognised tag: leave the result as nil.
    break;
  }

  return object;
}

  
#

We’ve already created a hash, and read the size of it. Now we need to read in n items, and add them to the hash.

// Read `num` pairs from the input and store them in `hash`.
//
// hash - an already-created Ruby Hash to fill.
// num  - number of pairs to read, as announced by the input.
//
// Each pair is laid out as the value object followed by its key, and the
// key is a string with a 32-bit length prefix. This is a loop rather than
// recursion because `num` comes from untrusted input; one stack frame per
// element could exhaust the C stack on large (or maliciously sized) hashes.
static void
read_n_hash_pairs(VALUE hash, uint32_t num)
{
  for (uint32_t i = 0; i < num; i++) {
    VALUE value = read_object();      // the value comes first...
    VALUE key = read_string(true);    // ...then its key
    rb_hash_aset(hash, key, value);
  }
}

static VALUE
read_boolean()
{
  extern uchar *serialized;
  check_pointer(serialized);
  return (*serialized++ == 128) ? Qfalse : Qtrue;
}
#

We’ve already created an array, and read the size of it. Now we need to read in n items, and add them to the array.

// Read `num` objects from the input and append them to `array` in order.
//
// array - an already-created Ruby Array to fill.
// num   - number of elements to read, as announced by the input.
//
// This is a loop rather than recursion because `num` comes from untrusted
// input; one stack frame per element could exhaust the C stack on large
// (or maliciously sized) arrays.
static void
read_n_array_entries(VALUE array, uint32_t num)
{
  for (uint32_t i = 0; i < num; i++) {
    rb_ary_push(array, read_object());
  }
}
#

Given a size, read in a string of that size. Note that Storable seems to use 319 as a magic value, meaning the string should be read until a very low character is found. I should test this more specifically, but the cutoff used here is any byte lower than 7 (“\a”; note that “\t” is 9, not 7).

static VALUE
read_string(bool extended_size)
{
  extern uchar *serialized;
  check_pointer(serialized);

  VALUE ret;
  uint32_t size = extended_size ? read_32_bit_integer() : read_compact_size();
  uint32_t actual_size = 0;
  uchar *tp = serialized;

  if (size == 319) { // apparently Storable uses \000\000\001\077 to mean "read until n<7"
    while (*tp++ >= 7) {
      check_pointer(tp);
      actual_size++;
    }
    size = actual_size;
  }
  
  uchar *np = malloc(size+1);
  check_pointer(serialized+size-1);
  memcpy(np, serialized, size);
  serialized += size;
  
  ret = rb_str_new(np, size);
  free(np);
  return ret;
}
#

Extended sizes are given as four bytes [w,x,y,z], most significant first, where the size is 16777216*w + 65536*x + 256*y + z. This should really be read as a uint32_t, I guess.

// Read four bytes, most significant first, as an unsigned 32-bit value and
// advance the cursor past them. Raises RangeError (via check_pointer) when
// fewer than four bytes remain.
static uint32_t
read_32_bit_integer(void)
{
  extern uchar *serialized;

  uint32_t size = 0;

  check_pointer(serialized+3);

  // Assemble the value byte by byte so host byte order never matters. The
  // accumulator is unsigned: multiplying a promoted (signed int) byte by
  // 16777216 overflows int for any byte >= 128, which is undefined.
  for (int i = 0; i < 4; i++) {
    size = (size << 8) | (uint32_t)*serialized++;
  }

  return size;
}
#

Just one byte.

// Read a one-byte length prefix and advance the cursor past it.
static uint32_t
read_compact_size() {
  extern uchar *serialized;
  check_pointer(serialized);

  uint32_t length = *serialized;
  serialized += 1;
  return length;
}


// Extension entry point: defines the RCStorable module and registers
// thaw() on it as a singleton method that takes one argument.
void
Init_rcstorable()
{
  VALUE rcstorable_module = rb_define_module("RCStorable");

  rb_define_singleton_method(rcstorable_module, "thaw", thaw, 1);
}