ext/fast_jsonparser/simdjson.cpp in fast_jsonparser-0.2.0 vs ext/fast_jsonparser/simdjson.cpp in fast_jsonparser-0.3.0

- lines come from the old file (fast_jsonparser-0.2.0)
+ lines come from the new file (fast_jsonparser-0.3.0)

@@ -1,14 +1,48 @@ -/* auto-generated on Thu 2 Apr 2020 18:58:25 EDT. Do not edit! */ +/* auto-generated on Mon Jul 6 18:16:52 EDT 2020. Do not edit! */ +/* begin file src/simdjson.cpp */ #include "simdjson.h" -/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */ -#ifdef DMALLOC -#include "dmalloc.h" -#endif +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_UNDESIRED_WARNINGS -/* begin file src/simdjson.cpp */ +/* begin file src/error.cpp */ + +namespace simdjson +{ + namespace internal + { + + SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[]{ + {SUCCESS, "No error"}, + {CAPACITY, "This parser can't support a document that big"}, + {MEMALLOC, "Error allocating memory, we're most likely out of memory"}, + {TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc."}, + {DEPTH_ERROR, "The JSON document was too deep (too many nested objects and arrays)"}, + {STRING_ERROR, "Problem while parsing a string"}, + {T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'"}, + {F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'"}, + {N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'"}, + {NUMBER_ERROR, "Problem while parsing a number"}, + {UTF8_ERROR, "The input is not valid UTF-8"}, + {UNINITIALIZED, "Uninitialized"}, + {EMPTY, "Empty: no JSON found"}, + {UNESCAPED_CHARS, "Within strings, some characters must be escaped, we found unescaped characters"}, + {UNCLOSED_STRING, "A string is opened, but never closed."}, + {UNSUPPORTED_ARCHITECTURE, "simdjson does not have an implementation supported by this CPU architecture (perhaps it's a non-SIMD CPU?)."}, + {INCORRECT_TYPE, "The JSON element does not have the requested type."}, + {NUMBER_OUT_OF_RANGE, "The JSON number is too large or too small to fit within the requested type."}, + {INDEX_OUT_OF_BOUNDS, "Attempted to access an element of a JSON array that is beyond its length."}, + {NO_SUCH_FIELD, "The JSON field referenced does not exist in this object."}, + {IO_ERROR, "Error reading the file."}, + {INVALID_JSON_POINTER, "Invalid JSON pointer syntax."}, + {INVALID_URI_FRAGMENT, "Invalid URI fragment syntax."}, + {UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson"}}; // error_messages[] + + } // namespace internal +} // namespace simdjson +/* end file src/error.cpp */ /* begin file src/implementation.cpp */ /* begin file src/isadetection.h */ /* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h Highly modified. 
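
The hunk above swaps the old Dmalloc hooks for a new src/error.cpp whose error_codes[] table maps every simdjson error_code to a human-readable message. The standalone sketch below illustrates the lookup pattern implied by that table; it is not the library's public API (the enum subset and message_for() are local stand-ins, only four of the real entries are reproduced), and it assumes the table is kept in the same order as the enum so the enum value can double as an index.

// Standalone sketch of the lookup pattern behind the error_codes[] table added in
// src/error.cpp above. Not simdjson's public API: the enum subset, the shortened
// table and message_for() are local stand-ins, and indexing by enum value assumes
// the table stays in enum order.
#include <cstdio>

namespace sketch {
  enum error_code { SUCCESS = 0, CAPACITY, MEMALLOC, TAPE_ERROR };

  struct error_code_info {
    error_code code;
    const char *message;
  };

  // Entries copied verbatim from the table in the diff.
  constexpr error_code_info error_codes[] = {
    {SUCCESS,    "No error"},
    {CAPACITY,   "This parser can't support a document that big"},
    {MEMALLOC,   "Error allocating memory, we're most likely out of memory"},
    {TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc."},
  };

  // Because the table is in the same order as the enum, the enum value is the index.
  inline const char *message_for(error_code code) {
    return error_codes[static_cast<int>(code)].message;
  }
} // namespace sketch

int main() {
  std::printf("%s\n", sketch::message_for(sketch::CAPACITY));
  return 0;
}
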
@@ -63,238 +97,567 @@ #include <intrin.h> #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) #include <cpuid.h> #endif -namespace simdjson { +namespace simdjson +{ -// Can be found on Intel ISA Reference for CPUID -constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 -constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 -constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 -constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 -constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 + enum instruction_set + { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, + PCLMULQDQ = 0x10, + BMI1 = 0x20, + BMI2 = 0x40 + }; -enum instruction_set { - DEFAULT = 0x0, - NEON = 0x1, - AVX2 = 0x4, - SSE42 = 0x8, - PCLMULQDQ = 0x10, - BMI1 = 0x20, - BMI2 = 0x40 -}; - #if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64 #if defined(__ARM_NEON) -static inline uint32_t detect_supported_architectures() { - return instruction_set::NEON; -} + static inline uint32_t detect_supported_architectures() + { + return instruction_set::NEON; + } #else // ARM without NEON -static inline uint32_t detect_supported_architectures() { - return instruction_set::DEFAULT; -} + static inline uint32_t detect_supported_architectures() + { + return instruction_set::DEFAULT; + } #endif -#else // x86 -static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx) { +#elif defined(__x86_64__) || defined(_M_AMD64) // x64 + + namespace + { + // Can be found on Intel ISA Reference for CPUID + constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 + constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 + constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 + constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 + constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 + } // namespace + + static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) + { #if defined(_MSC_VER) - int cpu_info[4]; - __cpuid(cpu_info, *eax); - *eax = cpu_info[0]; - *ebx = cpu_info[1]; - *ecx = cpu_info[2]; - *edx = cpu_info[3]; + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) - uint32_t level = *eax; - __get_cpuid(level, eax, ebx, ecx, edx); + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); #else - uint32_t a = *eax, b, c = *ecx, d; - asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); - *eax = a; - *ebx = b; - *ecx = c; - *edx = d; + uint32_t a = *eax, b, c = *ecx, d; + asm volatile("cpuid\n\t" + : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; #endif -} + } -static inline uint32_t detect_supported_architectures() { - uint32_t eax, ebx, ecx, edx; - uint32_t host_isa = 0x0; + static inline uint32_t detect_supported_architectures() + { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; - // ECX for EAX=0x7 - eax = 0x7; - ecx = 0x0; - cpuid(&eax, &ebx, &ecx, &edx); - if (ebx & cpuid_avx2_bit) { - host_isa |= instruction_set::AVX2; - } - if (ebx & cpuid_bmi1_bit) { - host_isa |= instruction_set::BMI1; - } + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + 
cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) + { + host_isa |= instruction_set::AVX2; + } + if (ebx & cpuid_bmi1_bit) + { + host_isa |= instruction_set::BMI1; + } - if (ebx & cpuid_bmi2_bit) { - host_isa |= instruction_set::BMI2; - } + if (ebx & cpuid_bmi2_bit) + { + host_isa |= instruction_set::BMI2; + } - // EBX for EAX=0x1 - eax = 0x1; - cpuid(&eax, &ebx, &ecx, &edx); + // EBX for EAX=0x1 + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); - if (ecx & cpuid_sse42_bit) { - host_isa |= instruction_set::SSE42; + if (ecx & cpuid_sse42_bit) + { + host_isa |= instruction_set::SSE42; + } + + if (ecx & cpuid_pclmulqdq_bit) + { + host_isa |= instruction_set::PCLMULQDQ; + } + + return host_isa; } +#else // fallback - if (ecx & cpuid_pclmulqdq_bit) { - host_isa |= instruction_set::PCLMULQDQ; + static inline uint32_t detect_supported_architectures() + { + return instruction_set::DEFAULT; } - return host_isa; -} - #endif // end SIMD extension detection code -} // namespace simdjson::internal +} // namespace simdjson #endif // SIMDJSON_ISADETECTION_H /* end file src/isadetection.h */ /* begin file src/simdprune_tables.h */ #ifndef SIMDJSON_SIMDPRUNE_TABLES_H #define SIMDJSON_SIMDPRUNE_TABLES_H + +#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE + #include <cstdint> -namespace simdjson { // table modified and copied from - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable -static const unsigned char BitsSetTable256mul2[256] = { - 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, - 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, - 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, - 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, - 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, - 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, - 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, - 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, - 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, - 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, - 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, - 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, - 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, - 14, 10, 12, 12, 14, 12, 14, 14, 16}; +namespace simdjson +{ // table modified and copied from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable + static const unsigned char BitsSetTable256mul2[256] = { + 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, + 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, + 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, + 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, + 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, + 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, + 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, + 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, + 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, + 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, + 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, + 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, + 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, + 14, 10, 12, 12, 14, 12, 14, 14, 16}; -static const uint8_t pshufb_combine_table[272] = { - 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, - 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, - 0x0f, 0x80, 0x80, 0x80, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x00, 0x01, 0x02, 0x08, - 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, - 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x09, 0x0a, 0x0b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -}; + static const uint8_t pshufb_combine_table[272] = { + 0x00, + 0x01, + 0x02, + 0x03, + 0x04, + 0x05, + 0x06, + 0x07, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x00, + 0x01, + 0x02, + 0x03, + 0x04, + 0x05, + 0x06, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x00, + 0x01, + 0x02, + 0x03, + 0x04, + 0x05, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x00, + 0x01, + 0x02, + 0x03, + 0x04, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x00, + 0x01, + 0x02, + 0x03, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x80, + 0x00, + 0x01, + 0x02, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x00, + 0x01, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x00, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x08, + 0x09, + 0x0a, + 0x0b, + 0x0c, + 0x0d, + 0x0e, + 0x0f, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + }; -// 256 * 8 bytes = 2kB, easily fits in cache. 
-static const uint64_t thintable_epi8[256] = { - 0x0706050403020100, 0x0007060504030201, 0x0007060504030200, - 0x0000070605040302, 0x0007060504030100, 0x0000070605040301, - 0x0000070605040300, 0x0000000706050403, 0x0007060504020100, - 0x0000070605040201, 0x0000070605040200, 0x0000000706050402, - 0x0000070605040100, 0x0000000706050401, 0x0000000706050400, - 0x0000000007060504, 0x0007060503020100, 0x0000070605030201, - 0x0000070605030200, 0x0000000706050302, 0x0000070605030100, - 0x0000000706050301, 0x0000000706050300, 0x0000000007060503, - 0x0000070605020100, 0x0000000706050201, 0x0000000706050200, - 0x0000000007060502, 0x0000000706050100, 0x0000000007060501, - 0x0000000007060500, 0x0000000000070605, 0x0007060403020100, - 0x0000070604030201, 0x0000070604030200, 0x0000000706040302, - 0x0000070604030100, 0x0000000706040301, 0x0000000706040300, - 0x0000000007060403, 0x0000070604020100, 0x0000000706040201, - 0x0000000706040200, 0x0000000007060402, 0x0000000706040100, - 0x0000000007060401, 0x0000000007060400, 0x0000000000070604, - 0x0000070603020100, 0x0000000706030201, 0x0000000706030200, - 0x0000000007060302, 0x0000000706030100, 0x0000000007060301, - 0x0000000007060300, 0x0000000000070603, 0x0000000706020100, - 0x0000000007060201, 0x0000000007060200, 0x0000000000070602, - 0x0000000007060100, 0x0000000000070601, 0x0000000000070600, - 0x0000000000000706, 0x0007050403020100, 0x0000070504030201, - 0x0000070504030200, 0x0000000705040302, 0x0000070504030100, - 0x0000000705040301, 0x0000000705040300, 0x0000000007050403, - 0x0000070504020100, 0x0000000705040201, 0x0000000705040200, - 0x0000000007050402, 0x0000000705040100, 0x0000000007050401, - 0x0000000007050400, 0x0000000000070504, 0x0000070503020100, - 0x0000000705030201, 0x0000000705030200, 0x0000000007050302, - 0x0000000705030100, 0x0000000007050301, 0x0000000007050300, - 0x0000000000070503, 0x0000000705020100, 0x0000000007050201, - 0x0000000007050200, 0x0000000000070502, 0x0000000007050100, - 0x0000000000070501, 0x0000000000070500, 0x0000000000000705, - 0x0000070403020100, 0x0000000704030201, 0x0000000704030200, - 0x0000000007040302, 0x0000000704030100, 0x0000000007040301, - 0x0000000007040300, 0x0000000000070403, 0x0000000704020100, - 0x0000000007040201, 0x0000000007040200, 0x0000000000070402, - 0x0000000007040100, 0x0000000000070401, 0x0000000000070400, - 0x0000000000000704, 0x0000000703020100, 0x0000000007030201, - 0x0000000007030200, 0x0000000000070302, 0x0000000007030100, - 0x0000000000070301, 0x0000000000070300, 0x0000000000000703, - 0x0000000007020100, 0x0000000000070201, 0x0000000000070200, - 0x0000000000000702, 0x0000000000070100, 0x0000000000000701, - 0x0000000000000700, 0x0000000000000007, 0x0006050403020100, - 0x0000060504030201, 0x0000060504030200, 0x0000000605040302, - 0x0000060504030100, 0x0000000605040301, 0x0000000605040300, - 0x0000000006050403, 0x0000060504020100, 0x0000000605040201, - 0x0000000605040200, 0x0000000006050402, 0x0000000605040100, - 0x0000000006050401, 0x0000000006050400, 0x0000000000060504, - 0x0000060503020100, 0x0000000605030201, 0x0000000605030200, - 0x0000000006050302, 0x0000000605030100, 0x0000000006050301, - 0x0000000006050300, 0x0000000000060503, 0x0000000605020100, - 0x0000000006050201, 0x0000000006050200, 0x0000000000060502, - 0x0000000006050100, 0x0000000000060501, 0x0000000000060500, - 0x0000000000000605, 0x0000060403020100, 0x0000000604030201, - 0x0000000604030200, 0x0000000006040302, 0x0000000604030100, - 0x0000000006040301, 0x0000000006040300, 0x0000000000060403, - 0x0000000604020100, 
0x0000000006040201, 0x0000000006040200, - 0x0000000000060402, 0x0000000006040100, 0x0000000000060401, - 0x0000000000060400, 0x0000000000000604, 0x0000000603020100, - 0x0000000006030201, 0x0000000006030200, 0x0000000000060302, - 0x0000000006030100, 0x0000000000060301, 0x0000000000060300, - 0x0000000000000603, 0x0000000006020100, 0x0000000000060201, - 0x0000000000060200, 0x0000000000000602, 0x0000000000060100, - 0x0000000000000601, 0x0000000000000600, 0x0000000000000006, - 0x0000050403020100, 0x0000000504030201, 0x0000000504030200, - 0x0000000005040302, 0x0000000504030100, 0x0000000005040301, - 0x0000000005040300, 0x0000000000050403, 0x0000000504020100, - 0x0000000005040201, 0x0000000005040200, 0x0000000000050402, - 0x0000000005040100, 0x0000000000050401, 0x0000000000050400, - 0x0000000000000504, 0x0000000503020100, 0x0000000005030201, - 0x0000000005030200, 0x0000000000050302, 0x0000000005030100, - 0x0000000000050301, 0x0000000000050300, 0x0000000000000503, - 0x0000000005020100, 0x0000000000050201, 0x0000000000050200, - 0x0000000000000502, 0x0000000000050100, 0x0000000000000501, - 0x0000000000000500, 0x0000000000000005, 0x0000000403020100, - 0x0000000004030201, 0x0000000004030200, 0x0000000000040302, - 0x0000000004030100, 0x0000000000040301, 0x0000000000040300, - 0x0000000000000403, 0x0000000004020100, 0x0000000000040201, - 0x0000000000040200, 0x0000000000000402, 0x0000000000040100, - 0x0000000000000401, 0x0000000000000400, 0x0000000000000004, - 0x0000000003020100, 0x0000000000030201, 0x0000000000030200, - 0x0000000000000302, 0x0000000000030100, 0x0000000000000301, - 0x0000000000000300, 0x0000000000000003, 0x0000000000020100, - 0x0000000000000201, 0x0000000000000200, 0x0000000000000002, - 0x0000000000000100, 0x0000000000000001, 0x0000000000000000, - 0x0000000000000000, -}; //static uint64_t thintable_epi8[256] + // 256 * 8 bytes = 2kB, easily fits in cache. 
+ static const uint64_t thintable_epi8[256] = { + 0x0706050403020100, + 0x0007060504030201, + 0x0007060504030200, + 0x0000070605040302, + 0x0007060504030100, + 0x0000070605040301, + 0x0000070605040300, + 0x0000000706050403, + 0x0007060504020100, + 0x0000070605040201, + 0x0000070605040200, + 0x0000000706050402, + 0x0000070605040100, + 0x0000000706050401, + 0x0000000706050400, + 0x0000000007060504, + 0x0007060503020100, + 0x0000070605030201, + 0x0000070605030200, + 0x0000000706050302, + 0x0000070605030100, + 0x0000000706050301, + 0x0000000706050300, + 0x0000000007060503, + 0x0000070605020100, + 0x0000000706050201, + 0x0000000706050200, + 0x0000000007060502, + 0x0000000706050100, + 0x0000000007060501, + 0x0000000007060500, + 0x0000000000070605, + 0x0007060403020100, + 0x0000070604030201, + 0x0000070604030200, + 0x0000000706040302, + 0x0000070604030100, + 0x0000000706040301, + 0x0000000706040300, + 0x0000000007060403, + 0x0000070604020100, + 0x0000000706040201, + 0x0000000706040200, + 0x0000000007060402, + 0x0000000706040100, + 0x0000000007060401, + 0x0000000007060400, + 0x0000000000070604, + 0x0000070603020100, + 0x0000000706030201, + 0x0000000706030200, + 0x0000000007060302, + 0x0000000706030100, + 0x0000000007060301, + 0x0000000007060300, + 0x0000000000070603, + 0x0000000706020100, + 0x0000000007060201, + 0x0000000007060200, + 0x0000000000070602, + 0x0000000007060100, + 0x0000000000070601, + 0x0000000000070600, + 0x0000000000000706, + 0x0007050403020100, + 0x0000070504030201, + 0x0000070504030200, + 0x0000000705040302, + 0x0000070504030100, + 0x0000000705040301, + 0x0000000705040300, + 0x0000000007050403, + 0x0000070504020100, + 0x0000000705040201, + 0x0000000705040200, + 0x0000000007050402, + 0x0000000705040100, + 0x0000000007050401, + 0x0000000007050400, + 0x0000000000070504, + 0x0000070503020100, + 0x0000000705030201, + 0x0000000705030200, + 0x0000000007050302, + 0x0000000705030100, + 0x0000000007050301, + 0x0000000007050300, + 0x0000000000070503, + 0x0000000705020100, + 0x0000000007050201, + 0x0000000007050200, + 0x0000000000070502, + 0x0000000007050100, + 0x0000000000070501, + 0x0000000000070500, + 0x0000000000000705, + 0x0000070403020100, + 0x0000000704030201, + 0x0000000704030200, + 0x0000000007040302, + 0x0000000704030100, + 0x0000000007040301, + 0x0000000007040300, + 0x0000000000070403, + 0x0000000704020100, + 0x0000000007040201, + 0x0000000007040200, + 0x0000000000070402, + 0x0000000007040100, + 0x0000000000070401, + 0x0000000000070400, + 0x0000000000000704, + 0x0000000703020100, + 0x0000000007030201, + 0x0000000007030200, + 0x0000000000070302, + 0x0000000007030100, + 0x0000000000070301, + 0x0000000000070300, + 0x0000000000000703, + 0x0000000007020100, + 0x0000000000070201, + 0x0000000000070200, + 0x0000000000000702, + 0x0000000000070100, + 0x0000000000000701, + 0x0000000000000700, + 0x0000000000000007, + 0x0006050403020100, + 0x0000060504030201, + 0x0000060504030200, + 0x0000000605040302, + 0x0000060504030100, + 0x0000000605040301, + 0x0000000605040300, + 0x0000000006050403, + 0x0000060504020100, + 0x0000000605040201, + 0x0000000605040200, + 0x0000000006050402, + 0x0000000605040100, + 0x0000000006050401, + 0x0000000006050400, + 0x0000000000060504, + 0x0000060503020100, + 0x0000000605030201, + 0x0000000605030200, + 0x0000000006050302, + 0x0000000605030100, + 0x0000000006050301, + 0x0000000006050300, + 0x0000000000060503, + 0x0000000605020100, + 0x0000000006050201, + 0x0000000006050200, + 0x0000000000060502, + 0x0000000006050100, + 0x0000000000060501, + 0x0000000000060500, + 
0x0000000000000605, + 0x0000060403020100, + 0x0000000604030201, + 0x0000000604030200, + 0x0000000006040302, + 0x0000000604030100, + 0x0000000006040301, + 0x0000000006040300, + 0x0000000000060403, + 0x0000000604020100, + 0x0000000006040201, + 0x0000000006040200, + 0x0000000000060402, + 0x0000000006040100, + 0x0000000000060401, + 0x0000000000060400, + 0x0000000000000604, + 0x0000000603020100, + 0x0000000006030201, + 0x0000000006030200, + 0x0000000000060302, + 0x0000000006030100, + 0x0000000000060301, + 0x0000000000060300, + 0x0000000000000603, + 0x0000000006020100, + 0x0000000000060201, + 0x0000000000060200, + 0x0000000000000602, + 0x0000000000060100, + 0x0000000000000601, + 0x0000000000000600, + 0x0000000000000006, + 0x0000050403020100, + 0x0000000504030201, + 0x0000000504030200, + 0x0000000005040302, + 0x0000000504030100, + 0x0000000005040301, + 0x0000000005040300, + 0x0000000000050403, + 0x0000000504020100, + 0x0000000005040201, + 0x0000000005040200, + 0x0000000000050402, + 0x0000000005040100, + 0x0000000000050401, + 0x0000000000050400, + 0x0000000000000504, + 0x0000000503020100, + 0x0000000005030201, + 0x0000000005030200, + 0x0000000000050302, + 0x0000000005030100, + 0x0000000000050301, + 0x0000000000050300, + 0x0000000000000503, + 0x0000000005020100, + 0x0000000000050201, + 0x0000000000050200, + 0x0000000000000502, + 0x0000000000050100, + 0x0000000000000501, + 0x0000000000000500, + 0x0000000000000005, + 0x0000000403020100, + 0x0000000004030201, + 0x0000000004030200, + 0x0000000000040302, + 0x0000000004030100, + 0x0000000000040301, + 0x0000000000040300, + 0x0000000000000403, + 0x0000000004020100, + 0x0000000000040201, + 0x0000000000040200, + 0x0000000000000402, + 0x0000000000040100, + 0x0000000000000401, + 0x0000000000000400, + 0x0000000000000004, + 0x0000000003020100, + 0x0000000000030201, + 0x0000000000030200, + 0x0000000000000302, + 0x0000000000030100, + 0x0000000000000301, + 0x0000000000000300, + 0x0000000000000003, + 0x0000000000020100, + 0x0000000000000201, + 0x0000000000000200, + 0x0000000000000002, + 0x0000000000000100, + 0x0000000000000001, + 0x0000000000000000, + 0x0000000000000000, + }; //static uint64_t thintable_epi8[256] -} // namespace simdjson +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE #endif // SIMDJSON_SIMDPRUNE_TABLES_H /* end file src/simdprune_tables.h */ #include <initializer_list> @@ -306,243 +669,1590 @@ #ifndef SIMDJSON_HASWELL_IMPLEMENTATION_H #define SIMDJSON_HASWELL_IMPLEMENTATION_H /* isadetection.h already included: #include "isadetection.h" */ -namespace simdjson::haswell { +namespace simdjson +{ + namespace haswell + { -using namespace simdjson::dom; + class implementation final : public simdjson::implementation + { + public: + really_inline implementation() : simdjson::implementation( + "haswell", + "Intel/AMD AVX2", + instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2) {} + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; + }; -class implementation final : public simdjson::implementation { -public: - really_inline implementation() : simdjson::implementation( - "haswell", - "Intel/AMD 
AVX2", - instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2 - ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; -}; + } // namespace haswell +} // namespace simdjson -} // namespace simdjson::haswell - #endif // SIMDJSON_HASWELL_IMPLEMENTATION_H /* end file src/haswell/implementation.h */ -namespace simdjson::internal { const haswell::implementation haswell_singleton{}; } +namespace simdjson +{ + namespace internal + { + const haswell::implementation haswell_singleton{}; + } +} // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_HASWELL #if SIMDJSON_IMPLEMENTATION_WESTMERE /* begin file src/westmere/implementation.h */ #ifndef SIMDJSON_WESTMERE_IMPLEMENTATION_H #define SIMDJSON_WESTMERE_IMPLEMENTATION_H /* isadetection.h already included: #include "isadetection.h" */ -namespace simdjson::westmere { +namespace simdjson +{ + namespace westmere + { -using namespace simdjson::dom; + using namespace simdjson::dom; -class implementation final : public simdjson::implementation { -public: - really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; -}; + class implementation final : public simdjson::implementation + { + public: + really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {} + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; + }; -} // namespace simdjson::westmere + } // namespace westmere +} // namespace simdjson #endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H /* end file src/westmere/implementation.h */ -namespace simdjson::internal { const westmere::implementation westmere_singleton{}; } +namespace simdjson +{ + namespace internal + { + const westmere::implementation westmere_singleton{}; + } +} // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_WESTMERE #if SIMDJSON_IMPLEMENTATION_ARM64 /* begin file src/arm64/implementation.h */ #ifndef SIMDJSON_ARM64_IMPLEMENTATION_H #define SIMDJSON_ARM64_IMPLEMENTATION_H /* isadetection.h already included: #include "isadetection.h" */ 
-namespace simdjson::arm64 { +namespace simdjson +{ + namespace arm64 + { -using namespace simdjson::dom; + using namespace simdjson::dom; -class implementation final : public simdjson::implementation { -public: - really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; -}; + class implementation final : public simdjson::implementation + { + public: + really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {} + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; + }; -} // namespace simdjson::arm64 + } // namespace arm64 +} // namespace simdjson #endif // SIMDJSON_ARM64_IMPLEMENTATION_H /* end file src/arm64/implementation.h */ -namespace simdjson::internal { const arm64::implementation arm64_singleton{}; } +namespace simdjson +{ + namespace internal + { + const arm64::implementation arm64_singleton{}; + } +} // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_ARM64 #if SIMDJSON_IMPLEMENTATION_FALLBACK /* begin file src/fallback/implementation.h */ #ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H #define SIMDJSON_FALLBACK_IMPLEMENTATION_H /* isadetection.h already included: #include "isadetection.h" */ -namespace simdjson::fallback { +namespace simdjson +{ + namespace fallback + { -using namespace simdjson::dom; + using namespace simdjson::dom; -class implementation final : public simdjson::implementation { -public: - really_inline implementation() : simdjson::implementation( - "fallback", - "Generic fallback implementation", - 0 - ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; -}; + class implementation final : public simdjson::implementation + { + public: + really_inline implementation() : simdjson::implementation( + "fallback", + "Generic fallback implementation", + 0) {} + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, 
size_t len) const noexcept final; + }; -} // namespace simdjson::fallback + } // namespace fallback +} // namespace simdjson + #endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H /* end file src/fallback/implementation.h */ -namespace simdjson::internal { const fallback::implementation fallback_singleton{}; } +namespace simdjson +{ + namespace internal + { + const fallback::implementation fallback_singleton{}; + } +} // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_FALLBACK -namespace simdjson::internal { +namespace simdjson +{ + namespace internal + { -constexpr const std::initializer_list<const implementation *> available_implementation_pointers { + /** + * @private Detects best supported implementation on first use, and sets it + */ + class detect_best_supported_implementation_on_first_use final : public implementation + { + public: + const std::string &name() const noexcept final { return set_best()->name(); } + const std::string &description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept final + { + return set_best()->create_dom_parser_implementation(capacity, max_length, dst); + } + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final + { + return set_best()->minify(buf, len, dst, dst_len); + } + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final override + { + return set_best()->validate_utf8(buf, len); + } + really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} + + private: + const implementation *set_best() const noexcept; + }; + + const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + + internal::atomic_ptr<const implementation> active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + + const std::initializer_list<const implementation *> available_implementation_pointers + { #if SIMDJSON_IMPLEMENTATION_HASWELL - &haswell_singleton, + &haswell_singleton, #endif #if SIMDJSON_IMPLEMENTATION_WESTMERE - &westmere_singleton, + &westmere_singleton, #endif #if SIMDJSON_IMPLEMENTATION_ARM64 - &arm64_singleton, + &arm64_singleton, #endif #if SIMDJSON_IMPLEMENTATION_FALLBACK - &fallback_singleton, + &fallback_singleton, #endif -}; // available_implementation_pointers + }; // available_implementation_pointers -// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support -class unsupported_implementation final : public implementation { -public: - WARN_UNUSED error_code parse(const uint8_t *, size_t, parser &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; + // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support + class unsupported_implementation final : public implementation + { + public: + WARN_UNUSED error_code create_dom_parser_implementation( + size_t, + size_t, + std::unique_ptr<internal::dom_parser_implementation> &) const noexcept final + { + return UNSUPPORTED_ARCHITECTURE; + } + WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override + { + return 
UNSUPPORTED_ARCHITECTURE; + } + WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override + { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. + } + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} + }; + + const unsupported_implementation unsupported_singleton{}; + + size_t available_implementation_list::size() const noexcept + { + return internal::available_implementation_pointers.size(); + } + const implementation *const *available_implementation_list::begin() const noexcept + { + return internal::available_implementation_pointers.begin(); + } + const implementation *const *available_implementation_list::end() const noexcept + { + return internal::available_implementation_pointers.end(); + } + const implementation *available_implementation_list::detect_best_supported() const noexcept + { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = detect_supported_architectures(); + for (const implementation *impl : internal::available_implementation_pointers) + { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) + { + return impl; + } + } + return &unsupported_singleton; // this should never happen? + } + + const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept + { + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (force_implementation_name) + { + auto force_implementation = available_implementations[force_implementation_name]; + if (force_implementation) + { + return active_implementation = force_implementation; + } + else + { + // Note: abort() and stderr usage within the library is forbidden. 
+ return active_implementation = &unsupported_singleton; + } + } + return active_implementation = available_implementations.detect_best_supported(); + } + + } // namespace internal + + SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; + SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + + WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept + { + return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len); } - WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept + { + return active_implementation->validate_utf8(buf, len); } - WARN_UNUSED error_code stage1(const uint8_t *, size_t, parser &, bool) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; + +} // namespace simdjson +/* end file src/fallback/implementation.h */ + +// Anything in the top level directory MUST be included outside of the #if statements +// below, or amalgamation will screw them up! +/* isadetection.h already included: #include "isadetection.h" */ +/* begin file src/jsoncharutils.h */ +#ifndef SIMDJSON_JSONCHARUTILS_H +#define SIMDJSON_JSONCHARUTILS_H + +#ifdef JSON_TEST_STRINGS +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); +#endif + +namespace simdjson +{ + // structural chars here are + // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) + // we are also interested in the four whitespace characters + // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + + // these are the chars that can follow a true/false/null or number atom + // and nothing else + const uint32_t structural_or_whitespace_or_null_negated[256] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + // return non-zero if not a structural or whitespace char + // zero otherwise + really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) + { + return structural_or_whitespace_or_null_negated[c]; } - WARN_UNUSED error_code stage2(const uint8_t *, size_t, parser &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; + + const uint32_t structural_or_whitespace_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + // return non-zero if not a structural or whitespace char + // zero otherwise + really_inline uint32_t is_not_structural_or_whitespace(uint8_t c) + { + return structural_or_whitespace_negated[c]; } - WARN_UNUSED error_code stage2(const uint8_t *, size_t, parser &, size_t &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; + + const uint32_t structural_or_whitespace_or_null[256] = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) + { + return structural_or_whitespace_or_null[c]; } - unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} -}; + const uint32_t structural_or_whitespace[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -const unsupported_implementation unsupported_singleton{}; + really_inline uint32_t is_structural_or_whitespace(uint8_t c) + { + return structural_or_whitespace[c]; + } -size_t available_implementation_list::size() const noexcept { - return internal::available_implementation_pointers.size(); -} -const implementation * const *available_implementation_list::begin() const noexcept { - return internal::available_implementation_pointers.begin(); -} -const implementation * const *available_implementation_list::end() const noexcept { - return internal::available_implementation_pointers.end(); -} -const implementation *available_implementation_list::detect_best_supported() const noexcept { - // They are prelisted in priority order, so we just go down the list - uint32_t 
supported_instruction_sets = detect_supported_architectures(); - for (const implementation *impl : internal::available_implementation_pointers) { - uint32_t required_instruction_sets = impl->required_instruction_sets(); - if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + const uint32_t digit_to_val32[886] = { + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, + 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, + 0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, + 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, 
+ 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, + 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, + 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, + 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, + 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, + 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, + 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, + 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; + // returns a value with the high 16 bits set if not valid + // otherwise returns the conversion of the 4 hex digits at src into the bottom + // 16 bits of the 32-bit return register + // + // see + // https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ + static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) + { // strictly speaking, static inline is a C-ism + uint32_t v1 = digit_to_val32[630 + src[0]]; + uint32_t v2 = digit_to_val32[420 + src[1]]; + uint32_t v3 = digit_to_val32[210 + src[2]]; + uint32_t v4 = digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; } - return &unsupported_singleton; -} -const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { - return active_implementation = available_implementations.detect_best_supported(); -} + // given a code point cp, writes to c + // the utf-8 code, outputting the length in + // bytes, if the length is zero, the code point + // is invalid + // + // This can possibly be made faster using pdep + // and clz and table lookups, but JSON documents + // have few escaped code points, and the following + // function looks cheap. + // + // Note: we assume that surrogates are treated separately + // + inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) + { + if (cp <= 0x7F) + { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) + { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } + else if (cp <= 0xFFFF) + { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } + else if (cp <= 0x10FFFF) + { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r + } -} // namespace simdjson::internal -/* end file src/fallback/implementation.h */ -/* begin file src/stage1_find_marks.cpp */ + //// + // The following code is used in number parsing. It is not + // properly "char utils" stuff, but we move it here so that + // it does not get copied multiple times in the binaries (once + // per instruction set). 
+ /// + + constexpr int FASTFLOAT_SMALLEST_POWER = -325; + constexpr int FASTFLOAT_LARGEST_POWER = 308; + + struct value128 + { + uint64_t low; + uint64_t high; + }; + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm + // this is a slow emulation routine for 32-bit + // + static inline uint64_t __emulu(uint32_t x, uint32_t y) + { + return x * (uint64_t)y; + } + static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) + { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; + } +#endif + + really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) + { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = ((__uint128_t)value1) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; + } + + // Precomputed powers of ten from 10^0 to 10^22. These + // can be represented exactly using the double type. + static const double power_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + + // the mantissas of powers of ten from -308 to 308, extended out to sixty four + // bits + // This struct will likely get padded to 16 bytes. + typedef struct + { + uint64_t mantissa; + int32_t exp; + } components; + + // The array power_of_ten_components contain the powers of ten approximated + // as a 64-bit mantissa, with an exponent part. It goes from 10^ + // FASTFLOAT_SMALLEST_POWER to + // 10^FASTFLOAT_LARGEST_POWER (inclusively). The mantissa is truncated, and + // never rounded up. + // Uses about 10KB. 
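Before the table itself, a quick sanity check on how these (mantissa, exp) pairs encode a power of ten. Judging from the exact entries for 10^0, 10^1 and 10^2 below ({0x8000000000000000L, 1087}, {0xa000000000000000L, 1090}, {0xc800000000000000L, 1093}), the stored exponent appears to carry a bias of 1150, so 10^k is approximately mantissa * 2^(exp - 1150). The sketch below is only an inference from those table values (the bias constant, decode and components_demo are not simdjson names); the parser itself combines these components with full_multiplication above rather than going through a double.

#include <cassert>
#include <cmath>
#include <cstdint>

struct components_demo { uint64_t mantissa; int32_t exp; };

// Entries copied from power_of_ten_components below: 10^0, 10^1, 10^2.
static const components_demo kEntries[] = {
    {0x8000000000000000ULL, 1087},
    {0xa000000000000000ULL, 1090},
    {0xc800000000000000ULL, 1093},
};

// Inferred decoding: value ~= mantissa * 2^(exp - 1150).
// These particular mantissas convert to double exactly, so the checks are exact.
static double decode(const components_demo &c) {
  return std::ldexp(static_cast<double>(c.mantissa), c.exp - 1150);
}

int main() {
  assert(decode(kEntries[0]) == 1.0);    // 10^0
  assert(decode(kEntries[1]) == 10.0);   // 10^1
  assert(decode(kEntries[2]) == 100.0);  // 10^2
  return 0;
}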
+ static const components power_of_ten_components[] = { + {0xa5ced43b7e3e9188L, 7}, {0xcf42894a5dce35eaL, 10}, {0x818995ce7aa0e1b2L, 14}, {0xa1ebfb4219491a1fL, 17}, {0xca66fa129f9b60a6L, 20}, {0xfd00b897478238d0L, 23}, {0x9e20735e8cb16382L, 27}, {0xc5a890362fddbc62L, 30}, {0xf712b443bbd52b7bL, 33}, {0x9a6bb0aa55653b2dL, 37}, {0xc1069cd4eabe89f8L, 40}, {0xf148440a256e2c76L, 43}, {0x96cd2a865764dbcaL, 47}, {0xbc807527ed3e12bcL, 50}, {0xeba09271e88d976bL, 53}, {0x93445b8731587ea3L, 57}, {0xb8157268fdae9e4cL, 60}, {0xe61acf033d1a45dfL, 63}, {0x8fd0c16206306babL, 67}, {0xb3c4f1ba87bc8696L, 70}, {0xe0b62e2929aba83cL, 73}, {0x8c71dcd9ba0b4925L, 77}, {0xaf8e5410288e1b6fL, 80}, {0xdb71e91432b1a24aL, 83}, {0x892731ac9faf056eL, 87}, {0xab70fe17c79ac6caL, 90}, {0xd64d3d9db981787dL, 93}, {0x85f0468293f0eb4eL, 97}, {0xa76c582338ed2621L, 100}, {0xd1476e2c07286faaL, 103}, {0x82cca4db847945caL, 107}, {0xa37fce126597973cL, 110}, {0xcc5fc196fefd7d0cL, 113}, {0xff77b1fcbebcdc4fL, 116}, {0x9faacf3df73609b1L, 120}, {0xc795830d75038c1dL, 123}, {0xf97ae3d0d2446f25L, 126}, {0x9becce62836ac577L, 130}, {0xc2e801fb244576d5L, 133}, {0xf3a20279ed56d48aL, 136}, {0x9845418c345644d6L, 140}, {0xbe5691ef416bd60cL, 143}, {0xedec366b11c6cb8fL, 146}, {0x94b3a202eb1c3f39L, 150}, {0xb9e08a83a5e34f07L, 153}, {0xe858ad248f5c22c9L, 156}, {0x91376c36d99995beL, 160}, {0xb58547448ffffb2dL, 163}, {0xe2e69915b3fff9f9L, 166}, {0x8dd01fad907ffc3bL, 170}, {0xb1442798f49ffb4aL, 173}, {0xdd95317f31c7fa1dL, 176}, {0x8a7d3eef7f1cfc52L, 180}, {0xad1c8eab5ee43b66L, 183}, {0xd863b256369d4a40L, 186}, {0x873e4f75e2224e68L, 190}, {0xa90de3535aaae202L, 193}, {0xd3515c2831559a83L, 196}, {0x8412d9991ed58091L, 200}, {0xa5178fff668ae0b6L, 203}, {0xce5d73ff402d98e3L, 206}, {0x80fa687f881c7f8eL, 210}, {0xa139029f6a239f72L, 213}, {0xc987434744ac874eL, 216}, {0xfbe9141915d7a922L, 219}, {0x9d71ac8fada6c9b5L, 223}, {0xc4ce17b399107c22L, 226}, {0xf6019da07f549b2bL, 229}, {0x99c102844f94e0fbL, 233}, {0xc0314325637a1939L, 236}, {0xf03d93eebc589f88L, 239}, {0x96267c7535b763b5L, 243}, {0xbbb01b9283253ca2L, 246}, {0xea9c227723ee8bcbL, 249}, {0x92a1958a7675175fL, 253}, {0xb749faed14125d36L, 256}, {0xe51c79a85916f484L, 259}, {0x8f31cc0937ae58d2L, 263}, {0xb2fe3f0b8599ef07L, 266}, {0xdfbdcece67006ac9L, 269}, {0x8bd6a141006042bdL, 273}, {0xaecc49914078536dL, 276}, {0xda7f5bf590966848L, 279}, {0x888f99797a5e012dL, 283}, {0xaab37fd7d8f58178L, 286}, {0xd5605fcdcf32e1d6L, 289}, {0x855c3be0a17fcd26L, 293}, {0xa6b34ad8c9dfc06fL, 296}, {0xd0601d8efc57b08bL, 299}, {0x823c12795db6ce57L, 303}, {0xa2cb1717b52481edL, 306}, {0xcb7ddcdda26da268L, 309}, {0xfe5d54150b090b02L, 312}, {0x9efa548d26e5a6e1L, 316}, {0xc6b8e9b0709f109aL, 319}, {0xf867241c8cc6d4c0L, 322}, {0x9b407691d7fc44f8L, 326}, {0xc21094364dfb5636L, 329}, {0xf294b943e17a2bc4L, 332}, {0x979cf3ca6cec5b5aL, 336}, {0xbd8430bd08277231L, 339}, {0xece53cec4a314ebdL, 342}, {0x940f4613ae5ed136L, 346}, {0xb913179899f68584L, 349}, {0xe757dd7ec07426e5L, 352}, {0x9096ea6f3848984fL, 356}, {0xb4bca50b065abe63L, 359}, {0xe1ebce4dc7f16dfbL, 362}, {0x8d3360f09cf6e4bdL, 366}, {0xb080392cc4349decL, 369}, {0xdca04777f541c567L, 372}, {0x89e42caaf9491b60L, 376}, {0xac5d37d5b79b6239L, 379}, {0xd77485cb25823ac7L, 382}, {0x86a8d39ef77164bcL, 386}, {0xa8530886b54dbdebL, 389}, {0xd267caa862a12d66L, 392}, {0x8380dea93da4bc60L, 396}, {0xa46116538d0deb78L, 399}, {0xcd795be870516656L, 402}, {0x806bd9714632dff6L, 406}, {0xa086cfcd97bf97f3L, 409}, {0xc8a883c0fdaf7df0L, 412}, {0xfad2a4b13d1b5d6cL, 415}, {0x9cc3a6eec6311a63L, 419}, {0xc3f490aa77bd60fcL, 
422}, {0xf4f1b4d515acb93bL, 425}, {0x991711052d8bf3c5L, 429}, {0xbf5cd54678eef0b6L, 432}, {0xef340a98172aace4L, 435}, {0x9580869f0e7aac0eL, 439}, {0xbae0a846d2195712L, 442}, {0xe998d258869facd7L, 445}, {0x91ff83775423cc06L, 449}, {0xb67f6455292cbf08L, 452}, {0xe41f3d6a7377eecaL, 455}, {0x8e938662882af53eL, 459}, {0xb23867fb2a35b28dL, 462}, {0xdec681f9f4c31f31L, 465}, {0x8b3c113c38f9f37eL, 469}, {0xae0b158b4738705eL, 472}, {0xd98ddaee19068c76L, 475}, {0x87f8a8d4cfa417c9L, 479}, {0xa9f6d30a038d1dbcL, 482}, {0xd47487cc8470652bL, 485}, {0x84c8d4dfd2c63f3bL, 489}, {0xa5fb0a17c777cf09L, 492}, {0xcf79cc9db955c2ccL, 495}, {0x81ac1fe293d599bfL, 499}, {0xa21727db38cb002fL, 502}, {0xca9cf1d206fdc03bL, 505}, {0xfd442e4688bd304aL, 508}, {0x9e4a9cec15763e2eL, 512}, {0xc5dd44271ad3cdbaL, 515}, {0xf7549530e188c128L, 518}, {0x9a94dd3e8cf578b9L, 522}, {0xc13a148e3032d6e7L, 525}, {0xf18899b1bc3f8ca1L, 528}, {0x96f5600f15a7b7e5L, 532}, {0xbcb2b812db11a5deL, 535}, {0xebdf661791d60f56L, 538}, {0x936b9fcebb25c995L, 542}, {0xb84687c269ef3bfbL, 545}, {0xe65829b3046b0afaL, 548}, {0x8ff71a0fe2c2e6dcL, 552}, {0xb3f4e093db73a093L, 555}, {0xe0f218b8d25088b8L, 558}, {0x8c974f7383725573L, 562}, {0xafbd2350644eeacfL, 565}, {0xdbac6c247d62a583L, 568}, {0x894bc396ce5da772L, 572}, {0xab9eb47c81f5114fL, 575}, {0xd686619ba27255a2L, 578}, {0x8613fd0145877585L, 582}, {0xa798fc4196e952e7L, 585}, {0xd17f3b51fca3a7a0L, 588}, {0x82ef85133de648c4L, 592}, {0xa3ab66580d5fdaf5L, 595}, {0xcc963fee10b7d1b3L, 598}, {0xffbbcfe994e5c61fL, 601}, {0x9fd561f1fd0f9bd3L, 605}, {0xc7caba6e7c5382c8L, 608}, {0xf9bd690a1b68637bL, 611}, {0x9c1661a651213e2dL, 615}, {0xc31bfa0fe5698db8L, 618}, {0xf3e2f893dec3f126L, 621}, {0x986ddb5c6b3a76b7L, 625}, {0xbe89523386091465L, 628}, {0xee2ba6c0678b597fL, 631}, {0x94db483840b717efL, 635}, {0xba121a4650e4ddebL, 638}, {0xe896a0d7e51e1566L, 641}, {0x915e2486ef32cd60L, 645}, {0xb5b5ada8aaff80b8L, 648}, {0xe3231912d5bf60e6L, 651}, {0x8df5efabc5979c8fL, 655}, {0xb1736b96b6fd83b3L, 658}, {0xddd0467c64bce4a0L, 661}, {0x8aa22c0dbef60ee4L, 665}, {0xad4ab7112eb3929dL, 668}, {0xd89d64d57a607744L, 671}, {0x87625f056c7c4a8bL, 675}, {0xa93af6c6c79b5d2dL, 678}, {0xd389b47879823479L, 681}, {0x843610cb4bf160cbL, 685}, {0xa54394fe1eedb8feL, 688}, {0xce947a3da6a9273eL, 691}, {0x811ccc668829b887L, 695}, {0xa163ff802a3426a8L, 698}, {0xc9bcff6034c13052L, 701}, {0xfc2c3f3841f17c67L, 704}, {0x9d9ba7832936edc0L, 708}, {0xc5029163f384a931L, 711}, {0xf64335bcf065d37dL, 714}, {0x99ea0196163fa42eL, 718}, {0xc06481fb9bcf8d39L, 721}, {0xf07da27a82c37088L, 724}, {0x964e858c91ba2655L, 728}, {0xbbe226efb628afeaL, 731}, {0xeadab0aba3b2dbe5L, 734}, {0x92c8ae6b464fc96fL, 738}, {0xb77ada0617e3bbcbL, 741}, {0xe55990879ddcaabdL, 744}, {0x8f57fa54c2a9eab6L, 748}, {0xb32df8e9f3546564L, 751}, {0xdff9772470297ebdL, 754}, {0x8bfbea76c619ef36L, 758}, {0xaefae51477a06b03L, 761}, {0xdab99e59958885c4L, 764}, {0x88b402f7fd75539bL, 768}, {0xaae103b5fcd2a881L, 771}, {0xd59944a37c0752a2L, 774}, {0x857fcae62d8493a5L, 778}, {0xa6dfbd9fb8e5b88eL, 781}, {0xd097ad07a71f26b2L, 784}, {0x825ecc24c873782fL, 788}, {0xa2f67f2dfa90563bL, 791}, {0xcbb41ef979346bcaL, 794}, {0xfea126b7d78186bcL, 797}, {0x9f24b832e6b0f436L, 801}, {0xc6ede63fa05d3143L, 804}, {0xf8a95fcf88747d94L, 807}, {0x9b69dbe1b548ce7cL, 811}, {0xc24452da229b021bL, 814}, {0xf2d56790ab41c2a2L, 817}, {0x97c560ba6b0919a5L, 821}, {0xbdb6b8e905cb600fL, 824}, {0xed246723473e3813L, 827}, {0x9436c0760c86e30bL, 831}, {0xb94470938fa89bceL, 834}, {0xe7958cb87392c2c2L, 837}, {0x90bd77f3483bb9b9L, 841}, 
{0xb4ecd5f01a4aa828L, 844}, {0xe2280b6c20dd5232L, 847}, {0x8d590723948a535fL, 851}, {0xb0af48ec79ace837L, 854}, {0xdcdb1b2798182244L, 857}, {0x8a08f0f8bf0f156bL, 861}, {0xac8b2d36eed2dac5L, 864}, {0xd7adf884aa879177L, 867}, {0x86ccbb52ea94baeaL, 871}, {0xa87fea27a539e9a5L, 874}, {0xd29fe4b18e88640eL, 877}, {0x83a3eeeef9153e89L, 881}, {0xa48ceaaab75a8e2bL, 884}, {0xcdb02555653131b6L, 887}, {0x808e17555f3ebf11L, 891}, {0xa0b19d2ab70e6ed6L, 894}, {0xc8de047564d20a8bL, 897}, {0xfb158592be068d2eL, 900}, {0x9ced737bb6c4183dL, 904}, {0xc428d05aa4751e4cL, 907}, {0xf53304714d9265dfL, 910}, {0x993fe2c6d07b7fabL, 914}, {0xbf8fdb78849a5f96L, 917}, {0xef73d256a5c0f77cL, 920}, {0x95a8637627989aadL, 924}, {0xbb127c53b17ec159L, 927}, {0xe9d71b689dde71afL, 930}, {0x9226712162ab070dL, 934}, {0xb6b00d69bb55c8d1L, 937}, {0xe45c10c42a2b3b05L, 940}, {0x8eb98a7a9a5b04e3L, 944}, {0xb267ed1940f1c61cL, 947}, {0xdf01e85f912e37a3L, 950}, {0x8b61313bbabce2c6L, 954}, {0xae397d8aa96c1b77L, 957}, {0xd9c7dced53c72255L, 960}, {0x881cea14545c7575L, 964}, {0xaa242499697392d2L, 967}, {0xd4ad2dbfc3d07787L, 970}, {0x84ec3c97da624ab4L, 974}, {0xa6274bbdd0fadd61L, 977}, {0xcfb11ead453994baL, 980}, {0x81ceb32c4b43fcf4L, 984}, {0xa2425ff75e14fc31L, 987}, {0xcad2f7f5359a3b3eL, 990}, {0xfd87b5f28300ca0dL, 993}, {0x9e74d1b791e07e48L, 997}, {0xc612062576589ddaL, 1000}, {0xf79687aed3eec551L, 1003}, {0x9abe14cd44753b52L, 1007}, {0xc16d9a0095928a27L, 1010}, {0xf1c90080baf72cb1L, 1013}, {0x971da05074da7beeL, 1017}, {0xbce5086492111aeaL, 1020}, {0xec1e4a7db69561a5L, 1023}, {0x9392ee8e921d5d07L, 1027}, {0xb877aa3236a4b449L, 1030}, {0xe69594bec44de15bL, 1033}, {0x901d7cf73ab0acd9L, 1037}, {0xb424dc35095cd80fL, 1040}, {0xe12e13424bb40e13L, 1043}, {0x8cbccc096f5088cbL, 1047}, {0xafebff0bcb24aafeL, 1050}, {0xdbe6fecebdedd5beL, 1053}, {0x89705f4136b4a597L, 1057}, {0xabcc77118461cefcL, 1060}, {0xd6bf94d5e57a42bcL, 1063}, {0x8637bd05af6c69b5L, 1067}, {0xa7c5ac471b478423L, 1070}, {0xd1b71758e219652bL, 1073}, {0x83126e978d4fdf3bL, 1077}, {0xa3d70a3d70a3d70aL, 1080}, {0xccccccccccccccccL, 1083}, {0x8000000000000000L, 1087}, {0xa000000000000000L, 1090}, {0xc800000000000000L, 1093}, {0xfa00000000000000L, 1096}, {0x9c40000000000000L, 1100}, {0xc350000000000000L, 1103}, {0xf424000000000000L, 1106}, {0x9896800000000000L, 1110}, {0xbebc200000000000L, 1113}, {0xee6b280000000000L, 1116}, {0x9502f90000000000L, 1120}, {0xba43b74000000000L, 1123}, {0xe8d4a51000000000L, 1126}, {0x9184e72a00000000L, 1130}, {0xb5e620f480000000L, 1133}, {0xe35fa931a0000000L, 1136}, {0x8e1bc9bf04000000L, 1140}, {0xb1a2bc2ec5000000L, 1143}, {0xde0b6b3a76400000L, 1146}, {0x8ac7230489e80000L, 1150}, {0xad78ebc5ac620000L, 1153}, {0xd8d726b7177a8000L, 1156}, {0x878678326eac9000L, 1160}, {0xa968163f0a57b400L, 1163}, {0xd3c21bcecceda100L, 1166}, {0x84595161401484a0L, 1170}, {0xa56fa5b99019a5c8L, 1173}, {0xcecb8f27f4200f3aL, 1176}, {0x813f3978f8940984L, 1180}, {0xa18f07d736b90be5L, 1183}, {0xc9f2c9cd04674edeL, 1186}, {0xfc6f7c4045812296L, 1189}, {0x9dc5ada82b70b59dL, 1193}, {0xc5371912364ce305L, 1196}, {0xf684df56c3e01bc6L, 1199}, {0x9a130b963a6c115cL, 1203}, {0xc097ce7bc90715b3L, 1206}, {0xf0bdc21abb48db20L, 1209}, {0x96769950b50d88f4L, 1213}, {0xbc143fa4e250eb31L, 1216}, {0xeb194f8e1ae525fdL, 1219}, {0x92efd1b8d0cf37beL, 1223}, {0xb7abc627050305adL, 1226}, {0xe596b7b0c643c719L, 1229}, {0x8f7e32ce7bea5c6fL, 1233}, {0xb35dbf821ae4f38bL, 1236}, {0xe0352f62a19e306eL, 1239}, {0x8c213d9da502de45L, 1243}, {0xaf298d050e4395d6L, 1246}, {0xdaf3f04651d47b4cL, 1249}, {0x88d8762bf324cd0fL, 1253}, 
{0xab0e93b6efee0053L, 1256}, {0xd5d238a4abe98068L, 1259}, {0x85a36366eb71f041L, 1263}, {0xa70c3c40a64e6c51L, 1266}, {0xd0cf4b50cfe20765L, 1269}, {0x82818f1281ed449fL, 1273}, {0xa321f2d7226895c7L, 1276}, {0xcbea6f8ceb02bb39L, 1279}, {0xfee50b7025c36a08L, 1282}, {0x9f4f2726179a2245L, 1286}, {0xc722f0ef9d80aad6L, 1289}, {0xf8ebad2b84e0d58bL, 1292}, {0x9b934c3b330c8577L, 1296}, {0xc2781f49ffcfa6d5L, 1299}, {0xf316271c7fc3908aL, 1302}, {0x97edd871cfda3a56L, 1306}, {0xbde94e8e43d0c8ecL, 1309}, {0xed63a231d4c4fb27L, 1312}, {0x945e455f24fb1cf8L, 1316}, {0xb975d6b6ee39e436L, 1319}, {0xe7d34c64a9c85d44L, 1322}, {0x90e40fbeea1d3a4aL, 1326}, {0xb51d13aea4a488ddL, 1329}, {0xe264589a4dcdab14L, 1332}, {0x8d7eb76070a08aecL, 1336}, {0xb0de65388cc8ada8L, 1339}, {0xdd15fe86affad912L, 1342}, {0x8a2dbf142dfcc7abL, 1346}, {0xacb92ed9397bf996L, 1349}, {0xd7e77a8f87daf7fbL, 1352}, {0x86f0ac99b4e8dafdL, 1356}, {0xa8acd7c0222311bcL, 1359}, {0xd2d80db02aabd62bL, 1362}, {0x83c7088e1aab65dbL, 1366}, {0xa4b8cab1a1563f52L, 1369}, {0xcde6fd5e09abcf26L, 1372}, {0x80b05e5ac60b6178L, 1376}, {0xa0dc75f1778e39d6L, 1379}, {0xc913936dd571c84cL, 1382}, {0xfb5878494ace3a5fL, 1385}, {0x9d174b2dcec0e47bL, 1389}, {0xc45d1df942711d9aL, 1392}, {0xf5746577930d6500L, 1395}, {0x9968bf6abbe85f20L, 1399}, {0xbfc2ef456ae276e8L, 1402}, {0xefb3ab16c59b14a2L, 1405}, {0x95d04aee3b80ece5L, 1409}, {0xbb445da9ca61281fL, 1412}, {0xea1575143cf97226L, 1415}, {0x924d692ca61be758L, 1419}, {0xb6e0c377cfa2e12eL, 1422}, {0xe498f455c38b997aL, 1425}, {0x8edf98b59a373fecL, 1429}, {0xb2977ee300c50fe7L, 1432}, {0xdf3d5e9bc0f653e1L, 1435}, {0x8b865b215899f46cL, 1439}, {0xae67f1e9aec07187L, 1442}, {0xda01ee641a708de9L, 1445}, {0x884134fe908658b2L, 1449}, {0xaa51823e34a7eedeL, 1452}, {0xd4e5e2cdc1d1ea96L, 1455}, {0x850fadc09923329eL, 1459}, {0xa6539930bf6bff45L, 1462}, {0xcfe87f7cef46ff16L, 1465}, {0x81f14fae158c5f6eL, 1469}, {0xa26da3999aef7749L, 1472}, {0xcb090c8001ab551cL, 1475}, {0xfdcb4fa002162a63L, 1478}, {0x9e9f11c4014dda7eL, 1482}, {0xc646d63501a1511dL, 1485}, {0xf7d88bc24209a565L, 1488}, {0x9ae757596946075fL, 1492}, {0xc1a12d2fc3978937L, 1495}, {0xf209787bb47d6b84L, 1498}, {0x9745eb4d50ce6332L, 1502}, {0xbd176620a501fbffL, 1505}, {0xec5d3fa8ce427affL, 1508}, {0x93ba47c980e98cdfL, 1512}, {0xb8a8d9bbe123f017L, 1515}, {0xe6d3102ad96cec1dL, 1518}, {0x9043ea1ac7e41392L, 1522}, {0xb454e4a179dd1877L, 1525}, {0xe16a1dc9d8545e94L, 1528}, {0x8ce2529e2734bb1dL, 1532}, {0xb01ae745b101e9e4L, 1535}, {0xdc21a1171d42645dL, 1538}, {0x899504ae72497ebaL, 1542}, {0xabfa45da0edbde69L, 1545}, {0xd6f8d7509292d603L, 1548}, {0x865b86925b9bc5c2L, 1552}, {0xa7f26836f282b732L, 1555}, {0xd1ef0244af2364ffL, 1558}, {0x8335616aed761f1fL, 1562}, {0xa402b9c5a8d3a6e7L, 1565}, {0xcd036837130890a1L, 1568}, {0x802221226be55a64L, 1572}, {0xa02aa96b06deb0fdL, 1575}, {0xc83553c5c8965d3dL, 1578}, {0xfa42a8b73abbf48cL, 1581}, {0x9c69a97284b578d7L, 1585}, {0xc38413cf25e2d70dL, 1588}, {0xf46518c2ef5b8cd1L, 1591}, {0x98bf2f79d5993802L, 1595}, {0xbeeefb584aff8603L, 1598}, {0xeeaaba2e5dbf6784L, 1601}, {0x952ab45cfa97a0b2L, 1605}, {0xba756174393d88dfL, 1608}, {0xe912b9d1478ceb17L, 1611}, {0x91abb422ccb812eeL, 1615}, {0xb616a12b7fe617aaL, 1618}, {0xe39c49765fdf9d94L, 1621}, {0x8e41ade9fbebc27dL, 1625}, {0xb1d219647ae6b31cL, 1628}, {0xde469fbd99a05fe3L, 1631}, {0x8aec23d680043beeL, 1635}, {0xada72ccc20054ae9L, 1638}, {0xd910f7ff28069da4L, 1641}, {0x87aa9aff79042286L, 1645}, {0xa99541bf57452b28L, 1648}, {0xd3fa922f2d1675f2L, 1651}, {0x847c9b5d7c2e09b7L, 1655}, {0xa59bc234db398c25L, 1658}, 
{0xcf02b2c21207ef2eL, 1661}, {0x8161afb94b44f57dL, 1665}, {0xa1ba1ba79e1632dcL, 1668}, {0xca28a291859bbf93L, 1671}, {0xfcb2cb35e702af78L, 1674}, {0x9defbf01b061adabL, 1678}, {0xc56baec21c7a1916L, 1681}, {0xf6c69a72a3989f5bL, 1684}, {0x9a3c2087a63f6399L, 1688}, {0xc0cb28a98fcf3c7fL, 1691}, {0xf0fdf2d3f3c30b9fL, 1694}, {0x969eb7c47859e743L, 1698}, {0xbc4665b596706114L, 1701}, {0xeb57ff22fc0c7959L, 1704}, {0x9316ff75dd87cbd8L, 1708}, {0xb7dcbf5354e9beceL, 1711}, {0xe5d3ef282a242e81L, 1714}, {0x8fa475791a569d10L, 1718}, {0xb38d92d760ec4455L, 1721}, {0xe070f78d3927556aL, 1724}, {0x8c469ab843b89562L, 1728}, {0xaf58416654a6babbL, 1731}, {0xdb2e51bfe9d0696aL, 1734}, {0x88fcf317f22241e2L, 1738}, {0xab3c2fddeeaad25aL, 1741}, {0xd60b3bd56a5586f1L, 1744}, {0x85c7056562757456L, 1748}, {0xa738c6bebb12d16cL, 1751}, {0xd106f86e69d785c7L, 1754}, {0x82a45b450226b39cL, 1758}, {0xa34d721642b06084L, 1761}, {0xcc20ce9bd35c78a5L, 1764}, {0xff290242c83396ceL, 1767}, {0x9f79a169bd203e41L, 1771}, {0xc75809c42c684dd1L, 1774}, {0xf92e0c3537826145L, 1777}, {0x9bbcc7a142b17ccbL, 1781}, {0xc2abf989935ddbfeL, 1784}, {0xf356f7ebf83552feL, 1787}, {0x98165af37b2153deL, 1791}, {0xbe1bf1b059e9a8d6L, 1794}, {0xeda2ee1c7064130cL, 1797}, {0x9485d4d1c63e8be7L, 1801}, {0xb9a74a0637ce2ee1L, 1804}, {0xe8111c87c5c1ba99L, 1807}, {0x910ab1d4db9914a0L, 1811}, {0xb54d5e4a127f59c8L, 1814}, {0xe2a0b5dc971f303aL, 1817}, {0x8da471a9de737e24L, 1821}, {0xb10d8e1456105dadL, 1824}, {0xdd50f1996b947518L, 1827}, {0x8a5296ffe33cc92fL, 1831}, {0xace73cbfdc0bfb7bL, 1834}, {0xd8210befd30efa5aL, 1837}, {0x8714a775e3e95c78L, 1841}, {0xa8d9d1535ce3b396L, 1844}, {0xd31045a8341ca07cL, 1847}, {0x83ea2b892091e44dL, 1851}, {0xa4e4b66b68b65d60L, 1854}, {0xce1de40642e3f4b9L, 1857}, {0x80d2ae83e9ce78f3L, 1861}, {0xa1075a24e4421730L, 1864}, {0xc94930ae1d529cfcL, 1867}, {0xfb9b7cd9a4a7443cL, 1870}, {0x9d412e0806e88aa5L, 1874}, {0xc491798a08a2ad4eL, 1877}, {0xf5b5d7ec8acb58a2L, 1880}, {0x9991a6f3d6bf1765L, 1884}, {0xbff610b0cc6edd3fL, 1887}, {0xeff394dcff8a948eL, 1890}, {0x95f83d0a1fb69cd9L, 1894}, {0xbb764c4ca7a4440fL, 1897}, {0xea53df5fd18d5513L, 1900}, {0x92746b9be2f8552cL, 1904}, {0xb7118682dbb66a77L, 1907}, {0xe4d5e82392a40515L, 1910}, {0x8f05b1163ba6832dL, 1914}, {0xb2c71d5bca9023f8L, 1917}, {0xdf78e4b2bd342cf6L, 1920}, {0x8bab8eefb6409c1aL, 1924}, {0xae9672aba3d0c320L, 1927}, {0xda3c0f568cc4f3e8L, 1930}, {0x8865899617fb1871L, 1934}, {0xaa7eebfb9df9de8dL, 1937}, {0xd51ea6fa85785631L, 1940}, {0x8533285c936b35deL, 1944}, {0xa67ff273b8460356L, 1947}, {0xd01fef10a657842cL, 1950}, {0x8213f56a67f6b29bL, 1954}, {0xa298f2c501f45f42L, 1957}, {0xcb3f2f7642717713L, 1960}, {0xfe0efb53d30dd4d7L, 1963}, {0x9ec95d1463e8a506L, 1967}, {0xc67bb4597ce2ce48L, 1970}, {0xf81aa16fdc1b81daL, 1973}, {0x9b10a4e5e9913128L, 1977}, {0xc1d4ce1f63f57d72L, 1980}, {0xf24a01a73cf2dccfL, 1983}, {0x976e41088617ca01L, 1987}, {0xbd49d14aa79dbc82L, 1990}, {0xec9c459d51852ba2L, 1993}, {0x93e1ab8252f33b45L, 1997}, {0xb8da1662e7b00a17L, 2000}, {0xe7109bfba19c0c9dL, 2003}, {0x906a617d450187e2L, 2007}, {0xb484f9dc9641e9daL, 2010}, {0xe1a63853bbd26451L, 2013}, {0x8d07e33455637eb2L, 2017}, {0xb049dc016abc5e5fL, 2020}, {0xdc5c5301c56b75f7L, 2023}, {0x89b9b3e11b6329baL, 2027}, {0xac2820d9623bf429L, 2030}, {0xd732290fbacaf133L, 2033}, {0x867f59a9d4bed6c0L, 2037}, {0xa81f301449ee8c70L, 2040}, {0xd226fc195c6a2f8cL, 2043}, {0x83585d8fd9c25db7L, 2047}, {0xa42e74f3d032f525L, 2050}, {0xcd3a1230c43fb26fL, 2053}, {0x80444b5e7aa7cf85L, 2057}, {0xa0555e361951c366L, 2060}, {0xc86ab5c39fa63440L, 2063}, 
{0xfa856334878fc150L, 2066}, {0x9c935e00d4b9d8d2L, 2070}, {0xc3b8358109e84f07L, 2073}, {0xf4a642e14c6262c8L, 2076}, {0x98e7e9cccfbd7dbdL, 2080}, {0xbf21e44003acdd2cL, 2083}, {0xeeea5d5004981478L, 2086}, {0x95527a5202df0ccbL, 2090}, {0xbaa718e68396cffdL, 2093}, {0xe950df20247c83fdL, 2096}, {0x91d28b7416cdd27eL, 2100}, {0xb6472e511c81471dL, 2103}, {0xe3d8f9e563a198e5L, 2106}, {0x8e679c2f5e44ff8fL, 2110}}; + + // A complement from power_of_ten_components + // complete to a 128-bit mantissa. + const uint64_t mantissa_128[] = { + 0x419ea3bd35385e2d, + 0x52064cac828675b9, + 0x7343efebd1940993, + 0x1014ebe6c5f90bf8, + 0xd41a26e077774ef6, + 0x8920b098955522b4, + 0x55b46e5f5d5535b0, + 0xeb2189f734aa831d, + 0xa5e9ec7501d523e4, + 0x47b233c92125366e, + 0x999ec0bb696e840a, + 0xc00670ea43ca250d, + 0x380406926a5e5728, + 0xc605083704f5ecf2, + 0xf7864a44c633682e, + 0x7ab3ee6afbe0211d, + 0x5960ea05bad82964, + 0x6fb92487298e33bd, + 0xa5d3b6d479f8e056, + 0x8f48a4899877186c, + 0x331acdabfe94de87, + 0x9ff0c08b7f1d0b14, + 0x7ecf0ae5ee44dd9, + 0xc9e82cd9f69d6150, + 0xbe311c083a225cd2, + 0x6dbd630a48aaf406, + 0x92cbbccdad5b108, + 0x25bbf56008c58ea5, + 0xaf2af2b80af6f24e, + 0x1af5af660db4aee1, + 0x50d98d9fc890ed4d, + 0xe50ff107bab528a0, + 0x1e53ed49a96272c8, + 0x25e8e89c13bb0f7a, + 0x77b191618c54e9ac, + 0xd59df5b9ef6a2417, + 0x4b0573286b44ad1d, + 0x4ee367f9430aec32, + 0x229c41f793cda73f, + 0x6b43527578c1110f, + 0x830a13896b78aaa9, + 0x23cc986bc656d553, + 0x2cbfbe86b7ec8aa8, + 0x7bf7d71432f3d6a9, + 0xdaf5ccd93fb0cc53, + 0xd1b3400f8f9cff68, + 0x23100809b9c21fa1, + 0xabd40a0c2832a78a, + 0x16c90c8f323f516c, + 0xae3da7d97f6792e3, + 0x99cd11cfdf41779c, + 0x40405643d711d583, + 0x482835ea666b2572, + 0xda3243650005eecf, + 0x90bed43e40076a82, + 0x5a7744a6e804a291, + 0x711515d0a205cb36, + 0xd5a5b44ca873e03, + 0xe858790afe9486c2, + 0x626e974dbe39a872, + 0xfb0a3d212dc8128f, + 0x7ce66634bc9d0b99, + 0x1c1fffc1ebc44e80, + 0xa327ffb266b56220, + 0x4bf1ff9f0062baa8, + 0x6f773fc3603db4a9, + 0xcb550fb4384d21d3, + 0x7e2a53a146606a48, + 0x2eda7444cbfc426d, + 0xfa911155fefb5308, + 0x793555ab7eba27ca, + 0x4bc1558b2f3458de, + 0x9eb1aaedfb016f16, + 0x465e15a979c1cadc, + 0xbfacd89ec191ec9, + 0xcef980ec671f667b, + 0x82b7e12780e7401a, + 0xd1b2ecb8b0908810, + 0x861fa7e6dcb4aa15, + 0x67a791e093e1d49a, + 0xe0c8bb2c5c6d24e0, + 0x58fae9f773886e18, + 0xaf39a475506a899e, + 0x6d8406c952429603, + 0xc8e5087ba6d33b83, + 0xfb1e4a9a90880a64, + 0x5cf2eea09a55067f, + 0xf42faa48c0ea481e, + 0xf13b94daf124da26, + 0x76c53d08d6b70858, + 0x54768c4b0c64ca6e, + 0xa9942f5dcf7dfd09, + 0xd3f93b35435d7c4c, + 0xc47bc5014a1a6daf, + 0x359ab6419ca1091b, + 0xc30163d203c94b62, + 0x79e0de63425dcf1d, + 0x985915fc12f542e4, + 0x3e6f5b7b17b2939d, + 0xa705992ceecf9c42, + 0x50c6ff782a838353, + 0xa4f8bf5635246428, + 0x871b7795e136be99, + 0x28e2557b59846e3f, + 0x331aeada2fe589cf, + 0x3ff0d2c85def7621, + 0xfed077a756b53a9, + 0xd3e8495912c62894, + 0x64712dd7abbbd95c, + 0xbd8d794d96aacfb3, + 0xecf0d7a0fc5583a0, + 0xf41686c49db57244, + 0x311c2875c522ced5, + 0x7d633293366b828b, + 0xae5dff9c02033197, + 0xd9f57f830283fdfc, + 0xd072df63c324fd7b, + 0x4247cb9e59f71e6d, + 0x52d9be85f074e608, + 0x67902e276c921f8b, + 0xba1cd8a3db53b6, + 0x80e8a40eccd228a4, + 0x6122cd128006b2cd, + 0x796b805720085f81, + 0xcbe3303674053bb0, + 0xbedbfc4411068a9c, + 0xee92fb5515482d44, + 0x751bdd152d4d1c4a, + 0xd262d45a78a0635d, + 0x86fb897116c87c34, + 0xd45d35e6ae3d4da0, + 0x8974836059cca109, + 0x2bd1a438703fc94b, + 0x7b6306a34627ddcf, + 0x1a3bc84c17b1d542, + 0x20caba5f1d9e4a93, + 0x547eb47b7282ee9c, + 
0xe99e619a4f23aa43, + 0x6405fa00e2ec94d4, + 0xde83bc408dd3dd04, + 0x9624ab50b148d445, + 0x3badd624dd9b0957, + 0xe54ca5d70a80e5d6, + 0x5e9fcf4ccd211f4c, + 0x7647c3200069671f, + 0x29ecd9f40041e073, + 0xf468107100525890, + 0x7182148d4066eeb4, + 0xc6f14cd848405530, + 0xb8ada00e5a506a7c, + 0xa6d90811f0e4851c, + 0x908f4a166d1da663, + 0x9a598e4e043287fe, + 0x40eff1e1853f29fd, + 0xd12bee59e68ef47c, + 0x82bb74f8301958ce, + 0xe36a52363c1faf01, + 0xdc44e6c3cb279ac1, + 0x29ab103a5ef8c0b9, + 0x7415d448f6b6f0e7, + 0x111b495b3464ad21, + 0xcab10dd900beec34, + 0x3d5d514f40eea742, + 0xcb4a5a3112a5112, + 0x47f0e785eaba72ab, + 0x59ed216765690f56, + 0x306869c13ec3532c, + 0x1e414218c73a13fb, + 0xe5d1929ef90898fa, + 0xdf45f746b74abf39, + 0x6b8bba8c328eb783, + 0x66ea92f3f326564, + 0xc80a537b0efefebd, + 0xbd06742ce95f5f36, + 0x2c48113823b73704, + 0xf75a15862ca504c5, + 0x9a984d73dbe722fb, + 0xc13e60d0d2e0ebba, + 0x318df905079926a8, + 0xfdf17746497f7052, + 0xfeb6ea8bedefa633, + 0xfe64a52ee96b8fc0, + 0x3dfdce7aa3c673b0, + 0x6bea10ca65c084e, + 0x486e494fcff30a62, + 0x5a89dba3c3efccfa, + 0xf89629465a75e01c, + 0xf6bbb397f1135823, + 0x746aa07ded582e2c, + 0xa8c2a44eb4571cdc, + 0x92f34d62616ce413, + 0x77b020baf9c81d17, + 0xace1474dc1d122e, + 0xd819992132456ba, + 0x10e1fff697ed6c69, + 0xca8d3ffa1ef463c1, + 0xbd308ff8a6b17cb2, + 0xac7cb3f6d05ddbde, + 0x6bcdf07a423aa96b, + 0x86c16c98d2c953c6, + 0xe871c7bf077ba8b7, + 0x11471cd764ad4972, + 0xd598e40d3dd89bcf, + 0x4aff1d108d4ec2c3, + 0xcedf722a585139ba, + 0xc2974eb4ee658828, + 0x733d226229feea32, + 0x806357d5a3f525f, + 0xca07c2dcb0cf26f7, + 0xfc89b393dd02f0b5, + 0xbbac2078d443ace2, + 0xd54b944b84aa4c0d, + 0xa9e795e65d4df11, + 0x4d4617b5ff4a16d5, + 0x504bced1bf8e4e45, + 0xe45ec2862f71e1d6, + 0x5d767327bb4e5a4c, + 0x3a6a07f8d510f86f, + 0x890489f70a55368b, + 0x2b45ac74ccea842e, + 0x3b0b8bc90012929d, + 0x9ce6ebb40173744, + 0xcc420a6a101d0515, + 0x9fa946824a12232d, + 0x47939822dc96abf9, + 0x59787e2b93bc56f7, + 0x57eb4edb3c55b65a, + 0xede622920b6b23f1, + 0xe95fab368e45eced, + 0x11dbcb0218ebb414, + 0xd652bdc29f26a119, + 0x4be76d3346f0495f, + 0x6f70a4400c562ddb, + 0xcb4ccd500f6bb952, + 0x7e2000a41346a7a7, + 0x8ed400668c0c28c8, + 0x728900802f0f32fa, + 0x4f2b40a03ad2ffb9, + 0xe2f610c84987bfa8, + 0xdd9ca7d2df4d7c9, + 0x91503d1c79720dbb, + 0x75a44c6397ce912a, + 0xc986afbe3ee11aba, + 0xfbe85badce996168, + 0xfae27299423fb9c3, + 0xdccd879fc967d41a, + 0x5400e987bbc1c920, + 0x290123e9aab23b68, + 0xf9a0b6720aaf6521, + 0xf808e40e8d5b3e69, + 0xb60b1d1230b20e04, + 0xb1c6f22b5e6f48c2, + 0x1e38aeb6360b1af3, + 0x25c6da63c38de1b0, + 0x579c487e5a38ad0e, + 0x2d835a9df0c6d851, + 0xf8e431456cf88e65, + 0x1b8e9ecb641b58ff, + 0xe272467e3d222f3f, + 0x5b0ed81dcc6abb0f, + 0x98e947129fc2b4e9, + 0x3f2398d747b36224, + 0x8eec7f0d19a03aad, + 0x1953cf68300424ac, + 0x5fa8c3423c052dd7, + 0x3792f412cb06794d, + 0xe2bbd88bbee40bd0, + 0x5b6aceaeae9d0ec4, + 0xf245825a5a445275, + 0xeed6e2f0f0d56712, + 0x55464dd69685606b, + 0xaa97e14c3c26b886, + 0xd53dd99f4b3066a8, + 0xe546a8038efe4029, + 0xde98520472bdd033, + 0x963e66858f6d4440, + 0xdde7001379a44aa8, + 0x5560c018580d5d52, + 0xaab8f01e6e10b4a6, + 0xcab3961304ca70e8, + 0x3d607b97c5fd0d22, + 0x8cb89a7db77c506a, + 0x77f3608e92adb242, + 0x55f038b237591ed3, + 0x6b6c46dec52f6688, + 0x2323ac4b3b3da015, + 0xabec975e0a0d081a, + 0x96e7bd358c904a21, + 0x7e50d64177da2e54, + 0xdde50bd1d5d0b9e9, + 0x955e4ec64b44e864, + 0xbd5af13bef0b113e, + 0xecb1ad8aeacdd58e, + 0x67de18eda5814af2, + 0x80eacf948770ced7, + 0xa1258379a94d028d, + 0x96ee45813a04330, + 0x8bca9d6e188853fc, + 0x775ea264cf55347d, + 
0x95364afe032a819d, + 0x3a83ddbd83f52204, + 0xc4926a9672793542, + 0x75b7053c0f178293, + 0x5324c68b12dd6338, + 0xd3f6fc16ebca5e03, + 0x88f4bb1ca6bcf584, + 0x2b31e9e3d06c32e5, + 0x3aff322e62439fcf, + 0x9befeb9fad487c2, + 0x4c2ebe687989a9b3, + 0xf9d37014bf60a10, + 0x538484c19ef38c94, + 0x2865a5f206b06fb9, + 0xf93f87b7442e45d3, + 0xf78f69a51539d748, + 0xb573440e5a884d1b, + 0x31680a88f8953030, + 0xfdc20d2b36ba7c3d, + 0x3d32907604691b4c, + 0xa63f9a49c2c1b10f, + 0xfcf80dc33721d53, + 0xd3c36113404ea4a8, + 0x645a1cac083126e9, + 0x3d70a3d70a3d70a3, + 0xcccccccccccccccc, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x4000000000000000, + 0x5000000000000000, + 0xa400000000000000, + 0x4d00000000000000, + 0xf020000000000000, + 0x6c28000000000000, + 0xc732000000000000, + 0x3c7f400000000000, + 0x4b9f100000000000, + 0x1e86d40000000000, + 0x1314448000000000, + 0x17d955a000000000, + 0x5dcfab0800000000, + 0x5aa1cae500000000, + 0xf14a3d9e40000000, + 0x6d9ccd05d0000000, + 0xe4820023a2000000, + 0xdda2802c8a800000, + 0xd50b2037ad200000, + 0x4526f422cc340000, + 0x9670b12b7f410000, + 0x3c0cdd765f114000, + 0xa5880a69fb6ac800, + 0x8eea0d047a457a00, + 0x72a4904598d6d880, + 0x47a6da2b7f864750, + 0x999090b65f67d924, + 0xfff4b4e3f741cf6d, + 0xbff8f10e7a8921a4, + 0xaff72d52192b6a0d, + 0x9bf4f8a69f764490, + 0x2f236d04753d5b4, + 0x1d762422c946590, + 0x424d3ad2b7b97ef5, + 0xd2e0898765a7deb2, + 0x63cc55f49f88eb2f, + 0x3cbf6b71c76b25fb, + 0x8bef464e3945ef7a, + 0x97758bf0e3cbb5ac, + 0x3d52eeed1cbea317, + 0x4ca7aaa863ee4bdd, + 0x8fe8caa93e74ef6a, + 0xb3e2fd538e122b44, + 0x60dbbca87196b616, + 0xbc8955e946fe31cd, + 0x6babab6398bdbe41, + 0xc696963c7eed2dd1, + 0xfc1e1de5cf543ca2, + 0x3b25a55f43294bcb, + 0x49ef0eb713f39ebe, + 0x6e3569326c784337, + 0x49c2c37f07965404, + 0xdc33745ec97be906, + 0x69a028bb3ded71a3, + 0xc40832ea0d68ce0c, + 0xf50a3fa490c30190, + 0x792667c6da79e0fa, + 0x577001b891185938, + 0xed4c0226b55e6f86, + 0x544f8158315b05b4, + 0x696361ae3db1c721, + 0x3bc3a19cd1e38e9, + 0x4ab48a04065c723, + 0x62eb0d64283f9c76, + 0x3ba5d0bd324f8394, + 0xca8f44ec7ee36479, + 0x7e998b13cf4e1ecb, + 0x9e3fedd8c321a67e, + 0xc5cfe94ef3ea101e, + 0xbba1f1d158724a12, + 0x2a8a6e45ae8edc97, + 0xf52d09d71a3293bd, + 0x593c2626705f9c56, + 0x6f8b2fb00c77836c, + 0xb6dfb9c0f956447, + 0x4724bd4189bd5eac, + 0x58edec91ec2cb657, + 0x2f2967b66737e3ed, + 0xbd79e0d20082ee74, + 0xecd8590680a3aa11, + 0xe80e6f4820cc9495, + 0x3109058d147fdcdd, + 0xbd4b46f0599fd415, + 0x6c9e18ac7007c91a, + 0x3e2cf6bc604ddb0, + 0x84db8346b786151c, + 0xe612641865679a63, + 0x4fcb7e8f3f60c07e, + 0xe3be5e330f38f09d, + 0x5cadf5bfd3072cc5, + 0x73d9732fc7c8f7f6, + 0x2867e7fddcdd9afa, + 0xb281e1fd541501b8, + 0x1f225a7ca91a4226, + 0x3375788de9b06958, + 0x52d6b1641c83ae, + 0xc0678c5dbd23a49a, + 0xf840b7ba963646e0, + 0xb650e5a93bc3d898, + 0xa3e51f138ab4cebe, + 0xc66f336c36b10137, + 0xb80b0047445d4184, + 0xa60dc059157491e5, + 0x87c89837ad68db2f, + 0x29babe4598c311fb, + 0xf4296dd6fef3d67a, + 0x1899e4a65f58660c, + 0x5ec05dcff72e7f8f, + 0x76707543f4fa1f73, + 0x6a06494a791c53a8, + 0x487db9d17636892, + 0x45a9d2845d3c42b6, + 0xb8a2392ba45a9b2, + 0x8e6cac7768d7141e, + 0x3207d795430cd926, + 0x7f44e6bd49e807b8, + 0x5f16206c9c6209a6, + 0x36dba887c37a8c0f, + 0xc2494954da2c9789, + 0xf2db9baa10b7bd6c, + 0x6f92829494e5acc7, + 0xcb772339ba1f17f9, + 0xff2a760414536efb, + 0xfef5138519684aba, + 0x7eb258665fc25d69, + 0xef2f773ffbd97a61, + 0xaafb550ffacfd8fa, + 
0x95ba2a53f983cf38, + 0xdd945a747bf26183, + 0x94f971119aeef9e4, + 0x7a37cd5601aab85d, + 0xac62e055c10ab33a, + 0x577b986b314d6009, + 0xed5a7e85fda0b80b, + 0x14588f13be847307, + 0x596eb2d8ae258fc8, + 0x6fca5f8ed9aef3bb, + 0x25de7bb9480d5854, + 0xaf561aa79a10ae6a, + 0x1b2ba1518094da04, + 0x90fb44d2f05d0842, + 0x353a1607ac744a53, + 0x42889b8997915ce8, + 0x69956135febada11, + 0x43fab9837e699095, + 0x94f967e45e03f4bb, + 0x1d1be0eebac278f5, + 0x6462d92a69731732, + 0x7d7b8f7503cfdcfe, + 0x5cda735244c3d43e, + 0x3a0888136afa64a7, + 0x88aaa1845b8fdd0, + 0x8aad549e57273d45, + 0x36ac54e2f678864b, + 0x84576a1bb416a7dd, + 0x656d44a2a11c51d5, + 0x9f644ae5a4b1b325, + 0x873d5d9f0dde1fee, + 0xa90cb506d155a7ea, + 0x9a7f12442d588f2, + 0xc11ed6d538aeb2f, + 0x8f1668c8a86da5fa, + 0xf96e017d694487bc, + 0x37c981dcc395a9ac, + 0x85bbe253f47b1417, + 0x93956d7478ccec8e, + 0x387ac8d1970027b2, + 0x6997b05fcc0319e, + 0x441fece3bdf81f03, + 0xd527e81cad7626c3, + 0x8a71e223d8d3b074, + 0xf6872d5667844e49, + 0xb428f8ac016561db, + 0xe13336d701beba52, + 0xecc0024661173473, + 0x27f002d7f95d0190, + 0x31ec038df7b441f4, + 0x7e67047175a15271, + 0xf0062c6e984d386, + 0x52c07b78a3e60868, + 0xa7709a56ccdf8a82, + 0x88a66076400bb691, + 0x6acff893d00ea435, + 0x583f6b8c4124d43, + 0xc3727a337a8b704a, + 0x744f18c0592e4c5c, + 0x1162def06f79df73, + 0x8addcb5645ac2ba8, + 0x6d953e2bd7173692, + 0xc8fa8db6ccdd0437, + 0x1d9c9892400a22a2, + 0x2503beb6d00cab4b, + 0x2e44ae64840fd61d, + 0x5ceaecfed289e5d2, + 0x7425a83e872c5f47, + 0xd12f124e28f77719, + 0x82bd6b70d99aaa6f, + 0x636cc64d1001550b, + 0x3c47f7e05401aa4e, + 0x65acfaec34810a71, + 0x7f1839a741a14d0d, + 0x1ede48111209a050, + 0x934aed0aab460432, + 0xf81da84d5617853f, + 0x36251260ab9d668e, + 0xc1d72b7c6b426019, + 0xb24cf65b8612f81f, + 0xdee033f26797b627, + 0x169840ef017da3b1, + 0x8e1f289560ee864e, + 0xf1a6f2bab92a27e2, + 0xae10af696774b1db, + 0xacca6da1e0a8ef29, + 0x17fd090a58d32af3, + 0xddfc4b4cef07f5b0, + 0x4abdaf101564f98e, + 0x9d6d1ad41abe37f1, + 0x84c86189216dc5ed, + 0x32fd3cf5b4e49bb4, + 0x3fbc8c33221dc2a1, + 0xfabaf3feaa5334a, + 0x29cb4d87f2a7400e, + 0x743e20e9ef511012, + 0x914da9246b255416, + 0x1ad089b6c2f7548e, + 0xa184ac2473b529b1, + 0xc9e5d72d90a2741e, + 0x7e2fa67c7a658892, + 0xddbb901b98feeab7, + 0x552a74227f3ea565, + 0xd53a88958f87275f, + 0x8a892abaf368f137, + 0x2d2b7569b0432d85, + 0x9c3b29620e29fc73, + 0x8349f3ba91b47b8f, + 0x241c70a936219a73, + 0xed238cd383aa0110, + 0xf4363804324a40aa, + 0xb143c6053edcd0d5, + 0xdd94b7868e94050a, + 0xca7cf2b4191c8326, + 0xfd1c2f611f63a3f0, + 0xbc633b39673c8cec, + 0xd5be0503e085d813, + 0x4b2d8644d8a74e18, + 0xddf8e7d60ed1219e, + 0xcabb90e5c942b503, + 0x3d6a751f3b936243, + 0xcc512670a783ad4, + 0x27fb2b80668b24c5, + 0xb1f9f660802dedf6, + 0x5e7873f8a0396973, + 0xdb0b487b6423e1e8, + 0x91ce1a9a3d2cda62, + 0x7641a140cc7810fb, + 0xa9e904c87fcb0a9d, + 0x546345fa9fbdcd44, + 0xa97c177947ad4095, + 0x49ed8eabcccc485d, + 0x5c68f256bfff5a74, + 0x73832eec6fff3111, + 0xc831fd53c5ff7eab, + 0xba3e7ca8b77f5e55, + 0x28ce1bd2e55f35eb, + 0x7980d163cf5b81b3, + 0xd7e105bcc332621f, + 0x8dd9472bf3fefaa7, + 0xb14f98f6f0feb951, + 0x6ed1bf9a569f33d3, + 0xa862f80ec4700c8, + 0xcd27bb612758c0fa, + 0x8038d51cb897789c, + 0xe0470a63e6bd56c3, + 0x1858ccfce06cac74, + 0xf37801e0c43ebc8, + 0xd30560258f54e6ba, + 0x47c6b82ef32a2069, + 0x4cdc331d57fa5441, + 0xe0133fe4adf8e952, + 0x58180fddd97723a6, + 0x570f09eaa7ea7648, + }; + +} // namespace simdjson + +#endif // SIMDJSON_JSONCHARUTILS_H +/* end file src/jsoncharutils.h */ +/* simdprune_tables.h already included: #include "simdprune_tables.h" 
*/ + #if SIMDJSON_IMPLEMENTATION_ARM64 -/* begin file src/arm64/stage1_find_marks.h */ -#ifndef SIMDJSON_ARM64_STAGE1_FIND_MARKS_H -#define SIMDJSON_ARM64_STAGE1_FIND_MARKS_H +/* begin file src/arm64/implementation.cpp */ +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ +/* begin file src/arm64/dom_parser_implementation.h */ +#ifndef SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H +/* isadetection.h already included: #include "isadetection.h" */ + +namespace simdjson +{ + namespace arm64 + { + + /* begin file src/generic/dom_parser_implementation.h */ + // expectation: sizeof(scope_descriptor) = 64/8. + struct scope_descriptor + { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope + }; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO + typedef void *ret_address_t; +#else + typedef char ret_address_t; +#endif + + class dom_parser_implementation final : public internal::dom_parser_implementation + { + public: + /** Tape location of each open { or [ */ + std::unique_ptr<scope_descriptor[]> containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr<ret_address_t[]> ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; + }; + + /* begin file src/generic/stage1/allocate.h */ + namespace stage1 + { + namespace allocate + { + + // + // Allocates stage 1 internal state and outputs in the parser + // + really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) + { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset(new (std::nothrow) uint32_t[max_structures]); + if (!parser.structural_indexes) + { + return MEMALLOC; + } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; + } + + } // namespace allocate + } // namespace stage1 + /* end file src/generic/stage1/allocate.h */ + /* begin file src/generic/stage2/allocate.h */ + namespace stage2 + { + namespace allocate + { + + // + // Allocates stage 2 internal state and outputs in the parser + // + really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) + { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) + { + return MEMALLOC; + } + return 
SUCCESS; + } + + } // namespace allocate + } // namespace stage2 + /* end file src/generic/stage2/allocate.h */ + + really_inline dom_parser_implementation::dom_parser_implementation() {} + + // Leaving these here so they can be inlined if so desired + WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept + { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) + { + _capacity = 0; + return err; + } + _capacity = capacity; + return SUCCESS; + } + + WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept + { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) + { + _max_depth = 0; + return err; + } + _max_depth = max_depth; + return SUCCESS; + } + /* end file src/generic/stage2/allocate.h */ + + } // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ + +TARGET_HASWELL + +namespace simdjson +{ + namespace arm64 + { + + WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept + { + dst.reset(new (std::nothrow) dom_parser_implementation()); + if (!dst) + { + return MEMALLOC; + } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; + } + + } // namespace arm64 +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/arm64/dom_parser_implementation.cpp */ +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ +/* arm64/dom_parser_implementation.h already included: #include "arm64/dom_parser_implementation.h" */ + +// +// Stage 1 +// /* begin file src/arm64/bitmask.h */ #ifndef SIMDJSON_ARM64_BITMASK_H #define SIMDJSON_ARM64_BITMASK_H - /* begin file src/arm64/intrinsics.h */ #ifndef SIMDJSON_ARM64_INTRINSICS_H #define SIMDJSON_ARM64_INTRINSICS_H - // This should be the correct header whether // you use visual studio or other compilers. #include <arm_neon.h> #endif // SIMDJSON_ARM64_INTRINSICS_H /* end file src/arm64/intrinsics.h */ -namespace simdjson::arm64 { +namespace simdjson +{ + namespace arm64 + { -// -// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. -// -// For example, prefix_xor(00100100) == 00011100 -// -really_inline uint64_t prefix_xor(uint64_t bitmask) { - ///////////// - // We could do this with PMULL, but it is apparently slow. - // - //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension - //return vmull_p64(-1ULL, bitmask); - //#else - // Analysis by @sebpop: - // When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out - // in between other vector code, so effectively the extra cycles of the sequence do not matter - // because the GPR units are idle otherwise and the critical path is on the FP side. - // Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 ) - // and FP->GPR (2 cycles on N1 and 5 cycles on A72.) - /////////// - bitmask ^= bitmask << 1; - bitmask ^= bitmask << 2; - bitmask ^= bitmask << 4; - bitmask ^= bitmask << 8; - bitmask ^= bitmask << 16; - bitmask ^= bitmask << 32; - return bitmask; -} + // + // Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. 
+ // + // For example, prefix_xor(00100100) == 00011100 + // + really_inline uint64_t prefix_xor(uint64_t bitmask) + { + ///////////// + // We could do this with PMULL, but it is apparently slow. + // + //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension + //return vmull_p64(-1ULL, bitmask); + //#else + // Analysis by @sebpop: + // When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out + // in between other vector code, so effectively the extra cycles of the sequence do not matter + // because the GPR units are idle otherwise and the critical path is on the FP side. + // Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 ) + // and FP->GPR (2 cycles on N1 and 5 cycles on A72.) + /////////// + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; + } -} // namespace simdjson::arm64 + } // namespace arm64 +} // namespace simdjson UNTARGET_REGION #endif /* end file src/arm64/intrinsics.h */ /* begin file src/arm64/simd.h */ @@ -554,758 +2264,1019 @@ #ifndef SIMDJSON_ARM64_BITMANIPULATION_H #define SIMDJSON_ARM64_BITMANIPULATION_H /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ -namespace simdjson::arm64 { +namespace simdjson +{ + namespace arm64 + { -#ifndef _MSC_VER -// We sometimes call trailing_zero on inputs that are zero, -// but the algorithms do not end up using the returned value. -// Sadly, sanitizers are not smart enough to figure it out. -__attribute__((no_sanitize("undefined"))) // this is deliberate -#endif // _MSC_VER -/* result might be undefined when input_num is zero */ -really_inline int trailing_zeroes(uint64_t input_num) { + // We sometimes call trailing_zero on inputs that are zero, + // but the algorithms do not end up using the returned value. + // Sadly, sanitizers are not smart enough to figure it out. + NO_SANITIZE_UNDEFINED + really_inline int trailing_zeroes(uint64_t input_num) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + } -#ifdef _MSC_VER - unsigned long ret; - // Search the mask data from least significant bit (LSB) - // to the most significant bit (MSB) for a set bit (1). - _BitScanForward64(&ret, input_num); - return (int)ret; -#else - return __builtin_ctzll(input_num); -#endif // _MSC_VER + /* result might be undefined when input_num is zero */ + really_inline uint64_t clear_lowest_bit(uint64_t input_num) + { + return input_num & (input_num - 1); + } -} // namespace simdjson::arm64 - -/* result might be undefined when input_num is zero */ -really_inline uint64_t clear_lowest_bit(uint64_t input_num) { - return input_num & (input_num-1); -} - -/* result might be undefined when input_num is zero */ -really_inline int leading_zeroes(uint64_t input_num) { -#ifdef _MSC_VER - unsigned long leading_zero = 0; - // Search the mask data from most significant bit (MSB) - // to least significant bit (LSB) for a set bit (1). 
- if (_BitScanReverse64(&leading_zero, input_num)) - return (int)(63 - leading_zero); - else - return 64; + /* result might be undefined when input_num is zero */ + really_inline int leading_zeroes(uint64_t input_num) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; #else - return __builtin_clzll(input_num); -#endif// _MSC_VER -} + return __builtin_clzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + } -/* result might be undefined when input_num is zero */ -really_inline int count_ones(uint64_t input_num) { - return vaddv_u8(vcnt_u8((uint8x8_t)input_num)); -} + /* result might be undefined when input_num is zero */ + really_inline int count_ones(uint64_t input_num) + { + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); + } -really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { -#ifdef _MSC_VER - // todo: this might fail under visual studio for ARM - return _addcarry_u64(0, value1, value2, - reinterpret_cast<unsigned __int64 *>(result)); + really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; #else - return __builtin_uaddll_overflow(value1, value2, - (unsigned long long *)result); + return __builtin_uaddll_overflow(value1, value2, + (unsigned long long *)result); #endif -} + } -#ifdef _MSC_VER -#pragma intrinsic(_umul128) // todo: this might fail under visual studio for ARM -#endif - -really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { -#ifdef _MSC_VER - // todo: this might fail under visual studio for ARM - uint64_t high; - *result = _umul128(value1, value2, &high); - return high; + really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 * value2; + return !!__umulh(value1, value2); #else - return __builtin_umulll_overflow(value1, value2, (unsigned long long *)result); + return __builtin_umulll_overflow(value1, value2, (unsigned long long *)result); #endif -} + } -} // namespace simdjson::arm64 + } // namespace arm64 +} // namespace simdjson #endif // SIMDJSON_ARM64_BITMANIPULATION_H /* end file src/arm64/bitmanipulation.h */ /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ +#include <type_traits> -namespace simdjson::arm64::simd { +namespace simdjson +{ + namespace arm64 + { + namespace simd + { - template<typename T> - struct simd8; +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + namespace + { + // Start of private section with Visual Studio workaround - // - // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally. - // - template<typename T, typename Mask=simd8<bool>> - struct base_u8 { - uint8x16_t value; - static const int SIZE = sizeof(value); + /** + * make_uint8x16_t initializes a SIMD register (uint8x16_t). + * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...} + * is not recognized under Visual Studio! This is a workaround. + * Using a std::initializer_list<uint8_t> as a parameter resulted in + * inefficient code. With the current approach, if the parameters are + * compile-time constants, + * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}. 
+ * You should not use this function except for compile-time constants: + * it is not efficient. + */ + really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8, + uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12, + uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) + { + // Doing a load like so end ups generating worse code. + // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_u8(array); + uint8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_u8(x1, x, 0); + x = vsetq_lane_u8(x2, x, 1); + x = vsetq_lane_u8(x3, x, 2); + x = vsetq_lane_u8(x4, x, 3); + x = vsetq_lane_u8(x5, x, 4); + x = vsetq_lane_u8(x6, x, 5); + x = vsetq_lane_u8(x7, x, 6); + x = vsetq_lane_u8(x8, x, 7); + x = vsetq_lane_u8(x9, x, 8); + x = vsetq_lane_u8(x10, x, 9); + x = vsetq_lane_u8(x11, x, 10); + x = vsetq_lane_u8(x12, x, 11); + x = vsetq_lane_u8(x13, x, 12); + x = vsetq_lane_u8(x14, x, 13); + x = vsetq_lane_u8(x15, x, 14); + x = vsetq_lane_u8(x16, x, 15); + return x; + } - // Conversion from/to SIMD register - really_inline base_u8(const uint8x16_t _value) : value(_value) {} - really_inline operator const uint8x16_t&() const { return this->value; } - really_inline operator uint8x16_t&() { return this->value; } + // We have to do the same work for make_int8x16_t + really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4, + int8_t x5, int8_t x6, int8_t x7, int8_t x8, + int8_t x9, int8_t x10, int8_t x11, int8_t x12, + int8_t x13, int8_t x14, int8_t x15, int8_t x16) + { + // Doing a load like so end ups generating worse code. + // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_s8(array); + int8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_s8(x1, x, 0); + x = vsetq_lane_s8(x2, x, 1); + x = vsetq_lane_s8(x3, x, 2); + x = vsetq_lane_s8(x4, x, 3); + x = vsetq_lane_s8(x5, x, 4); + x = vsetq_lane_s8(x6, x, 5); + x = vsetq_lane_s8(x7, x, 6); + x = vsetq_lane_s8(x8, x, 7); + x = vsetq_lane_s8(x9, x, 8); + x = vsetq_lane_s8(x10, x, 9); + x = vsetq_lane_s8(x11, x, 10); + x = vsetq_lane_s8(x12, x, 11); + x = vsetq_lane_s8(x13, x, 12); + x = vsetq_lane_s8(x14, x, 13); + x = vsetq_lane_s8(x15, x, 14); + x = vsetq_lane_s8(x16, x, 15); + return x; + } - // Bit operations - really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); } - really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); } - really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); } - really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); } - really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } - really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast | other; return *this_cast; } - really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast & other; return *this_cast; } - really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast ^ other; return *this_cast; } + // End of private section with Visual Studio workaround + } // namespace +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO - really_inline Mask operator==(const simd8<T> other) const { 
return vceqq_u8(*this, other); } + template <typename T> + struct simd8; - template<int N=1> - really_inline simd8<T> prev(const simd8<T> prev_chunk) const { - return vextq_u8(prev_chunk, *this, 16 - N); - } - }; + // + // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally. + // + template <typename T, typename Mask = simd8<bool>> + struct base_u8 + { + uint8x16_t value; + static const int SIZE = sizeof(value); - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8<bool>: base_u8<bool> { - typedef uint16_t bitmask_t; - typedef uint32_t bitmask2_t; + // Conversion from/to SIMD register + really_inline base_u8(const uint8x16_t _value) : value(_value) {} + really_inline operator const uint8x16_t &() const { return this->value; } + really_inline operator uint8x16_t &() { return this->value; } - static really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(-(!!_value)); } + // Bit operations + really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); } + really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); } + really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); } + really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); } + really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } + really_inline simd8<T> &operator|=(const simd8<T> other) + { + auto this_cast = (simd8<T> *)this; + *this_cast = *this_cast | other; + return *this_cast; + } + really_inline simd8<T> &operator&=(const simd8<T> other) + { + auto this_cast = (simd8<T> *)this; + *this_cast = *this_cast & other; + return *this_cast; + } + really_inline simd8<T> &operator^=(const simd8<T> other) + { + auto this_cast = (simd8<T> *)this; + *this_cast = *this_cast ^ other; + return *this_cast; + } - really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {} - // False constructor - really_inline simd8() : simd8(vdupq_n_u8(0)) {} - // Splat constructor - really_inline simd8(bool _value) : simd8(splat(_value)) {} + really_inline Mask operator==(const simd8<T> other) const { return vceqq_u8(*this, other); } - // We return uint32_t instead of uint16_t because that seems to be more efficient for most - // purposes (cutting it down to uint16_t costs performance in some compilers). 
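Both the old and the new simd8<bool>::to_bitmask (the new one appears a little further down, with a Visual Studio guard around the constant) compute the same thing: one bit per lane of the 16-byte mask. Each lane of a comparison result is 0x00 or 0xFF, so ANDing with the repeated {0x01, 0x02, ..., 0x80} pattern leaves every set lane holding its own power of two, and the three pairwise adds then fold lanes 0-7 into the low byte and lanes 8-15 into the high byte of the result. Here is a scalar model of that result; to_bitmask_scalar is illustrative only, not simdjson code.

#include <cassert>
#include <cstdint>

// Collapses 16 lane masks (0x00 or 0xFF per byte, as produced by vceqq_u8
// and friends) into a 16-bit bitmask: bit i is set iff lane i is set.
static uint32_t to_bitmask_scalar(const uint8_t lanes[16]) {
  static const uint8_t bit_mask[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                       0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint32_t result = 0;
  for (int i = 0; i < 16; i++) {
    // Lanes 0-7 contribute to the low byte, lanes 8-15 to the high byte,
    // which is exactly what the vpaddq_u8 reduction produces.
    if (lanes[i]) { result |= uint32_t(bit_mask[i]) << (i < 8 ? 0 : 8); }
  }
  return result;
}

int main() {
  uint8_t lanes[16] = {0xFF, 0, 0, 0xFF, 0, 0, 0, 0,
                       0, 0xFF, 0, 0, 0, 0, 0, 0xFF};
  assert(to_bitmask_scalar(lanes) == ((1u << 0) | (1u << 3) | (1u << 9) | (1u << 15)));
  return 0;
}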
- really_inline uint32_t to_bitmask() const { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - auto minput = *this & bit_mask; - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); - } - really_inline bool any() const { return vmaxvq_u8(*this) != 0; } - }; + template <int N = 1> + really_inline simd8<T> prev(const simd8<T> prev_chunk) const + { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; - // Unsigned bytes - template<> - struct simd8<uint8_t>: base_u8<uint8_t> { - static really_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); } - static really_inline uint8x16_t zero() { return vdupq_n_u8(0); } - static really_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); } + // SIMD byte mask type (returned by things like eq and gt) + template <> + struct simd8<bool> : base_u8<bool> + { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; - really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {} - // Zero constructor - really_inline simd8() : simd8(zero()) {} - // Array constructor - really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} - // Splat constructor - really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Member-by-member initialization - really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) : simd8(uint8x16_t{ - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - }) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<uint8_t> repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8<uint8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + static really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } - // Store to array - really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {} + // False constructor + really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + really_inline simd8(bool _value) : simd8(splat(_value)) {} - // Saturated math - really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); } - really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); } + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). 
+ really_inline uint32_t to_bitmask() const + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + really_inline bool any() const { return vmaxvq_u8(*this) != 0; } + }; - // Addition/subtraction are the same for signed and unsigned - really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); } - really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); } - really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; } - really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; } + // Unsigned bytes + template <> + struct simd8<uint8_t> : base_u8<uint8_t> + { + static really_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); } + static really_inline uint8x16_t zero() { return vdupq_n_u8(0); } + static really_inline uint8x16_t load(const uint8_t *values) { return vld1q_u8(values); } - // Order-specific operations - really_inline uint8_t max() const { return vmaxvq_u8(*this); } - really_inline uint8_t min() const { return vminvq_u8(*this); } - really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); } - really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return vminq_u8(*this, other); } - really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); } - really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); } - really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); } - really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); } - // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. - really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); } - // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. 
- really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); } + really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {} + // Zero constructor + really_inline simd8() : simd8(zero()) {} + // Array constructor + really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) : simd8(make_uint8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15)) + { + } +#else + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15}) + { + } +#endif - // Bit-specific operations - really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); } - really_inline bool any_bits_set_anywhere() const { return this->max() != 0; } - really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); } - template<int N> - really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); } - template<int N> - really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); } + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<uint8_t> repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + { + return simd8<uint8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template<typename L> - really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { - return lookup_table.apply_lookup_16_to(*this); - } + // Store to array + really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + // Saturated math + really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); } + really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); } - // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). - // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes - // get written. - // Design consideration: it seems like a function with the - // signature simd8<L> compress(uint16_t mask) would be - // sensible, but the AVX ISA makes this kind of approach difficult. 
- template<typename L> - really_inline void compress(uint16_t mask, L * output) const { - // this particular implementation was inspired by work done by @animetosho - // we do it in two steps, first 8 bytes and then second 8 bytes - uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits - uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits - // next line just loads the 64-bit values thintable_epi8[mask1] and - // thintable_epi8[mask2] into a 128-bit register, using only - // two instructions on most compilers. - uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]}; - uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64); - // we increment by 0x08 the second half of the mask - uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; - shufmask = vaddq_u8(shufmask, inc); - // this is the version "nearly pruned" - uint8x16_t pruned = vqtbl1q_u8(*this, shufmask); - // we still need to put the two halves together. - // we compute the popcount of the first half: - int pop1 = BitsSetTable256mul2[mask1]; - // then load the corresponding mask, what it does is to write - // only the first pop1 bytes from the first 8 bytes, and then - // it fills in with the bytes from the second 8 bytes + some filling - // at the end. - uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8)); - uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); - vst1q_u8((uint8_t*) output, answer); - } + // Addition/subtraction are the same for signed and unsigned + really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); } + really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); } + really_inline simd8<uint8_t> &operator+=(const simd8<uint8_t> other) + { + *this = *this + other; + return *this; + } + really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) + { + *this = *this - other; + return *this; + } - template<typename L> - really_inline simd8<L> lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8<L>::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } + // Order-specific operations + really_inline uint8_t max() const { return vmaxvq_u8(*this); } + really_inline uint8_t min() const { return vminvq_u8(*this); } + really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); } + really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return vminq_u8(*this, other); } + really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); } + really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); } + really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); } + really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. 
+ really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); } - template<typename T> - really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) { - return vqtbl1q_u8(*this, simd8<uint8_t>(original)); - } - }; + // Bit-specific operations + really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); } + really_inline bool any_bits_set_anywhere() const { return this->max() != 0; } + really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); } + template <int N> + really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); } + template <int N> + really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); } - // Signed bytes - template<> - struct simd8<int8_t> { - int8x16_t value; + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template <typename L> + really_inline simd8<L> lookup_16(simd8<L> lookup_table) const + { + return lookup_table.apply_lookup_16_to(*this); + } - static really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); } - static really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); } - static really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); } + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8<L> compress(uint16_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template <typename L> + really_inline void compress(uint16_t mask, L *output) const + { + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]}; + uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64); + // we increment by 0x08 the second half of the mask +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08); +#else + uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; +#endif + shufmask = vaddq_u8(shufmask, inc); + // this is the version "nearly pruned" + uint8x16_t pruned = vqtbl1q_u8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
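        // Editorial note, not part of the simdjson source: read end to end, compress()
        // behaves like the following scalar loop, where input[i] stands for lane i of
        // this vector and bytes whose mask bit is 0 are kept in order while bytes whose
        // mask bit is 1 are dropped:
        //   int j = 0;
        //   for (int i = 0; i < 16; i++) {
        //     if (!(mask & (uint16_t(1) << i))) { output[j++] = input[i]; }
        //   }
        // As the doc comment above says, only the first 16 - count_ones(mask) output
        // bytes are meaningful, but a full 16 bytes are always written.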
+ uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8)); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8((uint8_t *)output, answer); + } - // Conversion from/to SIMD register - really_inline simd8(const int8x16_t _value) : value{_value} {} - really_inline operator const int8x16_t&() const { return this->value; } - really_inline operator int8x16_t&() { return this->value; } + template <typename L> + really_inline simd8<L> lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const + { + return lookup_16(simd8<L>::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15)); + } - // Zero constructor - really_inline simd8() : simd8(zero()) {} - // Splat constructor - really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - really_inline simd8(const int8_t* values) : simd8(load(values)) {} - // Member-by-member initialization - really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) : simd8(int8x16_t{ - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - }) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<int8_t> repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8<int8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + template <typename T> + really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) + { + return vqtbl1q_u8(*this, simd8<uint8_t>(original)); + } + }; - // Store to array - really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); } + // Signed bytes + template <> + struct simd8<int8_t> + { + int8x16_t value; - // Explicit conversion to/from unsigned - really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} - really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); } + static really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); } + static really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); } + static really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); } - // Math - really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); } - really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); } - really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; } - really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; } + // Conversion from/to SIMD register + really_inline simd8(const int8x16_t _value) : value{_value} {} + really_inline operator const int8x16_t &() const { return this->value; } + really_inline operator int8x16_t &() { return this->value; } - // Order-sensitive comparisons - really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return 
vmaxq_s8(*this, other); } - really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return vminq_s8(*this, other); } - really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); } - really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); } - really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); } + // Zero constructor + really_inline simd8() : simd8(zero()) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) : simd8(make_int8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15)) + { + } +#else + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15}) + { + } +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<int8_t> repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) + { + return simd8<int8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } - template<int N=1> - really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const { - return vextq_s8(prev_chunk, *this, 16 - N); - } + // Store to array + really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); } - // Perform a lookup assuming no value is larger than 16 - template<typename L> - really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { - return lookup_table.apply_lookup_16_to(*this); - } - template<typename L> - really_inline simd8<L> lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8<L>::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. 
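        // Editorial sketch, not from the source: the guard alluded to above could
        // hypothetically be expressed with std::enable_if / std::is_same (requiring
        // <type_traits>), removing the constructor when uint8x16_t and int8x16_t are
        // the same type, at some cost in readability:
        //   template <typename U = uint8x16_t,
        //             typename std::enable_if<!std::is_same<U, int8x16_t>::value, int>::type = 0>
        //   really_inline explicit simd8(const U other) : simd8(vreinterpretq_s8_u8(other)) {}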
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO + really_inline explicit simd8(const uint8x16_t other) : simd8(vreinterpretq_s8_u8(other)) + { + } +#endif + really_inline explicit operator simd8<uint8_t>() const + { + return vreinterpretq_u8_s8(this->value); + } - template<typename T> - really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) { - return vqtbl1q_s8(*this, simd8<uint8_t>(original)); - } - }; + // Math + really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); } + really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); } + really_inline simd8<int8_t> &operator+=(const simd8<int8_t> other) + { + *this = *this + other; + return *this; + } + really_inline simd8<int8_t> &operator-=(const simd8<int8_t> other) + { + *this = *this - other; + return *this; + } - template<typename T> - struct simd8x64 { - static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); - const simd8<T> chunks[NUM_CHUNKS]; + // Order-sensitive comparisons + really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); } + really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return vminq_s8(*this, other); } + really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); } + really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); } + really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); } - really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {} - really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {} + template <int N = 1> + really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const + { + return vextq_s8(prev_chunk, *this, 16 - N); + } - really_inline void store(T ptr[64]) const { - this->chunks[0].store(ptr+sizeof(simd8<T>)*0); - this->chunks[1].store(ptr+sizeof(simd8<T>)*1); - this->chunks[2].store(ptr+sizeof(simd8<T>)*2); - this->chunks[3].store(ptr+sizeof(simd8<T>)*3); - } + // Perform a lookup assuming no value is larger than 16 + template <typename L> + really_inline simd8<L> lookup_16(simd8<L> lookup_table) const + { + return lookup_table.apply_lookup_16_to(*this); + } + template <typename L> + really_inline simd8<L> lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const + { + return lookup_16(simd8<L>::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15)); + } - really_inline void compress(uint64_t mask, T * output) const { - this->chunks[0].compress(mask, output); - this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF)); - this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF)); - this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); - } + template <typename T> + really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) + { + return vqtbl1q_s8(*this, 
simd8<uint8_t>(original)); + } + }; - template <typename F> - static really_inline void each_index(F const& each) { - each(0); - each(1); - each(2); - each(3); - } + template <typename T> + struct simd8x64 + { + static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); + const simd8<T> chunks[NUM_CHUNKS]; - template <typename F> - really_inline void each(F const& each_chunk) const - { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - each_chunk(this->chunks[2]); - each_chunk(this->chunks[3]); - } + really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {} + really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16), simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {} - template <typename R=bool, typename F> - really_inline simd8x64<R> map(F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]), - map_chunk(this->chunks[2]), - map_chunk(this->chunks[3]) - ); - } + really_inline void store(T ptr[64]) const + { + this->chunks[0].store(ptr + sizeof(simd8<T>) * 0); + this->chunks[1].store(ptr + sizeof(simd8<T>) * 1); + this->chunks[2].store(ptr + sizeof(simd8<T>) * 2); + this->chunks[3].store(ptr + sizeof(simd8<T>) * 3); + } - template <typename R=bool, typename F> - really_inline simd8x64<R> map(const simd8x64<T> b, F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]), - map_chunk(this->chunks[2], b.chunks[2]), - map_chunk(this->chunks[3], b.chunks[3]) - ); - } + really_inline void compress(uint64_t mask, T *output) const + { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + } - template <typename F> - really_inline simd8<T> reduce(F const& reduce_pair) const { - return reduce_pair( - reduce_pair(this->chunks[0], this->chunks[1]), - reduce_pair(this->chunks[2], this->chunks[3]) - ); - } + template <typename F> + static really_inline void each_index(F const &each) + { + each(0); + each(1); + each(2); + each(3); + } - really_inline uint64_t to_bitmask() const { - const uint8x16_t bit_mask = { - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - }; - // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. - uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); - uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); - } + really_inline uint64_t to_bitmask() const + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. 
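      // Editorial note, not part of the simdjson source: after the vpaddq_u8 reduction
      // below, byte j of the low half of sum0 holds the 8-bit mask of input bytes
      // 8*j .. 8*j+7, so reading the low 64-bit lane yields a bitmask in which bit i
      // corresponds to input byte i of the 64-byte block.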
+ uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); + uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } - really_inline simd8x64<T> bit_or(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a | mask; } ); - } + really_inline simd8x64<T> bit_or(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<T>( + this->chunks[0] | mask, + this->chunks[1] | mask, + this->chunks[2] | mask, + this->chunks[3] | mask); + } - really_inline uint64_t eq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); - } + really_inline uint64_t eq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask) + .to_bitmask(); + } - really_inline uint64_t lteq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); - } - }; // struct simd8x64<T> + really_inline uint64_t lteq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask) + .to_bitmask(); + } + }; // struct simd8x64<T> -} // namespace simdjson::arm64::simd + } // namespace simd + } // namespace arm64 +} // namespace simdjson #endif // SIMDJSON_ARM64_SIMD_H /* end file src/arm64/bitmanipulation.h */ /* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ -namespace simdjson::arm64 { +namespace simdjson +{ + namespace arm64 + { -using namespace simd; + using namespace simd; -struct json_character_block { - static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in); + struct json_character_block + { + static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in); - really_inline uint64_t whitespace() const { return _whitespace; } - really_inline uint64_t op() const { return _op; } - really_inline uint64_t scalar() { return ~(op() | whitespace()); } + really_inline uint64_t whitespace() const { return _whitespace; } + really_inline uint64_t op() const { return _op; } + really_inline uint64_t scalar() { return ~(op() | whitespace()); } - uint64_t _whitespace; - uint64_t _op; -}; + uint64_t _whitespace; + uint64_t _op; + }; -really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) { - auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) { - auto nib_lo = chunk & 0xf; - auto nib_hi = chunk.shr<4>(); - auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); - auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); - return shuf_lo & shuf_hi; - }); + really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) + { + // Functional programming causes trouble with Visual Studio. 
+ // Keeping this version in comments since it is much nicer: + // auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) { + // auto nib_lo = chunk & 0xf; + // auto nib_hi = chunk.shr<4>(); + // auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + // auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + // return shuf_lo & shuf_hi; + // }); + const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + auto v = simd8x64<uint8_t>( + (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2), + (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2), + (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2), + (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)); - // We compute whitespace and op separately. If the code later only use one or the - // other, given the fact that all functions are aggressively inlined, we can - // hope that useless computations will be omitted. This is namely case when - // minifying (we only need whitespace). *However* if we only need spaces, - // it is likely that we will still compute 'v' above with two lookup_16: one - // could do it a bit cheaper. This is in contrast with the x64 implementations - // where we can, efficiently, do the white space and structural matching - // separately. One reason for this difference is that on ARM NEON, the table - // lookups either zero or leave unchanged the characters exceeding 0xF whereas - // on x64, the equivalent instruction (pshufb) automatically applies a mask, - // ignoring the 4 most significant bits. Thus the x64 implementation is - // optimized differently. This being said, if you use this code strictly - // just for minification (or just to identify the structural characters), - // there is a small untaken optimization opportunity here. We deliberately - // do not pick it up. + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). *However* if we only need spaces, + // it is likely that we will still compute 'v' above with two lookup_16: one + // could do it a bit cheaper. This is in contrast with the x64 implementations + // where we can, efficiently, do the white space and structural matching + // separately. One reason for this difference is that on ARM NEON, the table + // lookups either zero or leave unchanged the characters exceeding 0xF whereas + // on x64, the equivalent instruction (pshufb) automatically applies a mask, + // ignoring the 4 most significant bits. Thus the x64 implementation is + // optimized differently. This being said, if you use this code strictly + // just for minification (or just to identify the structural characters), + // there is a small untaken optimization opportunity here. We deliberately + // do not pick it up. 
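An editorial aside, not part of the vendored file: the two nibble tables above can be read through a hypothetical scalar helper, classify_char, which mirrors table1/table2 and the bit assignments that op() and whitespace() test below.

#include <cstdint>

// table1 is indexed by the low nibble, table2 by the high nibble; a byte is a structural
// operator when the AND of its two lookups intersects 0x7, and JSON whitespace when it
// intersects 0x18.
static inline uint8_t classify_char(uint8_t c) {
  static const uint8_t table1[16] = {16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
  static const uint8_t table2[16] = {8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
  return table1[c & 0xf] & table2[c >> 4];
}
// For example: classify_char('{') == 0x01 and classify_char(':') == 0x04 (operator bits),
// classify_char(' ') == 0x10 and classify_char('\t') == 0x08 (whitespace bits),
// classify_char('a') == 0x00 (plain scalar character).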
- uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask(); - uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask(); - return { whitespace, op }; -} + uint64_t op = simd8x64<bool>( + v.chunks[0].any_bits_set(0x7), + v.chunks[1].any_bits_set(0x7), + v.chunks[2].any_bits_set(0x7), + v.chunks[3].any_bits_set(0x7)) + .to_bitmask(); -really_inline bool is_ascii(simd8x64<uint8_t> input) { - simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; }); - return bits.max() < 0b10000000u; -} + uint64_t whitespace = simd8x64<bool>( + v.chunks[0].any_bits_set(0x18), + v.chunks[1].any_bits_set(0x18), + v.chunks[2].any_bits_set(0x18), + v.chunks[3].any_bits_set(0x18)) + .to_bitmask(); -really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) { - simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u); - simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); - // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. - // This will work fine because we only have to report errors for cases with 0-1 lead bytes. - // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is - // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. - // The error will be detected there. - return is_second_byte ^ is_third_byte ^ is_fourth_byte; -} + return {whitespace, op}; + } -/* begin file src/generic/buf_block_reader.h */ -// Walks through a buffer in block-sized increments, loading the last part with spaces -template<size_t STEP_SIZE> -struct buf_block_reader { -public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + really_inline bool is_ascii(simd8x64<uint8_t> input) + { + simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]); + return bits.max() < 0b10000000u; + } -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text(const simd8x64<uint8_t> in) { - static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1); - in.store((uint8_t*)buf); - for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64<uint8_t>)] = '\0'; - return buf; -} + really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. 
+ // This will work fine because we only have to report errors for cases with 0-1 lead bytes. + // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; + } -UNUSED static char * format_mask(uint64_t mask) { - static char *buf = (char*)malloc(64 + 1); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; -} -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/json_string_scanner.h */ -namespace stage1 { + really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; + } -struct json_string_block { - // Escaped characters (characters following an escape() character) - really_inline uint64_t escaped() const { return _escaped; } - // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \) - really_inline uint64_t escape() const { return _backslash & ~_escaped; } - // Real (non-backslashed) quotes - really_inline uint64_t quote() const { return _quote; } - // Start quotes of strings - really_inline uint64_t string_end() const { return _quote & _in_string; } - // End quotes of strings - really_inline uint64_t string_start() const { return _quote & ~_in_string; } - // Only characters inside the string (not including the quotes) - really_inline uint64_t string_content() const { return _in_string & ~_quote; } - // Return a mask of whether the given characters are inside a string (only works on non-quotes) - really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } - // Return a mask of whether the given characters are inside a string (only works on non-quotes) - really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } - // Tail of string (everything except the start quote) - really_inline uint64_t string_tail() const { return _in_string ^ _quote; } + /* begin file src/generic/stage1/buf_block_reader.h */ + // Walks through a buffer in block-sized increments, loading the last part with spaces + template <size_t STEP_SIZE> + struct buf_block_reader + { + public: + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
+ */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); - // backslash characters - uint64_t _backslash; - // escaped characters (backslashed--does not include the hex characters after \u) - uint64_t _escaped; - // real quotes (non-backslashed ones) - uint64_t _quote; - // string characters (includes start quote but not end quote) - uint64_t _in_string; -}; + private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; + }; -// Scans blocks for string characters, storing the state necessary to do so -class json_string_scanner { -public: - really_inline json_string_block next(const simd::simd8x64<uint8_t> in); - really_inline error_code finish(bool streaming); + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text_64(const uint8_t *text) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; + } -private: - really_inline uint64_t find_escaped(uint64_t escape); + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text(const simd8x64<uint8_t> in) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + in.store((uint8_t *)buf); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + if (buf[i] < ' ') + { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; + } - // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). - uint64_t prev_in_string = 0ULL; - // Whether the first character of the next iteration is escaped. - uint64_t prev_escaped = 0ULL; -}; + UNUSED static char *format_mask(uint64_t mask) + { + static char *buf = (char *)malloc(64 + 1); + for (size_t i = 0; i < 64; i++) + { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; + } -// -// Finds escaped characters (characters following \). -// -// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). -// -// Does this by: -// - Shift the escape mask to get potentially escaped characters (characters after backslashes). -// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) -// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) -// -// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all -// escape sequences, filters out the ones that start on even bits, and adds that to the mask of -// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since -// the start bit causes a carry), and leaves even-bit sequences alone. 
-// -// Example: -// -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape -// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape -// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later -// invert_mask | | cxxx c xx c| even_seq << 1 -// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit -// escaped | x | x x x x x x x x | -// desired | x | x x x x x x x x | -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// -really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { - // If there was overflow, pretend the first character isn't a backslash - backslash &= ~prev_escaped; - uint64_t follows_escape = backslash << 1 | prev_escaped; + template <size_t STEP_SIZE> + really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - // Get sequences starting on even bits by clearing out the odd series using + - const uint64_t even_bits = 0x5555555555555555ULL; - uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; - uint64_t sequences_starting_on_even_bits; - prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); - uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; } - // Mask every other backslashed character as an escaped character - // Flip the mask for sequences that start on even bits, to correct them - return (even_bits ^ invert_mask) & follows_escape; -} + template <size_t STEP_SIZE> + really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const + { + return idx < lenminusstep; + } -// -// Return a mask of all string characters plus end quotes. -// -// prev_escaped is overflow saying whether the next character is escaped. -// prev_in_string is overflow saying whether we're still in a string. -// -// Backslash sequences outside of quotes will be detected in stage 2. -// -really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) { - const uint64_t backslash = in.eq('\\'); - const uint64_t escaped = find_escaped(backslash); - const uint64_t quote = in.eq('"') & ~escaped; - // prefix_xor flips on bits inside the string (and flips off the end quote). - // Then we xor with prev_in_string: if we were in a string already, its effect is flipped - // (characters inside strings are outside, and characters outside strings are inside). - const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; - // right shift of a signed value expected to be well-defined and standard - // compliant as of C++20, John Regher from Utah U. says this is fine code - prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63); - // Use ^ to turn the beginning quote off, and the end quote on. 
- return { - backslash, - escaped, - quote, - in_string - }; -} + template <size_t STEP_SIZE> + really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const + { + return &buf[idx]; + } -really_inline error_code json_string_scanner::finish(bool streaming) { - if (prev_in_string and (not streaming)) { - return UNCLOSED_STRING; - } - return SUCCESS; -} + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const + { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + memcpy(dst, buf + idx, len - idx); + return len - idx; + } -} // namespace stage1 -/* end file src/generic/json_string_scanner.h */ -/* begin file src/generic/json_scanner.h */ -namespace stage1 { + template <size_t STEP_SIZE> + really_inline void buf_block_reader<STEP_SIZE>::advance() + { + idx += STEP_SIZE; + } + /* end file src/generic/stage1/buf_block_reader.h */ + /* begin file src/generic/stage1/json_string_scanner.h */ + namespace stage1 + { -/** + struct json_string_block + { + // Escaped characters (characters following an escape() character) + really_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \) + really_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + really_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + really_inline uint64_t string_end() const { return _quote & _in_string; } + // End quotes of strings + really_inline uint64_t string_start() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + really_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + really_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; + }; + + // Scans blocks for string characters, storing the state necessary to do so + class json_string_scanner + { + public: + really_inline json_string_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); + + private: + // Intended to be defined by the implementation + really_inline uint64_t find_escaped(uint64_t escape); + really_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; + }; + + // + // Finds escaped characters (characters following \). + // + // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). 
+ // + // Does this by: + // - Shift the escape mask to get potentially escaped characters (characters after backslashes). + // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) + // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) + // + // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all + // escape sequences, filters out the ones that start on even bits, and adds that to the mask of + // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since + // the start bit causes a carry), and leaves even-bit sequences alone. + // + // Example: + // + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape + // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape + // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later + // invert_mask | | cxxx c xx c| even_seq << 1 + // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit + // escaped | x | x x x x x x x x | + // desired | x | x x x x x x x x | + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // + really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) + { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; + } + + // + // Return a mask of all string characters plus end quotes. + // + // prev_escaped is overflow saying whether the next character is escaped. + // prev_in_string is overflow saying whether we're still in a string. + // + // Backslash sequences outside of quotes will be detected in stage 2. + // + really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) + { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. 
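        // Editorial worked example, not from the source: for the 7 bytes  ab"cd"e  with
        // bit 0 as the first byte and no carried-in state,
        //   quote         = 0b0100100   (bits 2 and 5)
        //   in_string     = 0b0011100   (prefix_xor turns on the opening quote and "cd",
        //                                and leaves the closing quote off)
        //   string_tail() = in_string ^ quote = 0b0111000 (content plus closing quote,
        //                                opening quote off).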
+ return { + backslash, + escaped, + quote, + in_string}; + } + + really_inline error_code json_string_scanner::finish(bool streaming) + { + if (prev_in_string and (not streaming)) + { + return UNCLOSED_STRING; + } + return SUCCESS; + } + + } // namespace stage1 + /* end file src/generic/stage1/json_string_scanner.h */ + /* begin file src/generic/stage1/json_scanner.h */ + namespace stage1 + { + + /** * A block of scanned json, with information on operators and scalars. */ -struct json_block { -public: - /** The start of structurals */ - really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } - /** All JSON whitespace (i.e. not in a string) */ - really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } + struct json_block + { + public: + /** The start of structurals */ + really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } - // Helpers + // Helpers - /** Whether the given characters are inside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } - /** Whether the given characters are outside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } + /** Whether the given characters are inside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } - // string and escape characters - json_string_block _string; - // whitespace, operators, scalars - json_character_block _characters; - // whether the previous character was a scalar - uint64_t _follows_potential_scalar; -private: - // Potential structurals (i.e. disregarding strings) + // string and escape characters + json_string_block _string; + // whitespace, operators, scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_scalar; - /** operators plus scalar starts like 123, true and "abc" */ - really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } - /** the start of non-operator runs, like 123, true and "abc" */ - really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } - /** whether the given character is immediately after a non-operator like 123, true or " */ - really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } -}; + private: + // Potential structurals (i.e. 
disregarding strings) -/** + /** operators plus scalar starts like 123, true and "abc" */ + really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } + /** the start of non-operator runs, like 123, true and "abc" */ + really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } + /** whether the given character is immediately after a non-operator like 123, true or " */ + really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } + }; + + /** * Scans JSON for important bits: operators, strings, and scalars. * * The scanner starts by calculating two distinct things: * - string characters (taking \" into account) * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") @@ -1313,7884 +3284,10004 @@ * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: * in particular, the operator/scalar bit will find plenty of things that are actually part of * strings. When we're done, json_block will fuse the two together by masking out tokens that are * part of a string. */ -class json_scanner { -public: - really_inline json_block next(const simd::simd8x64<uint8_t> in); - really_inline error_code finish(bool streaming); + class json_scanner + { + public: + json_scanner() {} + really_inline json_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); -private: - // Whether the last character of the previous iteration is part of a scalar token - // (anything except whitespace or an operator). - uint64_t prev_scalar = 0ULL; - json_string_scanner string_scanner; -}; + private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or an operator). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; + }; + // + // Check if the current character immediately follows a matching character. + // + // For example, this checks for quotes with backslashes in front of them: + // + // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); + // + really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) + { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; + } -// -// Check if the current character immediately follows a matching character. -// -// For example, this checks for quotes with backslashes in front of them: -// -// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); -// -really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { - const uint64_t result = match << 1 | overflow; - overflow = match >> 63; - return result; -} + // + // Check if the current character follows a matching character, with possible "filler" between. + // For example, this checks for empty curly braces, e.g. + // + // in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } + // + really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) + { + uint64_t follows_match = follows(match, overflow); + uint64_t result; + overflow |= uint64_t(add_overflow(follows_match, filler, &result)); + return result; + } -// -// Check if the current character follows a matching character, with possible "filler" between. -// For example, this checks for empty curly braces, e.g. 
-// -// in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } -// -really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) { - uint64_t follows_match = follows(match, overflow); - uint64_t result; - overflow |= uint64_t(add_overflow(follows_match, filler, &result)); - return result; -} + really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) + { + json_string_block strings = string_scanner.next(in); + json_character_block characters = json_character_block::classify(in); + uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); + return { + strings, + characters, + follows_scalar}; + } -really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) { - json_string_block strings = string_scanner.next(in); - json_character_block characters = json_character_block::classify(in); - uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); - return { - strings, - characters, - follows_scalar - }; -} + really_inline error_code json_scanner::finish(bool streaming) + { + return string_scanner.finish(streaming); + } -really_inline error_code json_scanner::finish(bool streaming) { - return string_scanner.finish(streaming); -} + } // namespace stage1 + /* end file src/generic/stage1/json_scanner.h */ -} // namespace stage1 -/* end file src/generic/json_scanner.h */ + namespace stage1 + { + really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) + { + // On ARM, we don't short-circuit this if there are no backslashes, because the branch gives us no + // benefit and therefore makes things worse. + // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); + } + } // namespace stage1 -/* begin file src/generic/json_minifier.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + /* begin file src/generic/stage1/json_minifier.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -namespace stage1 { + namespace stage1 + { -class json_minifier { -public: - template<size_t STEP_SIZE> - static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + class json_minifier + { + public: + template <size_t STEP_SIZE> + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; -private: - really_inline json_minifier(uint8_t *_dst) : dst{_dst} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block); - really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); - json_scanner scanner; - uint8_t *dst; -}; + private: + really_inline json_minifier(uint8_t *_dst) + : dst{_dst} + { + } + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block); + really_inline error_code 
finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; + }; -really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) { - uint64_t mask = block.whitespace(); - in.compress(mask, dst); - dst += 64 - count_ones(mask); -} + really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) + { + uint64_t mask = block.whitespace(); + in.compress(mask, dst); + dst += 64 - count_ones(mask); + } -really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { - *dst = '\0'; - error_code error = scanner.finish(false); - if (error) { dst_len = 0; return error; } - dst_len = dst - dst_start; - return SUCCESS; -} + really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) + { + *dst = '\0'; + error_code error = scanner.finish(false); + if (error) + { + dst_len = 0; + return error; + } + dst_len = dst - dst_start; + return SUCCESS; + } -template<> -really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - simd::simd8x64<uint8_t> in_2(block_buf+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1); - this->next(in_2, block_2); - reader.advance(); -} + template <> + really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + simd::simd8x64<uint8_t> in_2(block_buf + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); + } -template<> -really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - json_block block_1 = scanner.next(in_1); - this->next(block_buf, block_1); - reader.advance(); -} + template <> + really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); + } -template<size_t STEP_SIZE> -error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { - buf_block_reader<STEP_SIZE> reader(buf, len); - json_minifier minifier(dst); - while (reader.has_full_block()) { - minifier.step<STEP_SIZE>(reader.full_block(), reader); - } + template <size_t STEP_SIZE> + error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept + { + buf_block_reader<STEP_SIZE> reader(buf, len); + json_minifier minifier(dst); - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - minifier.step<STEP_SIZE>(block, reader); - } + // Index the first n-1 blocks + while (reader.has_full_block()) + { + minifier.step<STEP_SIZE>(reader.full_block(), reader); + } - return minifier.finish(dst, dst_len); -} + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) + { + minifier.step<STEP_SIZE>(block, reader); + } -} // namespace stage1 -/* end file src/generic/json_minifier.h */ -WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { - return arm64::stage1::json_minifier::minify<64>(buf, 
len, dst, dst_len); -} + return minifier.finish(dst, dst_len); + } -/* begin file src/generic/utf8_lookup2_algorithm.h */ -// -// Detect Unicode errors. -// -// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic -// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits -// are straight up concatenated into the final value. The first byte of a multibyte character is a -// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte -// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just -// start with 0, because that's what ASCII looks like. Here's what each size looks like: -// -// - ASCII (7 bits): 0_______ -// - 2 byte character (11 bits): 110_____ 10______ -// - 3 byte character (17 bits): 1110____ 10______ 10______ -// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ -// - 5+ byte character (illegal): 11111___ <illegal> -// -// There are 5 classes of error that can happen in Unicode: -// -// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). -// We detect this by looking for new characters (lead bytes) inside the range of a multibyte -// character. -// -// e.g. 11000000 01100001 (2-byte character where second byte is ASCII) -// -// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). -// We detect this by requiring that the next byte after your multibyte character be a new -// character--so a continuation after your character is wrong. -// -// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) -// -// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. -// -// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). -// -// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have -// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is -// technically possible, but UTF-8 disallows it so that there is only one way to write an "a". -// -// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) -// -// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and -// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. -// -// e.g. 11101101 10100000 10000000 (U+D800) -// -// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not -// support values with more than 23 bits (which a 4-byte character supports). -// -// e.g. 
11111000 10100000 10000000 10000000 10000000 (U+800000) -// -// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// -// Code Points 1st 2s 3s 4s -// U+0000..U+007F 00..7F -// U+0080..U+07FF C2..DF 80..BF -// U+0800..U+0FFF E0 A0..BF 80..BF -// U+1000..U+CFFF E1..EC 80..BF 80..BF -// U+D000..U+D7FF ED 80..9F 80..BF -// U+E000..U+FFFF EE..EF 80..BF 80..BF -// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF -// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF -// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -// -using namespace simd; + } // namespace stage1 + /* end file src/generic/stage1/json_minifier.h */ + WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept + { + return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); + } -namespace utf8_validation { + /* begin file src/generic/stage1/find_next_document_index.h */ + /** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ + really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) + { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) + { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) + { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) + { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) + { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; + } - // - // Find special case UTF-8 errors where the character is technically readable (has the right length) - // but the *value* is disallowed. 
- // - // This includes overlong encodings, surrogates and values too large for Unicode. - // - // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the - // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a - // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together. - // If all 3 lookups detect the same error, it's an error. - // - really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { + // Skip the last character if it is partial + really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) + { + if (unlikely(len < 3)) + { + switch (len) + { + case 2: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 1 byte left + if (buf[len - 3] >= 0b11110000) + { + return len - 3; + } // 4-byte characters with only 3 bytes left + return len; + } + /* end file src/generic/stage1/find_next_document_index.h */ + /* begin file src/generic/stage1/utf8_lookup3_algorithm.h */ // - // These are the errors we're going to match for bytes 1-2, by looking at the first three - // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> + // Detect Unicode errors. // - static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) - static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ - static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ - static const int SURROGATE = 0x08; // 11101101 [101_]____ - static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ - static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ + // UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic + // encoding that uses the first few bits on each byte to denote a "byte type", and all other bits + // are straight up concatenated into the final value. The first byte of a multibyte character is a + // "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte + // lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just + // start with 0, because that's what ASCII looks like. Here's what each size looks like: + // + // - ASCII (7 bits): 0_______ + // - 2 byte character (11 bits): 110_____ 10______ + // - 3 byte character (17 bits): 1110____ 10______ 10______ + // - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ + // - 5+ byte character (illegal): 11111___ <illegal> + // + // There are 5 classes of error that can happen in Unicode: + // + // - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). + // We detect this by looking for new characters (lead bytes) inside the range of a multibyte + // character. + // + // e.g. 
11000000 01100001 (2-byte character where second byte is ASCII) + // + // - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). + // We detect this by requiring that the next byte after your multibyte character be a new + // character--so a continuation after your character is wrong. + // + // e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) + // + // - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. + // + // e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). + // + // - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have + // used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is + // technically possible, but UTF-8 disallows it so that there is only one way to write an "a". + // + // e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) + // + // - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and + // WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. + // + // e.g. 11101101 10100000 10000000 (U+D800) + // + // - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not + // support values with more than 23 bits (which a 4-byte character supports). + // + // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) + // + // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: + // + // Code Points 1st 2s 3s 4s + // U+0000..U+007F 00..7F + // U+0080..U+07FF C2..DF 80..BF + // U+0800..U+0FFF E0 A0..BF 80..BF + // U+1000..U+CFFF E1..EC 80..BF 80..BF + // U+D000..U+D7FF ED 80..9F 80..BF + // U+E000..U+FFFF EE..EF 80..BF 80..BF + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + // + using namespace simd; - // After processing the rest of byte 1 (the low bits), we're still not done--we have to check - // byte 2 to be sure which things are errors and which aren't. - // Since high_bits is byte 5, byte 2 is high_bits.prev<3> - static const int CARRY = OVERLONG_2 | TOO_LARGE_2; - const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // Continuations: ________ [10__]____ - CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ - CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____ - // Multibyte Leads: ________ [11__]____ - CARRY, CARRY, CARRY, CARRY - ); + namespace utf8_validation + { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". - const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( - // [0___]____ (ASCII) - 0, 0, 0, 0, - 0, 0, 0, 0, - // [10__]____ (continuation) - 0, 0, 0, 0, - // [11__]____ (2+-byte leads) - OVERLONG_2, 0, // [110_]____ (2-byte lead) - OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead) - OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead) - ); + // + // Find special case UTF-8 errors where the character is technically readable (has the right length) + // but the *value* is disallowed. 
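        // (Illustrative sketch only, assuming an already-decoded code point; it is not
        //  library code. The same "readable but disallowed" rules, written as a plain
        //  scalar check over a code point `cp` that arrived in `len` encoded bytes:
        //
        //      #include <cstdint>
        //      inline bool value_is_disallowed(uint32_t cp, int len) {
        //        if (len == 2 && cp < 0x80)        return true;  // overlong 2-byte form
        //        if (len == 3 && cp < 0x800)       return true;  // overlong 3-byte form
        //        if (len == 4 && cp < 0x10000)     return true;  // overlong 4-byte form
        //        if (cp >= 0xD800 && cp <= 0xDFFF) return true;  // surrogate
        //        if (cp > 0x10FFFF)                return true;  // too large for Unicode
        //        return false;
        //      }
        //
        //  The vectorized code below reaches the same verdict without decoding at all,
        //  using only the first 12 bits of the encoded sequence.)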
+        //
+        // This includes overlong encodings, surrogates and values too large for Unicode.
+        //
+        // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
+        // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does
+        // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together.
+        // If all 3 lookups detect the same error, it's an error.
+        //
+        really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+        {
+          //
+          // These are the errors we're going to match for bytes 1-2, by looking at the first three
+          // nibbles of the character: <high bits of byte 1> & <low bits of byte 1> & <high bits of byte 2>
+          //
+          static const int OVERLONG_2 = 0x01;  // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
+          static const int OVERLONG_3 = 0x02;  // 11100000 100_____ ________
+          static const int OVERLONG_4 = 0x04;  // 11110000 1000____ ________ ________
+          static const int SURROGATE = 0x08;   // 11101101 [101_]____
+          static const int TOO_LARGE = 0x10;   // 11110100 (1001|101_)____
+          static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______

+          // New with lookup3. We want to catch the case where a non-continuation
+          // follows a leading byte
+          static const int TOO_SHORT_2_3_4 = 0x40;     // (110_|1110|1111) ____ (0___|110_|1111) ____
+          // We also want to catch a continuation that is preceded by an ASCII byte
+          static const int LONELY_CONTINUATION = 0x80; // 0___ ____ 01__ ____

+          // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
+          // byte 2 to be sure which things are errors and which aren't. 
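          // (Worked illustration of the three-lookup AND, using a hypothetical input:
          //  for the surrogate U+D800 encoded as ED A0 ..., byte 1 is 0xED and byte 2
          //  starts with the nibble 0xA. The three lookups below return, respectively,
          //    byte_1_high[0xE] -> OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4
          //    byte_1_low[0xD]  -> TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION
          //    byte_2_high[0xA] -> CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION
          //  and their bitwise AND keeps exactly SURROGATE, so the block is flagged.)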
+ // Since high_bits is byte 5, byte 2 is high_bits.prev<3> + static const int CARRY = OVERLONG_2 | TOO_LARGE_2; + const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // Continuations: ________ [10__]____ + CARRY | OVERLONG_3 | OVERLONG_4 | LONELY_CONTINUATION, // ________ [1000]____ + CARRY | OVERLONG_3 | TOO_LARGE | LONELY_CONTINUATION, // ________ [1001]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1010]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1011]____ + // Multibyte Leads: ________ [11__]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, // 110_ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4); + const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( + // [0___]____ (ASCII) + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + // [10__]____ (continuation) + 0, 0, 0, 0, + // [11__]____ (2+-byte leads) + OVERLONG_2 | TOO_SHORT_2_3_4, TOO_SHORT_2_3_4, // [110_]____ (2-byte lead) + OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4, // [1110]____ (3-byte lead) + OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 | TOO_SHORT_2_3_4 // [1111]____ (4+-byte lead) + ); + const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( + // ____[00__] ________ + OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0000] ________ + OVERLONG_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0001] ________ + TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[01__] ________ + TOO_LARGE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0100] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[10__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[11__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[1101] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION); + return byte_1_high & byte_1_low & byte_2_high; + } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. 
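//   (Illustration with a deliberately broken input: in the byte sequence E2 82 61,
//    0xE2 is a 3-byte lead, so the next two positions are expected continuations.
//    0x82 is a continuation, as expected, but 0x61 ('a') is not, so the expected and
//    actual continuation masks disagree at that position and an error is emitted.)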
- // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. - // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. 
- // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? 
Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3 - // ``` - // - // ### Bringing It All Together: Detecting the Errors - // - // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`. - // All we have left to do is check if they match! - // - // ``` - // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation; - // ``` - // - // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in - // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do - // bitwise operations, and we're only using 1! - // - // Epilogue: Addition For Booleans - // ------------------------------- - // - // There is one big case the above code doesn't explicitly talk about--what if is_second_byte - // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next - // to each other (or any combination), and the continuation could be part of either of them! - // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic. - // - // Never fear, though. If that situation occurs, we'll already have detected that the second - // leading byte was an error, because it was supposed to be a part of the preceding multibyte - // character, but it *wasn't a continuation*. - // - // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and - // `&`, which is both interesting and possibly useful (even though we're not using it here). It - // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those - // comparisons were giving us numbers! - // - // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal - // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus, - // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if - // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because - // *any* nonzero value is treated as an error (not just -1), we're just fine here :) - // - // Further, if *more than one* multibyte character overlaps, - // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation` - // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be - // considered an error. - // - // One reason you might want to do this is parallelism. ^ and | are not associative, so - // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. 
- // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // - really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) { - simd8<uint8_t> prev2 = input.prev<2>(prev_input); - simd8<uint8_t> prev3 = input.prev<3>(prev_input); + really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, + simd8<uint8_t> prev1) + { + simd8<uint8_t> prev2 = input.prev<2>(prev_input); + simd8<uint8_t> prev3 = input.prev<3>(prev_input); + // is_2_3_continuation uses one more instruction than lookup2 + simd8<bool> is_2_3_continuation = (simd8<int8_t>(input).max(simd8<int8_t>(prev1))) < int8_t(-64); + // must_be_2_3_continuation has two fewer instructions than lookup 2 + return simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3) ^ is_2_3_continuation); + } - // Cont is 10000000-101111111 (-65...-128) - simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-64); - // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons - return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation); - } + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) + { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1}; + const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); + return input.gt_bits(max_value); + } - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. - // - really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 
1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); - return input.gt_bits(max_value); - } + struct utf8_checker + { + // If this is nonzero, there has been a UTF-8 error. + simd8<uint8_t> error; + // The last input we received + simd8<uint8_t> prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8<uint8_t> prev_incomplete; - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8<uint8_t> error; - // The last input we received - simd8<uint8_t> prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8<uint8_t> prev_incomplete; + // + // Check whether the current bytes are valid UTF-8. + // + really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) + { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8<uint8_t> prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, prev1); + } - // - // Check whether the current bytes are valid UTF-8. - // - really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8<uint8_t> prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, prev1); - } + // The only problem that can happen at EOF is that a multibyte character is too short. + really_inline void check_eof() + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } - // The only problem that can happen at EOF is that a multibyte character is too short. - really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } + really_inline void check_next_input(simd8x64<uint8_t> input) + { + if (likely(is_ascii(input))) + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + else + { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + for (int i = 1; i < simd8x64<uint8_t>::NUM_CHUNKS; i++) + { + this->check_utf8_bytes(input.chunks[i], input.chunks[i - 1]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]; + } + } - really_inline void check_next_input(simd8x64<uint8_t> input) { - if (likely(is_ascii(input))) { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. 
- this->error |= this->prev_incomplete; - } else { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) { - this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]); + really_inline error_code errors() + { + return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]; - } - } - really_inline error_code errors() { - return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; - } + }; // struct utf8_checker + } // namespace utf8_validation - }; // struct utf8_checker -} + using utf8_validation::utf8_checker; + /* end file src/generic/stage1/utf8_lookup3_algorithm.h */ + /* begin file src/generic/stage1/json_structural_indexer.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -using utf8_validation::utf8_checker; -/* end file src/generic/utf8_lookup2_algorithm.h */ -/* begin file src/generic/json_structural_indexer.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + namespace stage1 + { -namespace stage1 { + class bit_indexer + { + public: + uint32_t *tail; -class bit_indexer { -public: - uint32_t *tail; + really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} - really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + really_inline void write(uint32_t idx, uint64_t bits) + { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + int cnt = static_cast<int>(count_ones(bits)); - // flatten out values in 'bits' assuming that they are are to have values of idx - // plus their position in the bitvector, and store these indexes at - // base_ptr[base] incrementing base as we go - // will potentially store extra values beyond end of valid bits, so base_ptr - // needs to be large enough to handle this - really_inline void write(uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = count_ones(bits); + // Do the first 8 all together + for (int i = 0; i < 8; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } - // Do the first 8 all together - for (int i=0; i<8; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). 
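          // (Illustration only: for idx = 64 and bits = 0b10010001, cnt is 3; the loop
          //  above stores 64, 68 and 71 into tail[0..2], and whatever the surplus
          //  iterations wrote into tail[3..7] is simply overwritten later, because
          //  tail only advances by cnt at the end of this function.)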
+ if (unlikely(cnt > 8)) + { + for (int i = 8; i < 16; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } - // Do the next 8 all together (we hope in most cases it won't happen at all - // and the branch is easily predicted). - if (unlikely(cnt > 8)) { - for (int i=8; i<16; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - } + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (unlikely(cnt > 16)) + { + int i = 16; + do + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } - // Most files don't have 16+ structurals per block, so we take several basically guaranteed - // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) - // or the start of a value ("abc" true 123) every four characters. - if (unlikely(cnt > 16)) { - uint32_t i = 16; - do { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - i++; - } while (i < cnt); - } - } + this->tail += cnt; + } + }; - this->tail += cnt; - } -}; + class json_structural_indexer + { + public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template <size_t STEP_SIZE> + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; -class json_structural_indexer { -public: - template<size_t STEP_SIZE> - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + private: + really_inline json_structural_indexer(uint32_t *structural_indexes); + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); -private: - really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; + }; - json_scanner scanner; - utf8_checker checker{}; - bit_indexer indexer; - uint64_t prev_structurals = 0; - uint64_t unescaped_chars_error = 0; -}; + really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} -really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(idx-64, prev_structurals); // 
Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} + // + // PERF NOTES: + // We pipe 2 inputs through these stages: + // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load + // 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. + // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. + // The output of step 1 depends entirely on this information. These functions don't quite use + // up enough CPU: the second half of the functions is highly serial, only using 1 execution core + // at a time. The second input's scans has some dependency on the first ones finishing it, but + // they can make a lot of progress before they need that information. + // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that + // to finish: utf-8 checks and generating the output from the last iteration. + // + // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all + // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough + // workout. + // + template <size_t STEP_SIZE> + error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept + { + if (unlikely(len > parser.capacity())) + { + return CAPACITY; + } + if (partial) + { + len = trim_partial_utf8(buf, len); + } -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals - indexer.write(idx-64, prev_structurals); + buf_block_reader<STEP_SIZE> reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } + // Read all but the last block + while (reader.has_full_block()) + { + indexer.step<STEP_SIZE>(reader.full_block(), reader); + } - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) + { + return EMPTY; + } + indexer.step<STEP_SIZE>(block, reader); - parser.n_structural_indexes = indexer.tail - parser.structural_indexes.get(); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. 
*/ - parser.structural_indexes[parser.n_structural_indexes++] = len; - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); -} + return indexer.finish(parser, reader.block_index(), len, partial); + } -template<> -really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - simd::simd8x64<uint8_t> in_2(block+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1, reader.block_index()); - this->next(in_2, block_2, reader.block_index()+64); - reader.advance(); -} + template <> + really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + simd::simd8x64<uint8_t> in_2(block + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index() + 64); + reader.advance(); + } -template<> -really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - json_block block_1 = scanner.next(in_1); - this->next(in_1, block_1, reader.block_index()); - reader.advance(); -} + template <> + really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); + } -// -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. 
-template<size_t STEP_SIZE> -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } + really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) + { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx - 64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); + } - buf_block_reader<STEP_SIZE> reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); - while (reader.has_full_block()) { - indexer.step<STEP_SIZE>(reader.full_block(), reader); - } + really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) + { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx - 64), prev_structurals); - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step<STEP_SIZE>(block, reader); - } + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) + { + return error; + } - return indexer.finish(parser, reader.block_index(), len, streaming); -} + if (unescaped_chars_error) + { + return UNESCAPED_CHARS; + } -} // namespace stage1 -/* end file src/generic/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); -} + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. 
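           * (Concretely: the three index slots written just below hold len, len and 0,
           * so parsing that continues beyond the last valid structural character lands
           * on the padding byte at offset 'len', and then on offset 0, and stops
           * quickly instead of walking off the end of the index array.)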
+ **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) + { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) + { + return UNEXPECTED_ERROR; + } + if (partial) + { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) + { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); + } -} // namespace simdjson::arm64 + } // namespace stage1 + /* end file src/generic/stage1/json_structural_indexer.h */ + WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept + { + this->buf = _buf; + this->len = _len; + return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); + } + /* begin file src/generic/stage1/utf8_validator.h */ + namespace stage1 + { + /** + * Validates that the string is actual UTF-8. + */ + template <class checker> + bool generic_validate_utf8(const uint8_t *input, size_t length) + { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) + { + simd::simd8x64<uint8_t> in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64<uint8_t> in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; + } -#endif // SIMDJSON_ARM64_STAGE1_FIND_MARKS_H -/* end file src/generic/json_structural_indexer.h */ -#endif -#if SIMDJSON_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/stage1_find_marks.h */ -#ifndef SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H -#define SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H + bool generic_validate_utf8(const char *input, size_t length) + { + return generic_validate_utf8<utf8_checker>((const uint8_t *)input, length); + } -/* fallback/implementation.h already included: #include "fallback/implementation.h" */ + } // namespace stage1 + /* end file src/generic/stage1/utf8_validator.h */ + WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept + { + return simdjson::arm64::stage1::generic_validate_utf8(buf, len); + } + } // namespace arm64 +} // namespace simdjson -namespace simdjson::fallback::stage1 { +// +// Stage 2 +// -class structural_scanner { -public: +/* begin file src/arm64/stringparsing.h */ +#ifndef SIMDJSON_ARM64_STRINGPARSING_H +#define SIMDJSON_ARM64_STRINGPARSING_H -really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming) - : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {} +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ +/* arm64/simd.h already included: #include "arm64/simd.h" */ +/* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ +/* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ -really_inline void add_structural() { - *next_structural_index = 
idx; - next_structural_index++; -} +namespace simdjson +{ + namespace arm64 + { -really_inline bool is_continuation(uint8_t c) { - return (c & 0b11000000) == 0b10000000; -} + using namespace simd; -really_inline void validate_utf8_character() { - // Continuation - if (unlikely((buf[idx] & 0b01000000) == 0)) { - // extra continuation - error = UTF8_ERROR; - idx++; - return; - } + // Holds backslashes and quotes locations. + struct backslash_and_quote + { + public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); - // 2-byte - if ((buf[idx] & 0b00100000) == 0) { - // missing continuation - if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; } - // overlong: 1100000_ 10______ - if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; } - idx += 2; - return; - } + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return bs_bits != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } - // 3-byte - if ((buf[idx] & 0b00010000) == 0) { - // missing continuation - if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; } - // overlong: 11100000 100_____ ________ - if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; } - // surrogates: U+D800-U+DFFF 11101101 101_____ - if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; } - idx += 3; - return; - } + uint32_t bs_bits; + uint32_t quote_bits; + }; // struct backslash_and_quote - // 4-byte - // missing continuation - if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; } - // overlong: 11110000 1000____ ________ ________ - if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; } - // too large: > U+10FFFF: - // 11110100 (1001|101_)____ - // 1111(1___|011_|0101) 10______ - // also includes 5, 6, 7 and 8 byte characters: - // 11111___ - if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; } - if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; } - idx += 4; -} + really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) + { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8<uint8_t> v0(src); + simd8<uint8_t> v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); -really_inline void validate_string() { - idx++; // skip first quote - while (idx < len && buf[idx] != '"') { - if (buf[idx] == '\\') { - idx += 2; - } else if (unlikely(buf[idx] & 0b10000000)) { - validate_utf8_character(); - } else { - if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; } - idx++; + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. 
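      // Concretely, the four comparisons below land in the 64-bit mask like this:
      //   bits  0..31  ->  '\' matches in the 32 copied bytes (v0 then v1)
      //   bits 32..63  ->  '"' matches in the same 32 bytes
      // so splitting the mask in half yields bs_bits and quote_bits directly.
      // For example, if the copied bytes begin with  a b \ " ...  then
      // bs_bits = 0b0100 and quote_bits = 0b1000: has_quote_first() is false
      // (the backslash comes first) and backslash_index() is 2.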
+ uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; } - } - if (idx >= len && !streaming) { error = UNCLOSED_STRING; } -} -really_inline bool is_whitespace_or_operator(uint8_t c) { - switch (c) { - case '{': case '}': case '[': case ']': case ',': case ':': - case ' ': case '\r': case '\n': case '\t': - return true; - default: - return false; - } -} + /* begin file src/generic/stage2/stringparsing.h */ + // This file contains the common code every implementation uses + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "stringparsing.h" (this simplifies amalgation) -// -// Parse the entire input in STEP_SIZE-byte chunks. -// -really_inline error_code scan() { - for (;idx<len;idx++) { - switch (buf[idx]) { - // String - case '"': - add_structural(); - validate_string(); - break; - // Operator - case '{': case '}': case '[': case ']': case ',': case ':': - add_structural(); - break; - // Whitespace - case ' ': case '\r': case '\n': case '\t': - break; - // Primitive or invalid character (invalid characters will be checked in stage 2) - default: - // Anything else, add the structural and go until we find the next one - add_structural(); - while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) { - idx++; - }; - break; - } - } - if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) { - return EMPTY; - } - *next_structural_index = len; - next_structural_index++; - doc_parser.n_structural_indexes = next_structural_index - doc_parser.structural_indexes.get(); - return error; -} + namespace stage2 + { + namespace stringparsing + { -private: - const uint8_t *buf; - uint32_t *next_structural_index; - parser &doc_parser; - uint32_t idx; - uint32_t len; - error_code error; - bool streaming; -}; // structural_scanner + // begin copypasta + // These chars yield themselves: " \ / + // b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab + // u not handled in this table as it's complex + static const uint8_t escape_map[256] = { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x0. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x22, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x2f, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, -} // simdjson::fallback::stage1 + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x4. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x5c, + 0, + 0, + 0, // 0x5. + 0, + 0, + 0x08, + 0, + 0, + 0, + 0x0c, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x0a, + 0, // 0x6. + 0, + 0, + 0x0d, + 0, + 0x09, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x7. 
-namespace simdjson::fallback { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - if (unlikely(len > parser.capacity())) { - return CAPACITY; - } - stage1::structural_scanner scanner(buf, len, parser, streaming); - return scanner.scan(); -} + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; -// big table for the minifier -static uint8_t jump_table[256 * 3] = { - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, - 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, -}; + // handle a unicode codepoint + // write appropriate values into dest + // src will advance 6 bytes or 12 bytes + // dest will advance a variable 
amount (return via pointer) + // return true if the unicode codepoint was valid + // We work in little-endian then swap at write time + WARN_UNUSED + really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) + { + // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + // check for low surrogate for characters outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) + { + if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') + { + return false; + } + uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); -WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { - size_t i = 0, pos = 0; - uint8_t quote = 0; - uint8_t nonescape = 1; + // if the first code point is invalid we will get here, as we will go past + // the check for being outside the Basic Multilingual plane. If we don't + // find a \u immediately afterwards we fail out anyhow, but if we do, + // this check catches both the case of the first code point being invalid + // or the second code point being invalid. + if ((code_point | code_point_2) >> 16) + { + return false; + } - while (i < len) { - unsigned char c = buf[i]; - uint8_t *meta = jump_table + 3 * c; + code_point = + (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; + *src_ptr += 6; + } + size_t offset = codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; + } - quote = quote ^ (meta[0] & nonescape); - dst[pos] = c; - pos += meta[2] | quote; + WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) + { + src++; + while (1) + { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) + { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) + { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') + { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) + { + return nullptr; + } + } + else + { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) + { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } + else + { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. 
*/ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; + } - i += 1; - nonescape = (~nonescape) | (meta[1]); - } - dst_len = pos; // we intentionally do not work with a reference - // for fear of aliasing - return SUCCESS; -} + } // namespace stringparsing + } // namespace stage2 + /* end file src/generic/stage2/stringparsing.h */ -} // namespace simdjson::fallback + } // namespace arm64 +} // namespace simdjson -#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H -/* end file src/fallback/stage1_find_marks.h */ +#endif // SIMDJSON_ARM64_STRINGPARSING_H +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/arm64/numberparsing.h */ +#ifndef SIMDJSON_ARM64_NUMBERPARSING_H +#define SIMDJSON_ARM64_NUMBERPARSING_H + +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ +/* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ +/* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ +#include <cmath> +#include <limits> + +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); #endif -#if SIMDJSON_IMPLEMENTATION_HASWELL -/* begin file src/haswell/stage1_find_marks.h */ -#ifndef SIMDJSON_HASWELL_STAGE1_FIND_MARKS_H -#define SIMDJSON_HASWELL_STAGE1_FIND_MARKS_H +namespace simdjson +{ + namespace arm64 + { -/* begin file src/haswell/bitmask.h */ -#ifndef SIMDJSON_HASWELL_BITMASK_H -#define SIMDJSON_HASWELL_BITMASK_H + // we don't have SSE, so let us use a scalar function + // credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ + static inline uint32_t parse_eight_digits_unrolled(const char *chars) + { + uint64_t val; + memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); + } +#define SWAR_NUMBER_PARSING -/* begin file src/haswell/intrinsics.h */ -#ifndef SIMDJSON_HASWELL_INTRINSICS_H -#define SIMDJSON_HASWELL_INTRINSICS_H + /* begin file src/generic/stage2/numberparsing.h */ + namespace stage2 + { + namespace numberparsing + { +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE)) +#endif -#ifdef _MSC_VER -#include <intrin.h> // visual studio + // Attempts to compute i * 10^(power) exactly; and if "negative" is + // true, negate the result. + // This function will only work in some cases, when it does not work, success is + // set to false. This should work *most of the time* (like 99% of the time). + // We assume that power is in the [FASTFLOAT_SMALLEST_POWER, + // FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. 
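      // A minimal standalone illustration of the exact fast path that the function
      // below tries first (a simplified sketch in plain C++, not the library routine;
      // it ignores the sign handling and the FLT_EVAL_METHOD caveats): when
      // |power| <= 22 and i < 2^53, both i and 10^|power| are exactly representable
      // in binary64, so a single multiplication or division is correctly rounded.
      // For example, "2.5" arrives as i = 25 and power = -1, and 25.0 / 10.0 is
      // exactly 2.5.
      #include <cstdint>

      static inline bool fast_path_sketch(std::int64_t power, std::uint64_t i, double &out) {
        static const double pow10[] = {
            1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
            1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
        if (power < -22 || power > 22 || i > 9007199254740991ULL) {
          return false;                   // out of range: caller must use a slower path
        }
        double d = double(i);             // exact, because i < 2^53
        out = (power < 0) ? d / pow10[-power] : d * pow10[power];
        return true;                      // one correctly rounded operation
      }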
+ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) + { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) + { #else -#include <x86intrin.h> // elsewhere -#endif // _MSC_VER + if (-22 <= power && power <= 22 && i <= 9007199254740991) + { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + double d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) + { + d = d / power_of_ten[-power]; + } + else + { + d = d * power_of_ten[power]; + } + if (negative) + { + d = -d; + } + *success = true; + return d; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It wa + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html -#endif // SIMDJSON_HASWELL_INTRINSICS_H -/* end file src/haswell/intrinsics.h */ + // The fast path has now failed, so we are failing back on the slower path. -TARGET_HASWELL -namespace simdjson::haswell { + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if (i == 0) + { + return 0.0; + } -// -// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. -// -// For example, prefix_xor(00100100) == 00011100 -// -really_inline uint64_t prefix_xor(const uint64_t bitmask) { - // There should be no such thing with a processor supporting avx2 - // but not clmul. - __m128i all_ones = _mm_set1_epi8('\xFF'); - __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); - return _mm_cvtsi128_si64(result); -} + // We are going to need to do some 64-bit arithmetic to get a more precise product. + // We use a table lookup approach. + components c = + power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; + // safe because + // power >= FASTFLOAT_SMALLEST_POWER + // and power <= FASTFLOAT_LARGEST_POWER + // we recover the mantissa of the power, it has a leading 1. 
It is always + // rounded down. + uint64_t factor_mantissa = c.mantissa; -} // namespace simdjson::haswell -UNTARGET_REGION + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + value128 product = full_multiplication(i, factor_mantissa); + uint64_t lower = product.low; + uint64_t upper = product.high; -#endif // SIMDJSON_HASWELL_BITMASK_H -/* end file src/haswell/intrinsics.h */ -/* begin file src/haswell/simd.h */ -#ifndef SIMDJSON_HASWELL_SIMD_H -#define SIMDJSON_HASWELL_SIMD_H + // We know that upper has at most one leading zero because + // both i and factor_mantissa have a leading one. This means + // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). -/* simdprune_tables.h already included: #include "simdprune_tables.h" */ -/* begin file src/haswell/bitmanipulation.h */ -#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H -#define SIMDJSON_HASWELL_BITMANIPULATION_H + // As long as the first 9 bits of "upper" are not "1", then we + // know that we have an exact computed value for the leading + // 55 bits because any imprecision would play out as a +1, in + // the worst case. + if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) + { + uint64_t factor_mantissa_low = + mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; + // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit + // result (three 64-bit values) + product = full_multiplication(i, factor_mantissa_low); + uint64_t product_low = product.low; + uint64_t product_middle2 = product.high; + uint64_t product_middle1 = lower; + uint64_t product_high = upper; + uint64_t product_middle = product_middle1 + product_middle2; + if (product_middle < product_middle1) + { + product_high++; // overflow carry + } + // We want to check whether mantissa *i + i would affect our result. + // This does happen, e.g. with 7.3177701707893310e+15. + if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && + (product_low + i < product_low))) + { // let us be prudent and bail out. + *success = false; + return 0; + } + upper = product_high; + lower = product_middle; + } + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + // Here we have mantissa < (1<<54). -/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && + ((mantissa & 3) == 1))) + { + // if mantissa & 1 == 1 we might need to round up. + // + // Scenarios: + // 1. We are not in the middle. Then we should round up. + // + // 2. We are right in the middle. Whether we round up depends + // on the last significant bit: if it is "one" then we round + // up (round to even) otherwise, we do not. + // + // So if the last significant bit is 1, we can safely round up. + // Hence we only need to bail out if (mantissa & 3) == 1. + // Otherwise we may need more accuracy or analysis to determine whether + // we are exactly between two floating-point numbers. 
+ // It can be triggered with 1e23. + // Note: because the factor_mantissa and factor_mantissa_low are + // almost always rounded down (except for small positive powers), + // almost always should round up. + *success = false; + return 0; + } -TARGET_HASWELL -namespace simdjson::haswell { + mantissa += mantissa & 1; + mantissa >>= 1; -#ifndef _MSC_VER -// We sometimes call trailing_zero on inputs that are zero, -// but the algorithms do not end up using the returned value. -// Sadly, sanitizers are not smart enough to figure it out. -__attribute__((no_sanitize("undefined"))) // this is deliberate -#endif -really_inline int trailing_zeroes(uint64_t input_num) { -#ifdef _MSC_VER - return (int)_tzcnt_u64(input_num); -#else - //////// - // You might expect the next line to be equivalent to - // return (int)_tzcnt_u64(input_num); - // but the generated code differs and might be less efficient? - //////// - return __builtin_ctzll(input_num); -#endif// _MSC_VER -} + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) + { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + lz--; // undo previous addition + } + mantissa &= ~(1ULL << 52); + uint64_t real_exponent = c.exp - lz; + // we have to check that real_exponent is in range, otherwise we bail out + if (unlikely((real_exponent < 1) || (real_exponent > 2046))) + { + *success = false; + return 0; + } + mantissa |= real_exponent << 52; + mantissa |= (((uint64_t)negative) << 63); + double d; + memcpy(&d, &mantissa, sizeof(d)); + *success = true; + return d; + } // namespace numberparsing -/* result might be undefined when input_num is zero */ -really_inline uint64_t clear_lowest_bit(uint64_t input_num) { - return _blsr_u64(input_num); -} + static bool parse_float_strtod(const char *ptr, double *outDouble) + { + char *endptr; + *outDouble = strtod(ptr, &endptr); + // Some libraries will set errno = ERANGE when the value is subnormal, + // yet we may want to be able to parse subnormal values. + // However, we do not want to tolerate NAN or infinite values. + // + // Values like infinity or NaN are not allowed in the JSON specification. + // If you consume a large value and you map it to "infinity", you will no + // longer be able to serialize back a standard-compliant JSON. And there is + // no realistic application where you might need values so large than they + // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // 10^308 It is an unimaginable large number. There will never be any piece of + // engineering involving as many as 10^308 parts. It is estimated that there + // are about 10^80 atoms in the universe. The estimate for the total number + // of electrons is similar. Using a double-precision floating-point value, we + // can represent easily the number of atoms in the universe. We could also + // represent the number of ways you can pick any three individual atoms at + // random in the universe. If you ever encounter a number much larger than + // 10^308, you know that you have a bug. RapidJSON will reject a document with + // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) + // will flat out throw an exception. 
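        // For example, the text "1e309" does not fit in binary64: strtod() returns
        // an infinity (HUGE_VAL) on overflow, std::isfinite() then reports false,
        // and the check below rejects the number instead of storing "infinity".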
+ // + if ((endptr == ptr) || (!std::isfinite(*outDouble))) + { + return false; + } + return true; + } -/* result might be undefined when input_num is zero */ -really_inline int leading_zeroes(uint64_t input_num) { - return static_cast<int>(_lzcnt_u64(input_num)); -} + really_inline bool is_integer(char c) + { + return (c >= '0' && c <= '9'); + // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers + } -really_inline int count_ones(uint64_t input_num) { -#ifdef _MSC_VER - // note: we do not support legacy 32-bit Windows - return __popcnt64(input_num);// Visual Studio wants two underscores -#else - return _popcnt64(input_num); -#endif -} + // check quickly whether the next 8 chars are made of digits + // at a glance, it looks better than Mula's + // http://0x80.pl/articles/swar-digits-validate.html + really_inline bool is_made_of_eight_digits_fast(const char *chars) + { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); + } -really_inline bool add_overflow(uint64_t value1, uint64_t value2, - uint64_t *result) { -#ifdef _MSC_VER - return _addcarry_u64(0, value1, value2, - reinterpret_cast<unsigned __int64 *>(result)); -#else - return __builtin_uaddll_overflow(value1, value2, - (unsigned long long *)result); -#endif -} + template <typename W> + bool slow_float_parsing(UNUSED const char *src, W writer) + { + double d; + if (parse_float_strtod(src, &d)) + { + WRITE_DOUBLE(d, (const uint8_t *)src, writer); + return true; + } + return INVALID_NUMBER((const uint8_t *)src); + } -#ifdef _MSC_VER -#pragma intrinsic(_umul128) + really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) + { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const char *const first_after_period = p; + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // There must be at least one digit after the . + + unsigned char digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // might overflow + multiplication by 10 is likely + // cheaper than arbitrary mult. + // we will handle the overflow later +#ifdef SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. 
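        // Worked example of the SWAR pair used here: for the next eight bytes
        // "12345678", is_made_of_eight_digits_fast() accepts because every byte has
        // high nibble 3 and adding 0x06 never carries into the high nibble (which is
        // what rules out ':' .. '?', i.e. 0x3A..0x3F). parse_eight_digits_unrolled()
        // then folds neighbouring digits in three multiply-and-shift steps:
        // pairs (12, 34, 56, 78), then (1234, 5678), then 12345678 -- so eight
        // decimal places are absorbed in a single pass.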
+ if (is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } #endif -really_inline bool mul_overflow(uint64_t value1, uint64_t value2, - uint64_t *result) { -#ifdef _MSC_VER - uint64_t high; - *result = _umul128(value1, value2, &high); - return high; -#else - return __builtin_umulll_overflow(value1, value2, - (unsigned long long *)result); -#endif -} -}// namespace simdjson::haswell -UNTARGET_REGION + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + // because we have parse_highprecision_float later. + } + exponent = first_after_period - p; + return true; + } -#endif // SIMDJSON_HASWELL_BITMANIPULATION_H -/* end file src/haswell/bitmanipulation.h */ -/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ + really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) + { + bool neg_exp = false; + if ('-' == *p) + { + neg_exp = true; + ++p; + } + else if ('+' == *p) + { + ++p; + } -TARGET_HASWELL -namespace simdjson::haswell::simd { + // e[+-] must be followed by a number + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + unsigned char digit = static_cast<unsigned char>(*p - '0'); + int64_t exp_number = digit; + p++; + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + while (is_integer(*p)) + { + // we need to check for overflows; we refuse to parse this + if (exp_number > 0x100000000) + { + return INVALID_NUMBER(src); + } + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + exponent += (neg_exp ? -exp_number : exp_number); + return true; + } - // Forward-declared so they can be used by splat and friends. - template<typename Child> - struct base { - __m256i value; + template <typename W> + really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char *start_digits, int digit_count, int64_t exponent, W &writer) + { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // digit count is off by 1 because of the decimal (assuming there was one). + if (unlikely((digit_count - 1 >= 19))) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const char *start = start_digits; + while ((*start == '0') || (*start == '.')) + { + start++; + } + // we over-decrement by one when there is a '.' + digit_count -= int(start - start_digits); + if (digit_count >= 19) + { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
+ // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; + } + } + // NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) + { + // this is almost never going to get called!!! + // we start anew, going slowly!!! + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; + } + bool success = true; + double d = compute_float_64(exponent, i, negative, &success); + if (!success) + { + // we are almost never going to get here. + if (!parse_float_strtod((const char *)src, &d)) + { + return INVALID_NUMBER(src); + } + } + WRITE_DOUBLE(d, src, writer); + return true; + } - // Zero constructor - really_inline base() : value{__m256i()} {} + // parse the number at src + // define JSON_TEST_NUMBERS for unit testing + // + // It is assumed that the number is followed by a structural ({,},],[) character + // or a white space character. If that is not the case (e.g., when the JSON + // document is made of a single number), then it is necessary to copy the + // content and append a space before calling this function. + // + // Our objective is accurate parsing (ULP of 0) at high speed. + template <typename W> + really_inline bool parse_number(UNUSED const uint8_t *const src, + UNUSED bool found_minus, + W &writer) + { +#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes \ + // useful to skip parsing + writer.append_s64(0); // always write zero + return true; // always succeeds +#else + const char *p = reinterpret_cast<const char *>(src); + bool negative = false; + if (found_minus) + { + ++p; + negative = true; + // a negative sign must be followed by an integer + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + } + const char *const start_digits = p; - // Conversion from SIMD register - really_inline base(const __m256i _value) : value(_value) {} + uint64_t i; // an unsigned int avoids signed overflows (which are bad) + if (*p == '0') + { + ++p; + if (is_integer(*p)) + { + return INVALID_NUMBER(src); + } // 0 cannot be followed by an integer + i = 0; + } + else + { + // NOTE: This is a redundant check--either we're negative, in which case we checked whether this + // is a digit above, or the caller already determined we start with a digit. 
But removing this + // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448 + // Please do try yourself, or think of ways to explain it--we'd love to understand :) + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // must start with an integer + unsigned char digit = static_cast<unsigned char>(*p - '0'); + i = digit; + p++; + // the is_made_of_eight_digits_fast routine is unlikely to help here because + // we rarely see large integer parts like 123456789 + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + } - // Conversion to SIMD register - really_inline operator const __m256i&() const { return this->value; } - really_inline operator __m256i&() { return this->value; } + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) + { + is_float = true; + ++p; + if (!parse_decimal(src, p, i, exponent)) + { + return false; + } + } + int digit_count = int(p - start_digits); // used later to guard against overflows + if (('e' == *p) || ('E' == *p)) + { + is_float = true; + ++p; + if (!parse_exponent(src, p, exponent)) + { + return false; + } + } + if (is_float) + { + return write_float(src, negative, i, start_digits, digit_count, exponent, writer); + } - // Bit operations - really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } - really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } - really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } - really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } - really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } - really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } - really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } - }; + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + int longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) + { + return INVALID_NUMBER(src); + } + if (digit_count == longest_digit_count) + { + // Anything negative above INT64_MAX is either invalid or INT64_MIN. + if (negative && i > uint64_t(INT64_MAX)) + { + // If the number is negative and can't fit in a signed integer, it's invalid. + if (i > uint64_t(INT64_MAX) + 1) + { + return INVALID_NUMBER(src); + } - // Forward-declared so they can be used by splat and friends. - template<typename T> - struct simd8; + // If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN). + // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it. 
+ WRITE_INTEGER(INT64_MIN, src, writer); + return is_structural_or_whitespace(*p); + } - template<typename T, typename Mask=simd8<bool>> - struct base8: base<simd8<T>> { - typedef uint32_t bitmask_t; - typedef uint64_t bitmask2_t; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) + { + return INVALID_NUMBER(src); + } + } - really_inline base8() : base<simd8<T>>() {} - really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {} + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) + { + WRITE_UNSIGNED(i, src, writer); + } + else + { + WRITE_INTEGER(negative ? 0 - i : i, src, writer); + } + return is_structural_or_whitespace(*p); - really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); } +#endif // SIMDJSON_SKIPNUMBERPARSING + } - static const int SIZE = sizeof(base<T>::value); + } // namespace numberparsing + } // namespace stage2 + /* end file src/generic/stage2/numberparsing.h */ - template<int N=1> - really_inline simd8<T> prev(const simd8<T> prev_chunk) const { - return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); - } - }; + } // namespace arm64 +} // namespace simdjson - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8<bool>: base8<bool> { - static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); } +#endif // SIMDJSON_ARM64_NUMBERPARSING_H +/* end file src/generic/stage2/numberparsing.h */ - really_inline simd8<bool>() : base8() {} - really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {} - // Splat constructor - really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} +namespace simdjson +{ + namespace arm64 + { - really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } - really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } - really_inline simd8<bool> operator~() const { return *this ^ true; } - }; + /* begin file src/generic/stage2/logger.h */ + // This is for an internal-only stage 2 specific logger. + // Set LOG_ENABLED = true to log what stage 2 is doing! 
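    // A note on usage: with LOG_ENABLED flipped to true below, log_start() prints a
    // table header (Event | Buffer | Curr | Next | Next# | Tape# | Detail | index)
    // and each log_line() call appends one row for the structural character being
    // handled, indented by log_depth for nested containers. Because the flag is a
    // compile-time constant, the logging branches compile down to nothing when it
    // is false.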
+ namespace logger + { + static constexpr const char *DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - template<typename T> - struct base8_numeric: base8<T> { - static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); } - static really_inline simd8<T> zero() { return _mm256_setzero_si256(); } - static really_inline simd8<T> load(const T values[32]) { - return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values)); - } - // Repeat 16 values as many times as necessary (usually for lookup tables) - static really_inline simd8<T> repeat_16( - T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 - ) { - return simd8<T>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; - really_inline base8_numeric() : base8<T>() {} - really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {} + static int log_depth; // Not threadsafe. Log only. - // Store to array - really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) + { + if (c >= 0x20) + { + return c; + } + else + { + return ' '; + } + } - // Addition/subtraction are the same for signed and unsigned - really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); } - really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); } - really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *(simd8<T>*)this; } - really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *(simd8<T>*)this; } + // Print the header and set up log_start + static really_inline void log_start() + { + if (LOG_ENABLED) + { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN + 2, DASHES, LOG_BUFFER_LEN + 2, DASHES, 4 + 2, DASHES, 4 + 2, DASHES, 5 + 2, DASHES, 5 + 2, DASHES, LOG_DETAIL_LEN + 2, DASHES, LOG_INDEX_LEN + 2, DASHES); + } + } - // Override to distinguish from bool version - really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } + static really_inline void log_string(const char *message) + { + if (LOG_ENABLED) + { + printf("%s\n", message); + } + } - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template<typename L> - really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { - return _mm256_shuffle_epi8(lookup_table, *this); - } + // Logs a single line of + template <typename S> + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const 
char *detail) + { + if (LOG_ENABLED) + { + printf("| %*s%s%-*s ", log_depth * 2, "", title_prefix, LOG_EVENT_LEN - log_depth * 2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i = 0; i < LOG_BUFFER_LEN; i++) + { + printf("%c", printable_char(structurals.current()[i])); + } + printf(" "); + } + printf("| %c ", printable_char(structurals.current_char())); + printf("| %c ", printable_char(structurals.peek_next_char())); + printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural + 1)]); + printf("| %5u ", structurals.next_tape_index()); + printf("| %-*s ", LOG_DETAIL_LEN, detail); + printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural); + printf("|\n"); + } + } + } // namespace logger - // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). - // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes - // get written. - // Design consideration: it seems like a function with the - // signature simd8<L> compress(uint32_t mask) would be - // sensible, but the AVX ISA makes this kind of approach difficult. - template<typename L> - really_inline void compress(uint32_t mask, L * output) const { - // this particular implementation was inspired by work done by @animetosho - // we do it in four steps, first 8 bytes and then second 8 bytes... - uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits - uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // second least significant 8 bits - uint8_t mask3 = static_cast<uint8_t>(mask >> 16); // ... - uint8_t mask4 = static_cast<uint8_t>(mask >> 24); // ... - // next line just loads the 64-bit values thintable_epi8[mask1] and - // thintable_epi8[mask2] into a 128-bit register, using only - // two instructions on most compilers. - __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3], - thintable_epi8[mask2], thintable_epi8[mask1]); - // we increment by 0x08 the second half of the mask and so forth - shufmask = - _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818, - 0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0)); - // this is the version "nearly pruned" - __m256i pruned = _mm256_shuffle_epi8(*this, shufmask); - // we still need to put the pieces back together. - // we compute the popcount of the first words: - int pop1 = BitsSetTable256mul2[mask1]; - int pop3 = BitsSetTable256mul2[mask3]; + /* end file src/generic/stage2/logger.h */ + /* begin file src/generic/stage2/atomparsing.h */ + namespace stage2 + { + namespace atomparsing + { - // then load the corresponding mask - // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic. - __m256i v256 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8))); - __m256i compactmask = _mm256_insertf128_si256(v256, - _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1); - __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask); - // We just need to write out the result. - // This is the tricky bit that is hard to do - // if we want to return a SIMD register, since there - // is no single-instruction approach to recombine - // the two 128-bit lanes with an offset. 
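    // The two overlapping stores below are the workaround: the low 128-bit lane is
    // written at 'output', and the high lane is written at
    // output + 16 - count_ones(mask & 0xFFFF), i.e. right after the bytes that were
    // actually kept from the low lane. If, say, 5 of the first 16 bytes were pruned,
    // the second store starts at output + 11 and overwrites the 5 junk bytes.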
- __m128i v128; - v128 = _mm256_castsi256_si128(almostthere); - _mm_storeu_si128( (__m128i *)output, v128); - v128 = _mm256_extractf128_si256(almostthere, 1); - _mm_storeu_si128( (__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128); - } + really_inline uint32_t string_to_uint32(const char *str) { return *reinterpret_cast<const uint32_t *>(str); } - template<typename L> - really_inline simd8<L> lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8<L>::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } - }; + WARN_UNUSED + really_inline uint32_t str4ncmp(const uint8_t *src, const char *atom) + { + uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); + } - // Signed bytes - template<> - struct simd8<int8_t> : base8_numeric<int8_t> { - really_inline simd8() : base8_numeric<int8_t>() {} - really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {} - // Splat constructor - really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} - // Member-by-member initialization - really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, - int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, - int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 - ) : simd8(_mm256_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v16,v17,v18,v19,v20,v21,v22,v23, - v24,v25,v26,v27,v28,v29,v30,v31 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<int8_t> repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8<int8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src) + { + return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; + } - // Order-sensitive comparisons - really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); } - really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); } - really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); } - really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); } - }; + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_true_atom(src); + } + else if (len == 4) + { + return 
!str4ncmp(src, "true"); + } + else + { + return false; + } + } - // Unsigned bytes - template<> - struct simd8<uint8_t>: base8_numeric<uint8_t> { - really_inline simd8() : base8_numeric<uint8_t>() {} - really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {} - // Splat constructor - really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Array constructor - really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} - // Member-by-member initialization - really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, - uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, - uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 - ) : simd8(_mm256_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v16,v17,v18,v19,v20,v21,v22,v23, - v24,v25,v26,v27,v28,v29,v30,v31 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<uint8_t> repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8<uint8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15, - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src) + { + return (str4ncmp(src + 1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; + } - // Saturated math - really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); } - really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) + { + if (len > 5) + { + return is_valid_false_atom(src); + } + else if (len == 5) + { + return !str4ncmp(src + 1, "alse"); + } + else + { + return false; + } + } - // Order-specific operations - really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); } - really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } - really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } - really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } - really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } - really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); } + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src) + { + return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; 
+ } - // Bit-specific operations - really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } - really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } - really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } - really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } - really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } - really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); } - really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } - template<int N> - really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } - template<int N> - really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } - // Get one of the bits and make a bitmask out of it. - // e.g. value.get_bit<7>() gets the high bit - template<int N> - really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } - }; + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_null_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "null"); + } + else + { + return false; + } + } - template<typename T> - struct simd8x64 { - static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); - const simd8<T> chunks[NUM_CHUNKS]; + } // namespace atomparsing + } // namespace stage2 + /* end file src/generic/stage2/atomparsing.h */ + /* begin file src/generic/stage2/structural_iterator.h */ + namespace stage2 + { - really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {} - really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {} - really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {} + class structural_iterator + { + public: + const uint8_t *const buf; + uint32_t *current_structural; + dom_parser_implementation &parser; - template <typename F> - static really_inline void each_index(F const& each) { - each(0); - each(1); - } + // Start a structural + really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index) + : buf{_parser.buf}, + current_structural{&_parser.structural_indexes[start_structural_index]}, + parser{_parser} + { + } + // Get the buffer position of the current structural character + really_inline const uint8_t *current() + { + return &buf[*current_structural]; + } + // Get the current structural character + really_inline char current_char() + { + return buf[*current_structural]; + } + // Get the next structural character without advancing + really_inline char peek_next_char() + { + return buf[*(current_structural + 1)]; + } + really_inline char advance_char() + { + current_structural++; + return buf[*current_structural]; + } + really_inline size_t remaining_len() + { + return parser.len - *current_structural; + } - really_inline void compress(uint64_t mask, T * output) const { - uint32_t mask1 = static_cast<uint32_t>(mask); - uint32_t mask2 = static_cast<uint32_t>(mask >> 32); - this->chunks[0].compress(mask1, output); - this->chunks[1].compress(mask2, output + 32 - count_ones(mask1)); - } + really_inline 
bool past_end(uint32_t n_structural_indexes) + { + return current_structural >= &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_end(uint32_t n_structural_indexes) + { + return current_structural == &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_beginning() + { + return current_structural == parser.structural_indexes.get(); + } + }; - really_inline void store(T ptr[64]) const { - this->chunks[0].store(ptr+sizeof(simd8<T>)*0); - this->chunks[1].store(ptr+sizeof(simd8<T>)*1); - } + } // namespace stage2 + /* end file src/generic/stage2/structural_iterator.h */ + /* begin file src/generic/stage2/structural_parser.h */ + // This file contains the common code every implementation uses for stage2 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "simdjson/stage2.h" (this simplifies amalgation) - template <typename F> - really_inline void each(F const& each_chunk) const + namespace stage2 { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - } + namespace + { // Make everything here private - template <typename R=bool, typename F> - really_inline simd8x64<R> map(F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]) - ); - } + /* begin file src/generic/stage2/tape_writer.h */ + struct tape_writer + { + /** The next place to write to tape */ + uint64_t *next_tape_loc; - + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; - template <typename R=bool, typename F> - really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]) - ); - } + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; - template <typename F> - really_inline simd8<T> reduce(F const& reduce_pair) const { - return reduce_pair(this->chunks[0], this->chunks[1]); - } + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; - really_inline uint64_t to_bitmask() const { - uint64_t r_lo = static_cast<uint32_t>(this->chunks[0].to_bitmask()); - uint64_t r_hi = this->chunks[1].to_bitmask(); - return r_lo | (r_hi << 32); - } + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; - really_inline simd8x64<T> bit_or(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a | mask; } ); - } + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; - really_inline uint64_t eq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); - } + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; - really_inline uint64_t lteq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); - } - }; // struct simd8x64<T> + /** + * Skip the number of tape entries necessary to write a double. 
+ */
+ really_inline void skip_double() noexcept;

-} // namespace simdjson::haswell::simd
-UNTARGET_REGION

+ /**
+ * Write a value to a known location on tape.
+ *
+ * Used to go back and write out the start of a container after the container ends.
+ */
+ really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;

-#endif // SIMDJSON_HASWELL_SIMD_H
-/* end file src/haswell/simd.h */
-/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */
-/* haswell/implementation.h already included: #include "haswell/implementation.h" */

+ private:
+ /**
+ * Append both the tape entry, and a supplementary value following it. Used for types that need
+ * all 64 bits, such as double and uint64_t.
+ */
+ template <typename T>
+ really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
+ }; // struct tape_writer

-TARGET_HASWELL
-namespace simdjson::haswell {

+ really_inline void tape_writer::append_s64(int64_t value) noexcept
+ {
+ append2(0, value, internal::tape_type::INT64);
+ }

-using namespace simd;

+ really_inline void tape_writer::append_u64(uint64_t value) noexcept
+ {
+ append(0, internal::tape_type::UINT64);
+ *next_tape_loc = value;
+ next_tape_loc++;
+ }

-struct json_character_block {
- static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);

+ /** Write a double value to tape. */
+ really_inline void tape_writer::append_double(double value) noexcept
+ {
+ append2(0, value, internal::tape_type::DOUBLE);
+ }

- really_inline uint64_t whitespace() const { return _whitespace; }
- really_inline uint64_t op() const { return _op; }
- really_inline uint64_t scalar() { return ~(op() | whitespace()); }

+ really_inline void tape_writer::skip() noexcept
+ {
+ next_tape_loc++;
+ }

- uint64_t _whitespace;
- uint64_t _op;
-};

+ really_inline void tape_writer::skip_large_integer() noexcept
+ {
+ next_tape_loc += 2;
+ }

-really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
- // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
- // we can't use the generic lookup_16.
- auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
- auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');

+ really_inline void tape_writer::skip_double() noexcept
+ {
+ next_tape_loc += 2;
+ }

- // We compute whitespace and op separately. If the code later only uses one or the
- // other, given the fact that all functions are aggressively inlined, we can
- // hope that useless computations will be omitted. This is namely the case when
- // minifying (we only need whitespace).
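
// Illustrative sketch (hypothetical helpers, not simdjson's own API): the
// append()/write() implementations that follow pack an 8-bit tape type into
// the top byte of a 64-bit word and keep a 56-bit payload in the low bits.
// The same layout, spelled out in isolation:

#include <cstdint>

constexpr uint64_t TAPE_PAYLOAD_MASK = 0x00FFFFFFFFFFFFFFULL; // low 56 bits

inline uint64_t pack_tape_entry(uint8_t type, uint64_t payload) {
  return (payload & TAPE_PAYLOAD_MASK) | (uint64_t(type) << 56);
}
inline uint8_t tape_entry_type(uint64_t entry) { return uint8_t(entry >> 56); }
inline uint64_t tape_entry_payload(uint64_t entry) { return entry & TAPE_PAYLOAD_MASK; }

// pack_tape_entry(type, 0) followed by a raw 64-bit word is effectively what
// append2() does for values that need all 64 bits (int64, uint64, double).
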
+ really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept + { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; + } - uint64_t whitespace = in.map([&](simd8<uint8_t> _in) { - return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in)); - }).to_bitmask(); + template <typename T> + really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept + { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; + } - uint64_t op = in.map([&](simd8<uint8_t> _in) { - // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart - return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-',')); - }).to_bitmask(); - return { whitespace, op }; -} + really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept + { + tape_loc = val | ((uint64_t(char(t))) << 56); + } + /* end file src/generic/stage2/tape_writer.h */ -really_inline bool is_ascii(simd8x64<uint8_t> input) { - simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; }); - return !bits.any_bits_set_anywhere(0b10000000u); -} - -really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) { - simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 - simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 - simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); -} - -/* begin file src/generic/buf_block_reader.h */ -// Walks through a buffer in block-sized increments, loading the last part with spaces -template<size_t STEP_SIZE> -struct buf_block_reader { -public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; +#ifdef SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() \ + { \ + &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue \ } - really_inline const uint8_t *full_block() const { - return &buf[idx]; +#define GOTO(address) \ + { \ + goto *(address); \ } - really_inline bool has_remainder() const { - return idx < len; +#define CONTINUE(address) \ + { \ + goto *(address); \ } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); +#else // SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() {'[', 'a', 'e', 'f', '{', 'o'}; +#define GOTO(address) \ + { \ + switch (address) \ + { \ + case '[': \ + goto array_begin; \ + case 'a': \ + goto array_continue; \ + case 'e': \ + goto error; \ + case 'f': \ + goto finish; \ + case '{': \ + goto object_begin; \ + case 'o': \ + goto object_continue; \ + } \ } - really_inline void advance() { - idx += STEP_SIZE; +// For the more constrained end_xxx() situation +#define CONTINUE(address) \ + { \ + switch (address) \ + { \ + case 'a': \ + goto array_continue; \ + case 'o': \ + goto object_continue; \ + case 'f': \ + goto finish; \ + } \ } -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; +#endif // SIMDJSON_USE_COMPUTED_GOTO -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text(const simd8x64<uint8_t> in) { - static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1); - in.store((uint8_t*)buf); - for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64<uint8_t>)] = '\0'; - return buf; -} + struct unified_machine_addresses + { + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; + }; -UNUSED static char * format_mask(uint64_t mask) { - static char *buf = (char*)malloc(64 + 1); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + return addresses.error; \ + } \ } - buf[64] = '\0'; - return buf; -} -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/json_string_scanner.h */ -namespace stage1 { -struct json_string_block { - // Escaped characters (characters following an escape() character) - really_inline uint64_t escaped() const { return _escaped; } - // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \)
- really_inline uint64_t escape() const { return _backslash & ~_escaped; }
- // Real (non-backslashed) quotes
- really_inline uint64_t quote() const { return _quote; }
- // Start quotes of strings
- really_inline uint64_t string_start() const { return _quote & _in_string; }
- // End quotes of strings
- really_inline uint64_t string_end() const { return _quote & ~_in_string; }
- // Only characters inside the string (not including the quotes)
- really_inline uint64_t string_content() const { return _in_string & ~_quote; }
- // Return a mask of whether the given characters are inside a string (only works on non-quotes)
- really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
- // Return a mask of whether the given characters are outside a string (only works on non-quotes)
- really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
- // Tail of string (everything except the start quote)
- really_inline uint64_t string_tail() const { return _in_string ^ _quote; }

+ struct structural_parser : structural_iterator
+ {
+ /** Lets you append to the tape */
+ tape_writer tape;
+ /** Next write location in the string buf for stage 2 parsing */
+ uint8_t *current_string_buf_loc;
+ /** Current depth (nested objects and arrays) */
+ uint32_t depth{0};

- // backslash characters
- uint64_t _backslash;
- // escaped characters (backslashed--does not include the hex characters after \u)
- uint64_t _escaped;
- // real quotes (non-backslashed ones)
- uint64_t _quote;
- // string characters (includes start quote but not end quote)
- uint64_t _in_string;
-};

+ // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
+ really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
+ : structural_iterator(_parser, start_structural_index),
+ tape{parser.doc->tape.get()},
+ current_string_buf_loc{parser.doc->string_buf.get()}
+ {
+ }

-// Scans blocks for string characters, storing the state necessary to do so
-class json_string_scanner {
-public:
- really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
- really_inline error_code finish(bool streaming);

+ WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state)
+ {
+ parser.containing_scope[depth].tape_index = next_tape_index();
+ parser.containing_scope[depth].count = 0;
+ tape.skip(); // We don't actually *write* the start element until the end.
+ parser.ret_address[depth] = continue_state;
+ depth++;
+ bool exceeded_max_depth = depth >= parser.max_depth();
+ if (exceeded_max_depth)
+ {
+ log_error("Exceeded max depth!");
+ }
+ return exceeded_max_depth;
+ }

-private:
- really_inline uint64_t find_escaped(uint64_t escape);

+ WARN_UNUSED really_inline bool start_document(ret_address_t continue_state)
+ {
+ log_start_value("document");
+ return start_scope(continue_state);
+ }

- // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
- uint64_t prev_in_string = 0ULL;
- // Whether the first character of the next iteration is escaped.
- uint64_t prev_escaped = 0ULL;
-};

+ WARN_UNUSED really_inline bool start_object(ret_address_t continue_state)
+ {
+ log_start_value("object");
+ return start_scope(continue_state);
+ }

-//
-// Finds escaped characters (characters following \).
-//
-// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
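
// Illustrative reference (hypothetical helper, not simdjson code): a naive
// per-byte loop that produces the same "escaped characters" mask the
// branch-free routine below computes for one 64-byte block, ignoring the
// carry from a previous block (prev_escaped):

#include <cstdint>

inline uint64_t find_escaped_reference(const char block[64]) {
  uint64_t escaped = 0;
  bool pending_escape = false;            // previous byte was an unescaped backslash
  for (int i = 0; i < 64; i++) {
    if (pending_escape) {
      escaped |= uint64_t(1) << i;        // this byte is escaped
      pending_escape = false;
    } else if (block[i] == '\\') {
      pending_escape = true;
    }
  }
  return escaped;
}
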
-// -// Does this by: -// - Shift the escape mask to get potentially escaped characters (characters after backslashes). -// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) -// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) -// -// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all -// escape sequences, filters out the ones that start on even bits, and adds that to the mask of -// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since -// the start bit causes a carry), and leaves even-bit sequences alone. -// -// Example: -// -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape -// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape -// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later -// invert_mask | | cxxx c xx c| even_seq << 1 -// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit -// escaped | x | x x x x x x x x | -// desired | x | x x x x x x x x | -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// -really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { - // If there was overflow, pretend the first character isn't a backslash - backslash &= ~prev_escaped; - uint64_t follows_escape = backslash << 1 | prev_escaped; + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) + { + log_start_value("array"); + return start_scope(continue_state); + } - // Get sequences starting on even bits by clearing out the odd series using + - const uint64_t even_bits = 0x5555555555555555ULL; - uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; - uint64_t sequences_starting_on_even_bits; - prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); - uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + // this function is responsible for annotating the start of the scope + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept + { + depth--; + // write our doc->tape location to the header scope + // The root scope gets written *at* the previous location. + tape.append(parser.containing_scope[depth].tape_index, end); + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } - // Mask every other backslashed character as an escaped character - // Flip the mask for sequences that start on even bits, to correct them - return (even_bits ^ invert_mask) & follows_escape; -} + really_inline uint32_t next_tape_index() + { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + } -// -// Return a mask of all string characters plus end quotes. -// -// prev_escaped is overflow saying whether the next character is escaped. 
-// prev_in_string is overflow saying whether we're still in a string. -// -// Backslash sequences outside of quotes will be detected in stage 2. -// -really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) { - const uint64_t backslash = in.eq('\\'); - const uint64_t escaped = find_escaped(backslash); - const uint64_t quote = in.eq('"') & ~escaped; - // prefix_xor flips on bits inside the string (and flips off the end quote). - // Then we xor with prev_in_string: if we were in a string already, its effect is flipped - // (characters inside strings are outside, and characters outside strings are inside). - const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; - // right shift of a signed value expected to be well-defined and standard - // compliant as of C++20, John Regher from Utah U. says this is fine code - prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63); - // Use ^ to turn the beginning quote off, and the end quote on. - return { - backslash, - escaped, - quote, - in_string - }; -} + really_inline void end_object() + { + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void end_array() + { + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + really_inline void end_document() + { + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + } -really_inline error_code json_string_scanner::finish(bool streaming) { - if (prev_in_string and (not streaming)) { - return UNCLOSED_STRING; - } - return SUCCESS; -} + // increment_count increments the count of keys in an object or values in an array. + // Note that if you are at the level of the values or elements, the count + // must be increment in the preceding depth (depth-1) where the array or + // the object resides. + really_inline void increment_count() + { + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + } -} // namespace stage1 -/* end file src/generic/json_string_scanner.h */ -/* begin file src/generic/json_scanner.h */ -namespace stage1 { + really_inline uint8_t *on_start_string() noexcept + { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); + } -/** - * A block of scanned json, with information on operators and scalars. - */ -struct json_block { -public: - /** The start of structurals */ - really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } - /** All JSON whitespace (i.e. not in a string) */ - really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } + really_inline void on_end_string(uint8_t *dst) noexcept + { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; + } - // Helpers + WARN_UNUSED really_inline bool parse_string(bool key = false) + { + log_value(key ? "key" : "string"); + uint8_t *dst = on_start_string(); + dst = stringparsing::parse_string(current(), dst); + if (dst == nullptr) + { + log_error("Invalid escape in string"); + return true; + } + on_end_string(dst); + return false; + } - /** Whether the given characters are inside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } - /** Whether the given characters are outside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } + WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) + { + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) + { + log_error("Invalid number"); + } + return !succeeded; + } + WARN_UNUSED really_inline bool parse_number(bool found_minus) + { + return parse_number(current(), found_minus); + } - // string and escape characters - json_string_block _string; - // whitespace, operators, scalars - json_character_block _characters; - // whether the previous character was a scalar - uint64_t _follows_potential_scalar; -private: - // Potential structurals (i.e. disregarding strings) + really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) + { + /** + * We need to make a copy to make sure that the string is space terminated. + * This is not about padding the input, which should already padded up + * to len + SIMDJSON_PADDING. However, we have no control at this stage + * on how the padding was done. What if the input string was padded with nulls? + * It is quite common for an input string to have an extra null character (C string). + * We do not want to allow 9\0 (where \0 is the null character) inside a JSON + * document, but the string "9\0" by itself is fine. So we make a copy and + * pad the input with spaces when we know that there is just one input element. + * This copy is relatively expensive, but it will almost never be called in + * practice unless you are in the strange scenario where you have many JSON + * documents made of single atoms. 
+ */ + uint8_t *copy = static_cast<uint8_t *>(malloc(parser.len + SIMDJSON_PADDING)); + if (copy == nullptr) + { + return true; + } + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + size_t idx = *current_structural; + bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw + free(copy); + return result; + } + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) + { + switch (advance_char()) + { + case '"': + FAIL_IF(parse_string()); + return continue_state; + case 't': + log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(current())); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(current())); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(current())); + tape.append(0, internal::tape_type::NULL_VALUE); + return continue_state; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + FAIL_IF(parse_number(false)); + return continue_state; + case '-': + FAIL_IF(parse_number(true)); + return continue_state; + case '{': + FAIL_IF(start_object(continue_state)); + return addresses.object_begin; + case '[': + FAIL_IF(start_array(continue_state)); + return addresses.array_begin; + default: + log_error("Non-value found when value was expected!"); + return addresses.error; + } + } - /** operators plus scalar starts like 123, true and "abc" */ - really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } - /** the start of non-operator runs, like 123, true and "abc" */ - really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } - /** whether the given character is immediately after a non-operator like 123, true or " */ - really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } -}; + WARN_UNUSED really_inline error_code finish() + { + end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); -/** - * Scans JSON for important bits: operators, strings, and scalars. - * - * The scanner starts by calculating two distinct things: - * - string characters (taking \" into account) - * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") - * - * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: - * in particular, the operator/scalar bit will find plenty of things that are actually part of - * strings. When we're done, json_block will fuse the two together by masking out tokens that are - * part of a string. - */ -class json_scanner { -public: - really_inline json_block next(const simd::simd8x64<uint8_t> in); - really_inline error_code finish(bool streaming); + if (depth != 0) + { + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; + } -private: - // Whether the last character of the previous iteration is part of a scalar token - // (anything except whitespace or an operator). 
- uint64_t prev_scalar = 0ULL; - json_string_scanner string_scanner; -}; + return SUCCESS; + } + WARN_UNUSED really_inline error_code error() + { + /* We do not need the next line because this is done by parser.init_stage2(), + * pessimistically. + * parser.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. */ + if (depth >= parser.max_depth()) + { + return parser.error = DEPTH_ERROR; + } + switch (current_char()) + { + case '"': + return parser.error = STRING_ERROR; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return parser.error = NUMBER_ERROR; + case 't': + return parser.error = T_ATOM_ERROR; + case 'n': + return parser.error = N_ATOM_ERROR; + case 'f': + return parser.error = F_ATOM_ERROR; + default: + return parser.error = TAPE_ERROR; + } + } -// -// Check if the current character immediately follows a matching character. -// -// For example, this checks for quotes with backslashes in front of them: -// -// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); -// -really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { - const uint64_t result = match << 1 | overflow; - overflow = match >> 63; - return result; -} + really_inline void init() + { + log_start(); + parser.error = UNINITIALIZED; + } -// -// Check if the current character follows a matching character, with possible "filler" between. -// For example, this checks for empty curly braces, e.g. 
-// -// in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } -// -really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) { - uint64_t follows_match = follows(match, overflow); - uint64_t result; - overflow |= uint64_t(add_overflow(follows_match, filler, &result)); - return result; -} + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) + { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) + { + return parser.error = EMPTY; + } -really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) { - json_string_block strings = string_scanner.next(in); - json_character_block characters = json_character_block::classify(in); - uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); - return { - strings, - characters, - follows_scalar - }; -} + init(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_state)) + { + return parser.error = DEPTH_ERROR; + } + return SUCCESS; + } -really_inline error_code json_scanner::finish(bool streaming) { - return string_scanner.finish(streaming); -} + really_inline void log_value(const char *type) + { + logger::log_line(*this, "", type, ""); + } -} // namespace stage1 -/* end file src/generic/json_scanner.h */ + static really_inline void log_start() + { + logger::log_start(); + } -/* begin file src/generic/json_minifier.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + really_inline void log_start_value(const char *type) + { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) + { + logger::log_depth++; + } + } -namespace stage1 { + really_inline void log_end_value(const char *type) + { + if (logger::LOG_ENABLED) + { + logger::log_depth--; + } + logger::log_line(*this, "-", type, ""); + } -class json_minifier { -public: - template<size_t STEP_SIZE> - static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + really_inline void log_error(const char *error) + { + logger::log_line(*this, "", "ERROR", error); + } + }; // struct structural_parser -private: - really_inline json_minifier(uint8_t *_dst) : dst{_dst} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block); - really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); - json_scanner scanner; - uint8_t *dst; -}; +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + goto error; \ + } \ + } -really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) { - uint64_t mask = block.whitespace(); - in.compress(mask, dst); - dst += 64 - count_ones(mask); -} + template <bool STREAMING> + WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept + { + dom_parser.doc = &doc; + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::structural_parser parser(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); + if (result) + { + return result; + } -really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { - *dst = '\0'; - error_code error = scanner.finish(false); - if (error) { dst_len = 0; return error; } - dst_len = dst - dst_start; - return SUCCESS; -} + // + // Read first value + // + switch (parser.current_char()) + { + case '{': + FAIL_IF(parser.start_object(addresses.finish)); + goto object_begin; + case '[': + FAIL_IF(parser.start_array(addresses.finish)); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) + { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') + { + goto error; + } + } + goto array_begin; + case '"': + FAIL_IF(parser.parse_string()); + goto finish; + case 't': + parser.log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::NULL_VALUE); + goto finish; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(false)) + { + goto error; + } + } + goto finish; + case '-': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). 
+ { + if (parser.parse_number_with_space_terminated_copy(true)) + { + goto error; + } + } + goto finish; + default: + parser.log_error("Document starts with a non-value character"); + goto error; + } -template<> -really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - simd::simd8x64<uint8_t> in_2(block_buf+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1); - this->next(in_2, block_2); - reader.advance(); -} + // + // Object parser states + // + object_begin: + switch (parser.advance_char()) + { + case '"': + { + parser.increment_count(); + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + } + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("Object does not start with a key"); + goto error; + } -template<> -really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - json_block block_1 = scanner.next(in_1); - this->next(block_buf, block_1); - reader.advance(); -} + object_key_state: + if (parser.advance_char() != ':') + { + parser.log_error("Missing colon after key in object"); + goto error; + } + GOTO(parser.parse_value(addresses, addresses.object_continue)); -template<size_t STEP_SIZE> -error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { - buf_block_reader<STEP_SIZE> reader(buf, len); - json_minifier minifier(dst); - while (reader.has_full_block()) { - minifier.step<STEP_SIZE>(reader.full_block(), reader); - } + object_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + if (parser.advance_char() != '"') + { + parser.log_error("Key string missing at beginning of field in object"); + goto error; + } + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("No comma between object fields"); + goto error; + } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - minifier.step<STEP_SIZE>(block, reader); - } + scope_end: + CONTINUE(parser.parser.ret_address[parser.depth]); - return minifier.finish(dst, dst_len); -} + // + // Array parser states + // + array_begin: + if (parser.peek_next_char() == ']') + { + parser.advance_char(); + parser.end_array(); + goto scope_end; + } + parser.increment_count(); -} // namespace stage1 -/* end file src/generic/json_minifier.h */ -WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { - return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); -} + main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO(parser.parse_value(addresses, addresses.array_continue)); -/* begin file src/generic/utf8_lookup2_algorithm.h */ -// -// Detect Unicode errors. -// -// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic -// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits -// are straight up concatenated into the final value. 
The first byte of a multibyte character is a -// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte -// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just -// start with 0, because that's what ASCII looks like. Here's what each size looks like: -// -// - ASCII (7 bits): 0_______ -// - 2 byte character (11 bits): 110_____ 10______ -// - 3 byte character (17 bits): 1110____ 10______ 10______ -// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ -// - 5+ byte character (illegal): 11111___ <illegal> -// -// There are 5 classes of error that can happen in Unicode: -// -// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). -// We detect this by looking for new characters (lead bytes) inside the range of a multibyte -// character. -// -// e.g. 11000000 01100001 (2-byte character where second byte is ASCII) -// -// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). -// We detect this by requiring that the next byte after your multibyte character be a new -// character--so a continuation after your character is wrong. -// -// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) -// -// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. -// -// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). -// -// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have -// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is -// technically possible, but UTF-8 disallows it so that there is only one way to write an "a". -// -// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) -// -// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and -// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. -// -// e.g. 11101101 10100000 10000000 (U+D800) -// -// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not -// support values with more than 23 bits (which a 4-byte character supports). -// -// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// -// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// -// Code Points 1st 2s 3s 4s -// U+0000..U+007F 00..7F -// U+0080..U+07FF C2..DF 80..BF -// U+0800..U+0FFF E0 A0..BF 80..BF -// U+1000..U+CFFF E1..EC 80..BF 80..BF -// U+D000..U+D7FF ED 80..9F 80..BF -// U+E000..U+FFFF EE..EF 80..BF 80..BF -// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF -// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF -// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -// -using namespace simd; + array_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + parser.log_error("Missing comma between array values"); + goto error; + } -namespace utf8_validation { + finish: + return parser.finish(); - // - // Find special case UTF-8 errors where the character is technically readable (has the right length) - // but the *value* is disallowed. - // - // This includes overlong encodings, surrogates and values too large for Unicode. 
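
// Scalar sketch (hypothetical helper, illustration only) of the value errors
// targeted here, decided from byte 1 and the high bits of byte 2 of a
// multibyte character -- the same 12 bits the lookups below consume. It
// assumes b1 is a multibyte lead and b2 is its first continuation byte:

#include <cstdint>

enum class utf8_value_error { none, overlong, surrogate, too_large };

inline utf8_value_error classify_lead_pair(uint8_t b1, uint8_t b2) {
  if (b1 == 0xC0 || b1 == 0xC1) return utf8_value_error::overlong;  // 2-byte form of U+0000..U+007F
  if (b1 == 0xE0 && b2 < 0xA0)  return utf8_value_error::overlong;  // 3-byte form of < U+0800
  if (b1 == 0xF0 && b2 < 0x90)  return utf8_value_error::overlong;  // 4-byte form of < U+10000
  if (b1 == 0xED && b2 >= 0xA0) return utf8_value_error::surrogate; // U+D800..U+DFFF
  if (b1 == 0xF4 && b2 >= 0x90) return utf8_value_error::too_large; // above U+10FFFF
  if (b1 >= 0xF5)               return utf8_value_error::too_large; // 5+ byte leads
  return utf8_value_error::none;
}
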
- // - // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the - // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a - // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together. - // If all 3 lookups detect the same error, it's an error. - // - really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { - // - // These are the errors we're going to match for bytes 1-2, by looking at the first three - // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> - // - static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) - static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ - static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ - static const int SURROGATE = 0x08; // 11101101 [101_]____ - static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ - static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ + error: + return parser.error(); + } - // After processing the rest of byte 1 (the low bits), we're still not done--we have to check - // byte 2 to be sure which things are errors and which aren't. - // Since high_bits is byte 5, byte 2 is high_bits.prev<3> - static const int CARRY = OVERLONG_2 | TOO_LARGE_2; - const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // Continuations: ________ [10__]____ - CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ - CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____ - // Multibyte Leads: ________ [11__]____ - CARRY, CARRY, CARRY, CARRY - ); + } // namespace + } // namespace stage2 - const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( - // [0___]____ (ASCII) - 0, 0, 0, 0, - 0, 0, 0, 0, - // [10__]____ (continuation) - 0, 0, 0, 0, - // [11__]____ (2+-byte leads) - OVERLONG_2, 0, // [110_]____ (2-byte lead) - OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead) - OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead) - ); + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. 
+ ***********/ + WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept + { + error_code result = stage2::parse_structurals<false>(*this, _doc); + if (result) + { + return result; + } - const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( - // ____[00__] ________ - OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________ - OVERLONG_2, // ____[0001] ________ - 0, 0, - // ____[01__] ________ - TOO_LARGE, // ____[0100] ________ - TOO_LARGE_2, - TOO_LARGE_2, - TOO_LARGE_2, - // ____[10__] ________ - TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, - // ____[11__] ________ - TOO_LARGE_2, - TOO_LARGE_2 | SURROGATE, // ____[1101] ________ - TOO_LARGE_2, TOO_LARGE_2 - ); + // If we didn't make it to the end, it's an error + if (next_structural_index != n_structural_indexes) + { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; + } - return byte_1_high & byte_1_low & byte_2_high; - } + return SUCCESS; + } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. - // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. 
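
// Scalar sketch (hypothetical, illustration only) of the mismatch test
// described above: a byte must be a continuation exactly when one of
// is_second_byte, is_third_byte or is_fourth_byte holds for it.

#include <cstddef>
#include <cstdint>

inline bool utf8_length_mismatch(const uint8_t *buf, size_t len) {
  for (size_t i = 0; i < len; i++) {
    bool is_continuation = (buf[i] & 0xC0) == 0x80;
    bool expects_continuation =
        (i >= 1 && buf[i - 1] >= 0xC0) ||  // 1 byte back: 2-, 3- or 4-byte lead
        (i >= 2 && buf[i - 2] >= 0xE0) ||  // 2 bytes back: 3- or 4-byte lead
        (i >= 3 && buf[i - 3] >= 0xF0);    // 3 bytes back: 4-byte lead
    if (is_continuation != expects_continuation) {
      return true;                         // extra or missing continuation byte
    }
  }
  return false;
}
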
- // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. 
This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3 - // ``` - // - // ### Bringing It All Together: Detecting the Errors - // - // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`. - // All we have left to do is check if they match! - // - // ``` - // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation; - // ``` - // - // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in - // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do - // bitwise operations, and we're only using 1! - // - // Epilogue: Addition For Booleans - // ------------------------------- - // - // There is one big case the above code doesn't explicitly talk about--what if is_second_byte - // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next - // to each other (or any combination), and the continuation could be part of either of them! - // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic. - // - // Never fear, though. 
If that situation occurs, we'll already have detected that the second - // leading byte was an error, because it was supposed to be a part of the preceding multibyte - // character, but it *wasn't a continuation*. - // - // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and - // `&`, which is both interesting and possibly useful (even though we're not using it here). It - // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those - // comparisons were giving us numbers! - // - // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal - // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus, - // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if - // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because - // *any* nonzero value is treated as an error (not just -1), we're just fine here :) - // - // Further, if *more than one* multibyte character overlaps, - // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation` - // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be - // considered an error. - // - // One reason you might want to do this is parallelism. ^ and | are not associative, so - // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. - // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // - really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) { - simd8<uint8_t> prev2 = input.prev<2>(prev_input); - simd8<uint8_t> prev3 = input.prev<3>(prev_input); + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. 
+ ***********/
+ WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept
+ {
+ return stage2::parse_structurals<true>(*this, _doc);
+ }
+ /* end file src/generic/stage2/tape_writer.h */
- // Cont is 10000000-10111111 (-65...-128)
- simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-64);
- // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons
- return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation);
- }
+ WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept
+ {
+ error_code err = stage1(_buf, _len, false);
+ if (err)
+ {
+ return err;
+ }
+ return stage2(_doc);
+ }
- //
- // Return nonzero if there are incomplete multibyte characters at the end of the block:
- // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
- //
- really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
- // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
- // ... 1111____ 111_____ 11______
- static const uint8_t max_array[32] = {
- 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+ } // namespace arm64
+} // namespace simdjson
+/* end file src/generic/stage2/tape_writer.h */
+#endif
+#if SIMDJSON_IMPLEMENTATION_FALLBACK
+/* begin file src/fallback/implementation.cpp */
+/* fallback/implementation.h already included: #include "fallback/implementation.h" */
+/* begin file src/fallback/dom_parser_implementation.h */
+#ifndef SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
+
+/* isadetection.h already included: #include "isadetection.h" */
+
+namespace simdjson
+{
+ namespace fallback
+ {
+
+ /* begin file src/generic/dom_parser_implementation.h */
+ // expectation: sizeof(scope_descriptor) = 64/8.
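Editor's note: the removed comment block above relies on one compact trick: treat bytes as signed so a single compare recognizes UTF-8 continuation bytes, and XOR with 0x80 (the same effect as adding 128) so that lead bytes of each length sit above a fixed signed threshold. The scalar sketch below is an illustration only, not part of either file; the helper names are invented here, and plain int8_t arithmetic stands in for the SIMD lanes.

#include <cassert>
#include <cstdint>

// Continuation bytes are 10xxxxxx, i.e. -65..-128 when read as signed.
static bool is_continuation_byte(uint8_t b) {
  return int8_t(b) < int8_t(-64);
}

// Bias by 0x80 so lead bytes of each length sit above one signed threshold.
static bool is_lead_of_at_least(uint8_t b, int n_bytes) {
  int8_t flipped = int8_t(b ^ 0x80);
  switch (n_bytes) {
    case 2: return flipped > 63;   // 110xxxxx, 1110xxxx or 11110xxx
    case 3: return flipped > 95;   // 1110xxxx or 11110xxx
    case 4: return flipped > 111;  // 11110xxx
    default: return false;
  }
}

int main() {
  assert(is_continuation_byte(0x80) && is_continuation_byte(0xBF));
  assert(!is_continuation_byte('a'));
  assert(is_lead_of_at_least(0xC2, 2) && !is_lead_of_at_least(0xC2, 3));
  assert(is_lead_of_at_least(0xF0, 4));
  return 0;
}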
+ struct scope_descriptor + { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope + }; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO + typedef void *ret_address_t; +#else + typedef char ret_address_t; +#endif + + class dom_parser_implementation final : public internal::dom_parser_implementation + { + public: + /** Tape location of each open { or [ */ + std::unique_ptr<scope_descriptor[]> containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr<ret_address_t[]> ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; }; - const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); - return input.gt_bits(max_value); - } - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8<uint8_t> error; - // The last input we received - simd8<uint8_t> prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8<uint8_t> prev_incomplete; + /* begin file src/generic/stage1/allocate.h */ + namespace stage1 + { + namespace allocate + { - // - // Check whether the current bytes are valid UTF-8. - // - really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8<uint8_t> prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, prev1); - } + // + // Allocates stage 1 internal state and outputs in the parser + // + really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) + { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset(new (std::nothrow) uint32_t[max_structures]); + if (!parser.structural_indexes) + { + return MEMALLOC; + } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; + } - // The only problem that can happen at EOF is that a multibyte character is too short. - really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. 
- this->error |= this->prev_incomplete; - } + } // namespace allocate + } // namespace stage1 + /* end file src/generic/stage1/allocate.h */ + /* begin file src/generic/stage2/allocate.h */ + namespace stage2 + { + namespace allocate + { - really_inline void check_next_input(simd8x64<uint8_t> input) { - if (likely(is_ascii(input))) { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; - } else { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) { - this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]); + // + // Allocates stage 2 internal state and outputs in the parser + // + really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) + { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) + { + return MEMALLOC; + } + return SUCCESS; } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]; + + } // namespace allocate + } // namespace stage2 + /* end file src/generic/stage2/allocate.h */ + + really_inline dom_parser_implementation::dom_parser_implementation() {} + + // Leaving these here so they can be inlined if so desired + WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept + { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) + { + _capacity = 0; + return err; } + _capacity = capacity; + return SUCCESS; } - really_inline error_code errors() { - return this->error.any_bits_set_anywhere() ? 
simdjson::UTF8_ERROR : simdjson::SUCCESS; + WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept + { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) + { + _max_depth = 0; + return err; + } + _max_depth = max_depth; + return SUCCESS; } + /* end file src/generic/stage2/allocate.h */ - }; // struct utf8_checker -} + } // namespace fallback +} // namespace simdjson -using utf8_validation::utf8_checker; -/* end file src/generic/utf8_lookup2_algorithm.h */ -/* begin file src/generic/json_structural_indexer.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) +#endif // SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ -namespace stage1 { +TARGET_HASWELL -class bit_indexer { -public: - uint32_t *tail; +namespace simdjson +{ + namespace fallback + { - really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept + { + dst.reset(new (std::nothrow) dom_parser_implementation()); + if (!dst) + { + return MEMALLOC; + } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; + } - // flatten out values in 'bits' assuming that they are are to have values of idx - // plus their position in the bitvector, and store these indexes at - // base_ptr[base] incrementing base as we go - // will potentially store extra values beyond end of valid bits, so base_ptr - // needs to be large enough to handle this - really_inline void write(uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = count_ones(bits); + } // namespace fallback +} // namespace simdjson - // Do the first 8 all together - for (int i=0; i<8; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - } +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/fallback/dom_parser_implementation.cpp */ +/* fallback/implementation.h already included: #include "fallback/implementation.h" */ +/* fallback/dom_parser_implementation.h already included: #include "fallback/dom_parser_implementation.h" */ - // Do the next 8 all together (we hope in most cases it won't happen at all - // and the branch is easily predicted). - if (unlikely(cnt > 8)) { - for (int i=8; i<16; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); +// +// Stage 1 +// +namespace simdjson +{ + namespace fallback + { + namespace stage1 + { + + /* begin file src/generic/stage1/find_next_document_index.h */ + /** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. 
+ * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ + really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) + { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) + { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) + { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) + { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) + { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; } - // Most files don't have 16+ structurals per block, so we take several basically guaranteed - // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) - // or the start of a value ("abc" true 123) every four characters. 
- if (unlikely(cnt > 16)) { - uint32_t i = 16; - do { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - i++; - } while (i < cnt); + // Skip the last character if it is partial + really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) + { + if (unlikely(len < 3)) + { + switch (len) + { + case 2: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 1 byte left + if (buf[len - 3] >= 0b11110000) + { + return len - 3; + } // 4-byte characters with only 3 bytes left + return len; } - } + /* end file src/generic/stage1/find_next_document_index.h */ - this->tail += cnt; - } -}; + class structural_scanner + { + public: + really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial) + : buf{_parser.buf}, + next_structural_index{_parser.structural_indexes.get()}, + parser{_parser}, + len{static_cast<uint32_t>(_parser.len)}, + partial{_partial} + { + } -class json_structural_indexer { -public: - template<size_t STEP_SIZE> - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + really_inline void add_structural() + { + *next_structural_index = idx; + next_structural_index++; + } -private: - really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + really_inline bool is_continuation(uint8_t c) + { + return (c & 0b11000000) == 0b10000000; + } - json_scanner scanner; - utf8_checker checker{}; - bit_indexer indexer; - uint64_t prev_structurals = 0; - uint64_t unescaped_chars_error = 0; -}; + really_inline void validate_utf8_character() + { + // Continuation + if (unlikely((buf[idx] & 0b01000000) == 0)) + { + // extra continuation + error = UTF8_ERROR; + idx++; + return; + } -really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(idx-64, prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} + // 2-byte + if ((buf[idx] & 0b00100000) == 0) + { + // missing continuation + if (unlikely(idx + 1 > len || !is_continuation(buf[idx + 1]))) + { + if (idx + 1 > len && partial) + { + idx = len; + return; + } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 1100000_ 10______ + if (buf[idx] <= 0b11000001) + { + error = UTF8_ERROR; + } + idx += 2; + return; + } -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals 
- indexer.write(idx-64, prev_structurals); + // 3-byte + if ((buf[idx] & 0b00010000) == 0) + { + // missing continuation + if (unlikely(idx + 2 > len || !is_continuation(buf[idx + 1]) || !is_continuation(buf[idx + 2]))) + { + if (idx + 2 > len && partial) + { + idx = len; + return; + } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11100000 100_____ ________ + if (buf[idx] == 0b11100000 && buf[idx + 1] <= 0b10011111) + { + error = UTF8_ERROR; + } + // surrogates: U+D800-U+DFFF 11101101 101_____ + if (buf[idx] == 0b11101101 && buf[idx + 1] >= 0b10100000) + { + error = UTF8_ERROR; + } + idx += 3; + return; + } - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } + // 4-byte + // missing continuation + if (unlikely(idx + 3 > len || !is_continuation(buf[idx + 1]) || !is_continuation(buf[idx + 2]) || !is_continuation(buf[idx + 3]))) + { + if (idx + 2 > len && partial) + { + idx = len; + return; + } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11110000 1000____ ________ ________ + if (buf[idx] == 0b11110000 && buf[idx + 1] <= 0b10001111) + { + error = UTF8_ERROR; + } + // too large: > U+10FFFF: + // 11110100 (1001|101_)____ + // 1111(1___|011_|0101) 10______ + // also includes 5, 6, 7 and 8 byte characters: + // 11111___ + if (buf[idx] == 0b11110100 && buf[idx + 1] >= 0b10010000) + { + error = UTF8_ERROR; + } + if (buf[idx] >= 0b11110101) + { + error = UTF8_ERROR; + } + idx += 4; + } - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } + really_inline void validate_string() + { + idx++; // skip first quote + while (idx < len && buf[idx] != '"') + { + if (buf[idx] == '\\') + { + idx += 2; + } + else if (unlikely(buf[idx] & 0b10000000)) + { + validate_utf8_character(); + } + else + { + if (buf[idx] < 0x20) + { + error = UNESCAPED_CHARS; + } + idx++; + } + } + if (idx >= len && !partial) + { + error = UNCLOSED_STRING; + } + } - parser.n_structural_indexes = indexer.tail - parser.structural_indexes.get(); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. */ - parser.structural_indexes[parser.n_structural_indexes++] = len; - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); -} + really_inline bool is_whitespace_or_operator(uint8_t c) + { + switch (c) + { + case '{': + case '}': + case '[': + case ']': + case ',': + case ':': + case ' ': + case '\r': + case '\n': + case '\t': + return true; + default: + return false; + } + } -template<> -really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - simd::simd8x64<uint8_t> in_2(block+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1, reader.block_index()); - this->next(in_2, block_2, reader.block_index()+64); - reader.advance(); -} + // + // Parse the entire input in STEP_SIZE-byte chunks. 
+ // + really_inline error_code scan() + { + for (; idx < len; idx++) + { + switch (buf[idx]) + { + // String + case '"': + add_structural(); + validate_string(); + break; + // Operator + case '{': + case '}': + case '[': + case ']': + case ',': + case ':': + add_structural(); + break; + // Whitespace + case ' ': + case '\r': + case '\n': + case '\t': + break; + // Primitive or invalid character (invalid characters will be checked in stage 2) + default: + // Anything else, add the structural and go until we find the next one + add_structural(); + while (idx + 1 < len && !is_whitespace_or_operator(buf[idx + 1])) + { + idx++; + }; + break; + } + } + *next_structural_index = len; + // We pad beyond. + // https://github.com/simdjson/simdjson/issues/906 + next_structural_index[1] = len; + next_structural_index[2] = 0; + parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); + parser.next_structural_index = 0; -template<> -really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - json_block block_1 = scanner.next(in_1); - this->next(in_1, block_1, reader.block_index()); - reader.advance(); -} + if (unlikely(parser.n_structural_indexes == 0)) + { + return EMPTY; + } -// -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. -template<size_t STEP_SIZE> -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } + if (partial) + { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) + { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. 
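Editor's note: find_next_document_index and the partial-buffer check above decide how much of a multi-document batch can be parsed now and how much must wait for more input. The toy below is an illustration only, with invented names; it scans forward and ignores strings, whereas the real code walks the recorded structural indexes backwards, but it shows the same idea: keep the longest prefix in which every opened brace or bracket has been closed again.

#include <cstdio>
#include <cstring>

static size_t last_complete_document_end(const char *batch, size_t len) {
  long depth = 0;
  size_t end = 0;                              // one past the last balanced position
  for (size_t i = 0; i < len; i++) {
    char c = batch[i];
    if (c == '{' || c == '[') depth++;
    if (c == '}' || c == ']') depth--;
    if (depth == 0 && (c == '}' || c == ']')) end = i + 1;
  }
  return end;
}

int main() {
  const char *batch = "[1,2,3] {\"a\":1} [4,5";
  size_t end = last_complete_document_end(batch, strlen(batch));
  printf("complete prefix: %.*s\n", int(end), batch);  // prints: [1,2,3] {"a":1}
  return 0;
}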
+ } + parser.n_structural_indexes = new_structural_indexes; + } - buf_block_reader<STEP_SIZE> reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); - while (reader.has_full_block()) { - indexer.step<STEP_SIZE>(reader.full_block(), reader); - } + return error; + } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step<STEP_SIZE>(block, reader); - } + private: + const uint8_t *buf; + uint32_t *next_structural_index; + dom_parser_implementation &parser; + uint32_t len; + uint32_t idx{0}; + error_code error{SUCCESS}; + bool partial; + }; // structural_scanner - return indexer.finish(parser, reader.block_index(), len, streaming); -} + } // namespace stage1 -} // namespace stage1 -/* end file src/generic/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming); -} + WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept + { + this->buf = _buf; + this->len = _len; + stage1::structural_scanner scanner(*this, partial); + return scanner.scan(); + } -} // namespace simdjson::haswell -UNTARGET_REGION + // big table for the minifier + static uint8_t jump_table[256 * 3] = { + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 
1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + }; -#endif // SIMDJSON_HASWELL_STAGE1_FIND_MARKS_H -/* end file src/generic/json_structural_indexer.h */ -#endif -#if SIMDJSON_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/stage1_find_marks.h */ -#ifndef SIMDJSON_WESTMERE_STAGE1_FIND_MARKS_H -#define SIMDJSON_WESTMERE_STAGE1_FIND_MARKS_H + WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept + { + size_t i = 0, pos = 0; + uint8_t quote = 0; + uint8_t nonescape = 1; -/* begin file src/westmere/bitmask.h */ -#ifndef SIMDJSON_WESTMERE_BITMASK_H -#define SIMDJSON_WESTMERE_BITMASK_H + while (i < len) + { + unsigned char c = buf[i]; + uint8_t *meta = jump_table + 3 * c; -/* begin file src/westmere/intrinsics.h */ -#ifndef SIMDJSON_WESTMERE_INTRINSICS_H -#define SIMDJSON_WESTMERE_INTRINSICS_H + quote = quote ^ (meta[0] & nonescape); + dst[pos] = c; + pos += meta[2] | quote; -#ifdef _MSC_VER -#include <intrin.h> // visual studio -#else -#include <x86intrin.h> // elsewhere -#endif // _MSC_VER + i += 1; + nonescape = uint8_t(~nonescape) | (meta[1]); + } + dst_len = pos; // we intentionally do not work with a reference + // for fear of aliasing + return SUCCESS; + } -#endif // SIMDJSON_WESTMERE_INTRINSICS_H -/* end file src/westmere/intrinsics.h */ + // credit: based on code from Google Fuchsia (Apache Licensed) + WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept + { + const uint8_t *data = (const uint8_t *)buf; + uint64_t pos = 0; + uint64_t next_pos = 0; + uint32_t code_point = 0; + while (pos < len) + { + // check of the next 8 bytes are ascii. 
+ next_pos = pos + 16; + if (next_pos <= len) + { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) + { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + if (byte < 0b10000000) + { + pos++; + continue; + } + else if ((byte & 0b11100000) == 0b11000000) + { + next_pos = pos + 2; + if (next_pos > len) + { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) + { + return false; + } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) + { + return false; + } + } + else if ((byte & 0b11110000) == 0b11100000) + { + next_pos = pos + 3; + if (next_pos > len) + { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) + { + return false; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) + { + return false; + } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) + { + return false; + } + } + else if ((byte & 0b11111000) == 0b11110000) + { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) + { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) + { + return false; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) + { + return false; + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) + { + return false; + } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point < 0xffff || 0x10ffff < code_point) + { + return false; + } + } + else + { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; + } -TARGET_WESTMERE -namespace simdjson::westmere { + } // namespace fallback +} // namespace simdjson // -// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// Stage 2 // -// For example, prefix_xor(00100100) == 00011100 -// -really_inline uint64_t prefix_xor(const uint64_t bitmask) { - // There should be no such thing with a processing supporting avx2 - // but not clmul. - __m128i all_ones = _mm_set1_epi8('\xFF'); - __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); - return _mm_cvtsi128_si64(result); -} +/* begin file src/fallback/stringparsing.h */ +#ifndef SIMDJSON_FALLBACK_STRINGPARSING_H +#define SIMDJSON_FALLBACK_STRINGPARSING_H -} // namespace simdjson::westmere -UNTARGET_REGION +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -#endif // SIMDJSON_WESTMERE_BITMASK_H -/* end file src/westmere/intrinsics.h */ -/* begin file src/westmere/simd.h */ -#ifndef SIMDJSON_WESTMERE_SIMD_H -#define SIMDJSON_WESTMERE_SIMD_H +namespace simdjson +{ + namespace fallback + { -/* simdprune_tables.h already included: #include "simdprune_tables.h" */ -/* begin file src/westmere/bitmanipulation.h */ -#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H -#define SIMDJSON_WESTMERE_BITMANIPULATION_H + // Holds backslashes and quotes locations. 
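Editor's note: validate_utf8 above takes a fast path when a whole block is ASCII, OR-ing the loaded words together and testing the high bit of every byte. Here is that test reduced to a single 8-byte block; this is an illustration only and block_is_ascii is an invented name.

#include <cassert>
#include <cstdint>
#include <cstring>

static bool block_is_ascii(const char *p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));                 // unaligned-safe load of 8 bytes
  return (v & 0x8080808080808080ULL) == 0;  // any byte >= 0x80 sets a high bit
}

int main() {
  assert(block_is_ascii("assert!!"));        // 8 ASCII bytes
  assert(!block_is_ascii("caf\xC3\xA9 !!")); // contains the UTF-8 encoding of "é"
  return 0;
}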
+ struct backslash_and_quote + { + public: + static constexpr uint32_t BYTES_PROCESSED = 1; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); -/* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ + really_inline bool has_quote_first() { return c == '"'; } + really_inline bool has_backslash() { return c == '\\'; } + really_inline int quote_index() { return c == '"' ? 0 : 1; } + really_inline int backslash_index() { return c == '\\' ? 0 : 1; } -TARGET_WESTMERE -namespace simdjson::westmere { + uint8_t c; + }; // struct backslash_and_quote -#ifndef _MSC_VER -// We sometimes call trailing_zero on inputs that are zero, -// but the algorithms do not end up using the returned value. -// Sadly, sanitizers are not smart enough to figure it out. -__attribute__((no_sanitize("undefined"))) // this is deliberate + really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) + { + // store to dest unconditionally - we can overwrite the bits we don't like later + dst[0] = src[0]; + return {src[0]}; + } + + /* begin file src/generic/stage2/stringparsing.h */ + // This file contains the common code every implementation uses + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "stringparsing.h" (this simplifies amalgation) + + namespace stage2 + { + namespace stringparsing + { + + // begin copypasta + // These chars yield themselves: " \ / + // b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab + // u not handled in this table as it's complex + static const uint8_t escape_map[256] = { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x0. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x22, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x2f, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x4. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x5c, + 0, + 0, + 0, // 0x5. + 0, + 0, + 0x08, + 0, + 0, + 0, + 0x0c, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x0a, + 0, // 0x6. + 0, + 0, + 0x0d, + 0, + 0x09, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x7. 
+ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; + + // handle a unicode codepoint + // write appropriate values into dest + // src will advance 6 bytes or 12 bytes + // dest will advance a variable amount (return via pointer) + // return true if the unicode codepoint was valid + // We work in little-endian then swap at write time + WARN_UNUSED + really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) + { + // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + // check for low surrogate for characters outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) + { + if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') + { + return false; + } + uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); + + // if the first code point is invalid we will get here, as we will go past + // the check for being outside the Basic Multilingual plane. If we don't + // find a \u immediately afterwards we fail out anyhow, but if we do, + // this check catches both the case of the first code point being invalid + // or the second code point being invalid. + if ((code_point | code_point_2) >> 16) + { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; + *src_ptr += 6; + } + size_t offset = codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; + } + + WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) + { + src++; + while (1) + { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) + { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) + { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') + { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) + { + return nullptr; + } + } + else + { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. 
I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) + { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } + else + { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; + } + + } // namespace stringparsing + } // namespace stage2 + /* end file src/generic/stage2/stringparsing.h */ + + } // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_STRINGPARSING_H +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/fallback/numberparsing.h */ +#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_H +#define SIMDJSON_FALLBACK_NUMBERPARSING_H + +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ +/* begin file src/fallback/bitmanipulation.h */ +#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H +#define SIMDJSON_FALLBACK_BITMANIPULATION_H + +#include <limits> + +namespace simdjson +{ + namespace fallback + { + +#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) + static inline unsigned char _BitScanForward64(unsigned long *ret, uint64_t x) + { + unsigned long x0 = (unsigned long)x, top, bottom; + _BitScanForward(&top, (unsigned long)(x >> 32)); + _BitScanForward(&bottom, x0); + *ret = x0 ? bottom : 32 + top; + return x != 0; + } + static unsigned char _BitScanReverse64(unsigned long *ret, uint64_t x) + { + unsigned long x1 = (unsigned long)(x >> 32), top, bottom; + _BitScanReverse(&top, x1); + _BitScanReverse(&bottom, (unsigned long)x); + *ret = x1 ? top + 32 : bottom; + return x != 0; + } #endif -/* result might be undefined when input_num is zero */ -really_inline int trailing_zeroes(uint64_t input_num) { + + // We sometimes call trailing_zero on inputs that are zero, + // but the algorithms do not end up using the returned value. + // Sadly, sanitizers are not smart enough to figure it out. + NO_SANITIZE_UNDEFINED + really_inline int trailing_zeroes(uint64_t input_num) + { #ifdef _MSC_VER - unsigned long ret; - // Search the mask data from least significant bit (LSB) - // to the most significant bit (MSB) for a set bit (1). - _BitScanForward64(&ret, input_num); - return (int)ret; -#else - return __builtin_ctzll(input_num); -#endif// _MSC_VER -} + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // _MSC_VER + return __builtin_ctzll(input_num); +#endif // _MSC_VER + } -/* result might be undefined when input_num is zero */ -really_inline uint64_t clear_lowest_bit(uint64_t input_num) { - return input_num & (input_num-1); -} + /* result might be undefined when input_num is zero */ + really_inline uint64_t clear_lowest_bit(uint64_t input_num) + { + return input_num & (input_num - 1); + } -/* result might be undefined when input_num is zero */ -really_inline int leading_zeroes(uint64_t input_num) { + /* result might be undefined when input_num is zero */ + really_inline int leading_zeroes(uint64_t input_num) + { #ifdef _MSC_VER - unsigned long leading_zero = 0; - // Search the mask data from most significant bit (MSB) - // to least significant bit (LSB) for a set bit (1). 
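Editor's note: handle_unicode_codepoint above turns a \uXXXX\uXXXX surrogate pair into a single code point. The arithmetic is easy to lose inside the parsing loop, so it is shown in isolation below; this is an illustration only, combine_surrogates is an invented name, and the caller is assumed to have already validated the D800-DBFF / DC00-DFFF ranges.

#include <cassert>
#include <cstdint>

static uint32_t combine_surrogates(uint32_t high, uint32_t low) {
  // high surrogate supplies the top 10 bits, low surrogate the bottom 10,
  // and the pair encodes a code point above U+FFFF.
  return (((high - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
}

int main() {
  // "\uD83D\uDE00" is U+1F600 (grinning face)
  assert(combine_surrogates(0xD83D, 0xDE00) == 0x1F600);
  return 0;
}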
- if (_BitScanReverse64(&leading_zero, input_num)) - return (int)(63 - leading_zero); - else - return 64; + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; #else - return __builtin_clzll(input_num); -#endif// _MSC_VER -} + return __builtin_clzll(input_num); +#endif // _MSC_VER + } -really_inline int count_ones(uint64_t input_num) { -#ifdef _MSC_VER - // note: we do not support legacy 32-bit Windows - return __popcnt64(input_num);// Visual Studio wants two underscores -#else - return _popcnt64(input_num); + really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) + { + *result = value1 + value2; + return *result < value1; + } + + really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) + { + *result = value1 * value2; + // TODO there must be a faster way + return value2 > 0 && value1 > std::numeric_limits<uint64_t>::max() / value2; + } + + } // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H +/* end file src/fallback/bitmanipulation.h */ +#include <cmath> +#include <limits> + +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); #endif -} -really_inline bool add_overflow(uint64_t value1, uint64_t value2, - uint64_t *result) { -#ifdef _MSC_VER - return _addcarry_u64(0, value1, value2, - reinterpret_cast<unsigned __int64 *>(result)); +namespace simdjson +{ + namespace fallback + { + static inline uint32_t parse_eight_digits_unrolled(const char *chars) + { + uint32_t result = 0; + for (int i = 0; i < 8; i++) + { + result = result * 10 + (chars[i] - '0'); + } + return result; + } + +#define SWAR_NUMBER_PARSING + + /* begin file src/generic/stage2/numberparsing.h */ + namespace stage2 + { + namespace numberparsing + { + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE))) #else - return __builtin_uaddll_overflow(value1, value2, - (unsigned long long *)result); +#define INVALID_NUMBER(SRC) (false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE)) #endif -} -#ifdef _MSC_VER -#pragma intrinsic(_umul128) + // Attempts to compute i * 10^(power) exactly; and if "negative" is + // true, negate the result. + // This function will only work in some cases, when it does not work, success is + // set to false. This should work *most of the time* (like 99% of the time). + // We assume that power is in the [FASTFLOAT_SMALLEST_POWER, + // FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. + really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) + { + // we start with a fast path + // It was described in + // Clinger WD. 
How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." #endif -really_inline bool mul_overflow(uint64_t value1, uint64_t value2, - uint64_t *result) { -#ifdef _MSC_VER - uint64_t high; - *result = _umul128(value1, value2, &high); - return high; +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) + { #else - return __builtin_umulll_overflow(value1, value2, - (unsigned long long *)result); + if (-22 <= power && power <= 22 && i <= 9007199254740991) + { #endif -} + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + double d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) + { + d = d / power_of_ten[-power]; + } + else + { + d = d * power_of_ten[power]; + } + if (negative) + { + d = -d; + } + *success = true; + return d; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It wa + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html -} // namespace simdjson::westmere -UNTARGET_REGION + // The fast path has now failed, so we are failing back on the slower path. -#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H -/* end file src/westmere/bitmanipulation.h */ -/* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if (i == 0) + { + return 0.0; + } + // We are going to need to do some 64-bit arithmetic to get a more precise product. + // We use a table lookup approach. + components c = + power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; + // safe because + // power >= FASTFLOAT_SMALLEST_POWER + // and power <= FASTFLOAT_LARGEST_POWER + // we recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + uint64_t factor_mantissa = c.mantissa; + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. 
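Editor's note: the fast path the comments above describe (Clinger's observation) is easier to read outside the full routine. The sketch below is an illustration only: fast_path and power_of_ten_sketch are invented names, and the table simply lists the exactly representable powers 1e0 through 1e22 that the comment relies on.

#include <cassert>
#include <cstdint>

static const double power_of_ten_sketch[] = {
    1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
    1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};

static bool fast_path(uint64_t i, int power, bool negative, double *out) {
  if (i > 9007199254740991ULL || power < -22 || power > 22) {
    return false;  // defer to the slower path
  }
  double d = double(i);  // lossless: i fits in 53 bits
  d = (power < 0) ? d / power_of_ten_sketch[-power] : d * power_of_ten_sketch[power];
  *out = negative ? -d : d;
  return true;
}

int main() {
  double d;
  assert(fast_path(12345, -2, false, &d) && d == 123.45);
  assert(!fast_path(1, 100, false, &d));  // 1e100 needs the slow path
  return 0;
}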
+ value128 product = full_multiplication(i, factor_mantissa); + uint64_t lower = product.low; + uint64_t upper = product.high; -TARGET_WESTMERE -namespace simdjson::westmere::simd { + // We know that upper has at most one leading zero because + // both i and factor_mantissa have a leading one. This means + // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). - template<typename Child> - struct base { - __m128i value; + // As long as the first 9 bits of "upper" are not "1", then we + // know that we have an exact computed value for the leading + // 55 bits because any imprecision would play out as a +1, in + // the worst case. + if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) + { + uint64_t factor_mantissa_low = + mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; + // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit + // result (three 64-bit values) + product = full_multiplication(i, factor_mantissa_low); + uint64_t product_low = product.low; + uint64_t product_middle2 = product.high; + uint64_t product_middle1 = lower; + uint64_t product_high = upper; + uint64_t product_middle = product_middle1 + product_middle2; + if (product_middle < product_middle1) + { + product_high++; // overflow carry + } + // We want to check whether mantissa *i + i would affect our result. + // This does happen, e.g. with 7.3177701707893310e+15. + if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && + (product_low + i < product_low))) + { // let us be prudent and bail out. + *success = false; + return 0; + } + upper = product_high; + lower = product_middle; + } + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); - // Zero constructor - really_inline base() : value{__m128i()} {} + // Here we have mantissa < (1<<54). - // Conversion from SIMD register - really_inline base(const __m128i _value) : value(_value) {} + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && + ((mantissa & 3) == 1))) + { + // if mantissa & 1 == 1 we might need to round up. + // + // Scenarios: + // 1. We are not in the middle. Then we should round up. + // + // 2. We are right in the middle. Whether we round up depends + // on the last significant bit: if it is "one" then we round + // up (round to even) otherwise, we do not. + // + // So if the last significant bit is 1, we can safely round up. + // Hence we only need to bail out if (mantissa & 3) == 1. + // Otherwise we may need more accuracy or analysis to determine whether + // we are exactly between two floating-point numbers. + // It can be triggered with 1e23. + // Note: because the factor_mantissa and factor_mantissa_low are + // almost always rounded down (except for small positive powers), + // almost always should round up. 
+ *success = false; + return 0; + } - // Conversion to SIMD register - really_inline operator const __m128i&() const { return this->value; } - really_inline operator __m128i&() { return this->value; } + mantissa += mantissa & 1; + mantissa >>= 1; - // Bit operations - really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } - really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } - really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } - really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } - really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } - really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } - really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } - }; + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) + { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + lz--; // undo previous addition + } + mantissa &= ~(1ULL << 52); + uint64_t real_exponent = c.exp - lz; + // we have to check that real_exponent is in range, otherwise we bail out + if (unlikely((real_exponent < 1) || (real_exponent > 2046))) + { + *success = false; + return 0; + } + mantissa |= real_exponent << 52; + mantissa |= (((uint64_t)negative) << 63); + double d; + memcpy(&d, &mantissa, sizeof(d)); + *success = true; + return d; + } // namespace numberparsing - // Forward-declared so they can be used by splat and friends. - template<typename T> - struct simd8; + static bool parse_float_strtod(const char *ptr, double *outDouble) + { + char *endptr; + *outDouble = strtod(ptr, &endptr); + // Some libraries will set errno = ERANGE when the value is subnormal, + // yet we may want to be able to parse subnormal values. + // However, we do not want to tolerate NAN or infinite values. + // + // Values like infinity or NaN are not allowed in the JSON specification. + // If you consume a large value and you map it to "infinity", you will no + // longer be able to serialize back a standard-compliant JSON. And there is + // no realistic application where you might need values so large than they + // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // 10^308 It is an unimaginable large number. There will never be any piece of + // engineering involving as many as 10^308 parts. It is estimated that there + // are about 10^80 atoms in the universe. The estimate for the total number + // of electrons is similar. Using a double-precision floating-point value, we + // can represent easily the number of atoms in the universe. We could also + // represent the number of ways you can pick any three individual atoms at + // random in the universe. If you ever encounter a number much larger than + // 10^308, you know that you have a bug. RapidJSON will reject a document with + // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) + // will flat out throw an exception. 
+ // + if ((endptr == ptr) || (!std::isfinite(*outDouble))) + { + return false; + } + return true; + } - template<typename T, typename Mask=simd8<bool>> - struct base8: base<simd8<T>> { - typedef uint16_t bitmask_t; - typedef uint32_t bitmask2_t; + really_inline bool is_integer(char c) + { + return (c >= '0' && c <= '9'); + // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers + } - really_inline base8() : base<simd8<T>>() {} - really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {} + // check quickly whether the next 8 chars are made of digits + // at a glance, it looks better than Mula's + // http://0x80.pl/articles/swar-digits-validate.html + really_inline bool is_made_of_eight_digits_fast(const char *chars) + { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); + } - really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); } + template <typename W> + bool slow_float_parsing(UNUSED const char *src, W writer) + { + double d; + if (parse_float_strtod(src, &d)) + { + WRITE_DOUBLE(d, (const uint8_t *)src, writer); + return true; + } + return INVALID_NUMBER((const uint8_t *)src); + } - static const int SIZE = sizeof(base<simd8<T>>::value); + really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) + { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const char *const first_after_period = p; + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // There must be at least one digit after the . - template<int N=1> - really_inline simd8<T> prev(const simd8<T> prev_chunk) const { - return _mm_alignr_epi8(*this, prev_chunk, 16 - N); - } - }; + unsigned char digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // might overflow + multiplication by 10 is likely + // cheaper than arbitrary mult. + // we will handle the overflow later +#ifdef SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + // because we have parse_highprecision_float later. 
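// --- Illustrative sketch (editorial aside, not part of the diff) ------------
// A standalone version of the SWAR test used by is_made_of_eight_digits_fast()
// above. Here the caller simply guarantees eight readable bytes, so the
// SIMDJSON_PADDING static_assert is not needed.
#include <cstdint>
#include <cstring>
#include <cassert>

static bool eight_digits_swar(const char *chars) {
  uint64_t val;
  std::memcpy(&val, chars, 8);
  // Every byte must have high nibble 0x3, and adding 6 must keep it at 0x3
  // (i.e. the low nibble was <= 9); OR-ing the two checks gives 0x33 per byte.
  return (((val & 0xF0F0F0F0F0F0F0F0) |
           (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
          0x3333333333333333);
}

int main() {
  assert(eight_digits_swar("12345678"));
  assert(!eight_digits_swar("1234567a")); // 'a' fails the high-nibble check
  return 0;
}
// -----------------------------------------------------------------------------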
+ } + exponent = first_after_period - p; + return true; + } - // SIMD byte mask type (returned by things like eq and gt) - template<> - struct simd8<bool>: base8<bool> { - static really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(-(!!_value)); } + really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) + { + bool neg_exp = false; + if ('-' == *p) + { + neg_exp = true; + ++p; + } + else if ('+' == *p) + { + ++p; + } - really_inline simd8<bool>() : base8() {} - really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {} - // Splat constructor - really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} + // e[+-] must be followed by a number + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + unsigned char digit = static_cast<unsigned char>(*p - '0'); + int64_t exp_number = digit; + p++; + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + while (is_integer(*p)) + { + // we need to check for overflows; we refuse to parse this + if (exp_number > 0x100000000) + { + return INVALID_NUMBER(src); + } + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + exponent += (neg_exp ? -exp_number : exp_number); + return true; + } - really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } - really_inline bool any() const { return !_mm_testz_si128(*this, *this); } - really_inline simd8<bool> operator~() const { return *this ^ true; } - }; + template <typename W> + really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char *start_digits, int digit_count, int64_t exponent, W &writer) + { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // digit count is off by 1 because of the decimal (assuming there was one). + if (unlikely((digit_count - 1 >= 19))) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const char *start = start_digits; + while ((*start == '0') || (*start == '.')) + { + start++; + } + // we over-decrement by one when there is a '.' + digit_count -= int(start - start_digits); + if (digit_count >= 19) + { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; + } + } + // NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! 
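// --- Illustrative sketch (editorial aside, not part of the diff) ------------
// The digit_count >= 19 branch of write_float() above only falls back to slow
// parsing after discounting leading '0' and '.' characters, so a value such as
// 0.00000000000000000012345 stays on the fast path. A standalone version of
// that recount (hypothetical helper name, same subtraction as the diff):
#include <cassert>

static int significant_digit_count(const char *start_digits, int digit_count) {
  const char *start = start_digits;
  while (*start == '0' || *start == '.') { start++; } // skip the "0.000..." prefix
  return digit_count - int(start - start_digits);
}

int main() {
  // "0." followed by twenty zeros and then "12345": 27 characters in total,
  // which trips the >= 19 pre-check, yet only 5 digits are significant, so the
  // slow path is avoided.
  const char digits[] = "0." "00000" "00000" "00000" "00000" "12345";
  assert(significant_digit_count(digits, int(sizeof(digits) - 1)) == 5);
  return 0;
}
// -----------------------------------------------------------------------------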
+ if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) + { + // this is almost never going to get called!!! + // we start anew, going slowly!!! + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; + } + bool success = true; + double d = compute_float_64(exponent, i, negative, &success); + if (!success) + { + // we are almost never going to get here. + if (!parse_float_strtod((const char *)src, &d)) + { + return INVALID_NUMBER(src); + } + } + WRITE_DOUBLE(d, src, writer); + return true; + } - template<typename T> - struct base8_numeric: base8<T> { - static really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); } - static really_inline simd8<T> zero() { return _mm_setzero_si128(); } - static really_inline simd8<T> load(const T values[16]) { - return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values)); - } - // Repeat 16 values as many times as necessary (usually for lookup tables) - static really_inline simd8<T> repeat_16( - T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 - ) { - return simd8<T>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + // parse the number at src + // define JSON_TEST_NUMBERS for unit testing + // + // It is assumed that the number is followed by a structural ({,},],[) character + // or a white space character. If that is not the case (e.g., when the JSON + // document is made of a single number), then it is necessary to copy the + // content and append a space before calling this function. + // + // Our objective is accurate parsing (ULP of 0) at high speed. + template <typename W> + really_inline bool parse_number(UNUSED const uint8_t *const src, + UNUSED bool found_minus, + W &writer) + { +#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes \ + // useful to skip parsing + writer.append_s64(0); // always write zero + return true; // always succeeds +#else + const char *p = reinterpret_cast<const char *>(src); + bool negative = false; + if (found_minus) + { + ++p; + negative = true; + // a negative sign must be followed by an integer + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + } + const char *const start_digits = p; - really_inline base8_numeric() : base8<T>() {} - really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {} + uint64_t i; // an unsigned int avoids signed overflows (which are bad) + if (*p == '0') + { + ++p; + if (is_integer(*p)) + { + return INVALID_NUMBER(src); + } // 0 cannot be followed by an integer + i = 0; + } + else + { + // NOTE: This is a redundant check--either we're negative, in which case we checked whether this + // is a digit above, or the caller already determined we start with a digit. 
But removing this + // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448 + // Please do try yourself, or think of ways to explain it--we'd love to understand :) + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // must start with an integer + unsigned char digit = static_cast<unsigned char>(*p - '0'); + i = digit; + p++; + // the is_made_of_eight_digits_fast routine is unlikely to help here because + // we rarely see large integer parts like 123456789 + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + } - // Store to array - really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) + { + is_float = true; + ++p; + if (!parse_decimal(src, p, i, exponent)) + { + return false; + } + } + int digit_count = int(p - start_digits); // used later to guard against overflows + if (('e' == *p) || ('E' == *p)) + { + is_float = true; + ++p; + if (!parse_exponent(src, p, exponent)) + { + return false; + } + } + if (is_float) + { + return write_float(src, negative, i, start_digits, digit_count, exponent, writer); + } - // Override to distinguish from bool version - really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + int longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) + { + return INVALID_NUMBER(src); + } + if (digit_count == longest_digit_count) + { + // Anything negative above INT64_MAX is either invalid or INT64_MIN. + if (negative && i > uint64_t(INT64_MAX)) + { + // If the number is negative and can't fit in a signed integer, it's invalid. + if (i > uint64_t(INT64_MAX) + 1) + { + return INVALID_NUMBER(src); + } - // Addition/subtraction are the same for signed and unsigned - really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); } - really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); } - really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *(simd8<T>*)this; } - really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *(simd8<T>*)this; } + // If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN). + // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it. + WRITE_INTEGER(INT64_MIN, src, writer); + return is_structural_or_whitespace(*p); + } - // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) - template<typename L> - really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { - return _mm_shuffle_epi8(lookup_table, *this); - } + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". 
+ // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) + { + return INVALID_NUMBER(src); + } + } - // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). - // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes - // get written. - // Design consideration: it seems like a function with the - // signature simd8<L> compress(uint32_t mask) would be - // sensible, but the AVX ISA makes this kind of approach difficult. - template<typename L> - really_inline void compress(uint16_t mask, L * output) const { - // this particular implementation was inspired by work done by @animetosho - // we do it in two steps, first 8 bytes and then second 8 bytes - uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits - uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits - // next line just loads the 64-bit values thintable_epi8[mask1] and - // thintable_epi8[mask2] into a 128-bit register, using only - // two instructions on most compilers. - __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]); - // we increment by 0x08 the second half of the mask - shufmask = - _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); - // this is the version "nearly pruned" - __m128i pruned = _mm_shuffle_epi8(*this, shufmask); - // we still need to put the two halves together. - // we compute the popcount of the first half: - int pop1 = BitsSetTable256mul2[mask1]; - // then load the corresponding mask, what it does is to write - // only the first pop1 bytes from the first 8 bytes, and then - // it fills in with the bytes from the second 8 bytes + some filling - // at the end. - __m128i compactmask = - _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)); - __m128i answer = _mm_shuffle_epi8(pruned, compactmask); - _mm_storeu_si128(( __m128i *)(output), answer); - } + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) + { + WRITE_UNSIGNED(i, src, writer); + } + else + { + WRITE_INTEGER(negative ? 
0 - i : i, src, writer); + } + return is_structural_or_whitespace(*p); - template<typename L> - really_inline simd8<L> lookup_16( - L replace0, L replace1, L replace2, L replace3, - L replace4, L replace5, L replace6, L replace7, - L replace8, L replace9, L replace10, L replace11, - L replace12, L replace13, L replace14, L replace15) const { - return lookup_16(simd8<L>::repeat_16( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - )); - } - }; +#endif // SIMDJSON_SKIPNUMBERPARSING + } - // Signed bytes - template<> - struct simd8<int8_t> : base8_numeric<int8_t> { - really_inline simd8() : base8_numeric<int8_t>() {} - really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {} - // Splat constructor - really_inline simd8(int8_t _value) : simd8(splat(_value)) {} - // Array constructor - really_inline simd8(const int8_t* values) : simd8(load(values)) {} - // Member-by-member initialization - really_inline simd8( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) : simd8(_mm_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<int8_t> repeat_16( - int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 - ) { - return simd8<int8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - ); - } + } // namespace numberparsing + } // namespace stage2 + /* end file src/generic/stage2/numberparsing.h */ - // Order-sensitive comparisons - really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); } - really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); } - really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); } - really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); } - }; + } // namespace fallback - // Unsigned bytes - template<> - struct simd8<uint8_t>: base8_numeric<uint8_t> { - really_inline simd8() : base8_numeric<uint8_t>() {} - really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {} - // Splat constructor - really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} - // Array constructor - really_inline simd8(const uint8_t* values) : simd8(load(values)) {} - // Member-by-member initialization - really_inline simd8( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) : simd8(_mm_setr_epi8( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10,v11,v12,v13,v14,v15 - )) {} - // Repeat 16 values as many times as necessary (usually for lookup tables) - really_inline static simd8<uint8_t> repeat_16( - uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, - uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 - ) { - return simd8<uint8_t>( - v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, 
v10,v11,v12,v13,v14,v15 - ); - } +} // namespace simdjson - // Saturated math - really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); } - really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); } +#endif // SIMDJSON_FALLBACK_NUMBERPARSING_H +/* end file src/generic/stage2/numberparsing.h */ - // Order-specific operations - really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); } - really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); } - // Same as >, but only guarantees true is nonzero (< guarantees true = -1) - really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } - // Same as <, but only guarantees true is nonzero (< guarantees true = -1) - really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } - really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } - really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } - really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } - really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } +namespace simdjson +{ + namespace fallback + { - // Bit-specific operations - really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } - really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } - really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } - really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } - really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } - really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } - really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); } - really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } - template<int N> - really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } - template<int N> - really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } - // Get one of the bits and make a bitmask out of it. - // e.g. value.get_bit<7>() gets the high bit - template<int N> - really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } - }; + /* begin file src/generic/stage2/logger.h */ + // This is for an internal-only stage 2 specific logger. + // Set LOG_ENABLED = true to log what stage 2 is doing! 
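// --- Illustrative sketch (editorial aside, not part of the diff) ------------
// Before the diff moves on to the stage-2 logger below, a quick recap of the
// integer boundaries that parse_number() above reasons about: any 19-digit
// value fits in a uint64_t, a 20-digit value may not, and the magnitude of
// INT64_MIN is INT64_MAX + 1, which cannot be negated inside int64_t and is
// therefore written out as a special case.
#include <cstdint>
#include <cstdio>

int main() {
  std::printf("largest 19-digit value : 9999999999999999999\n");
  std::printf("UINT64_MAX (20 digits) : %llu\n", (unsigned long long)UINT64_MAX);
  std::printf("INT64_MAX              : %lld\n", (long long)INT64_MAX);
  // The magnitude parsed from "-9223372036854775808" is INT64_MAX + 1:
  uint64_t magnitude = uint64_t(INT64_MAX) + 1;
  std::printf("INT64_MIN magnitude    : %llu\n", (unsigned long long)magnitude);
  return 0;
}
// -----------------------------------------------------------------------------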
+ namespace logger + { + static constexpr const char *DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - template<typename T> - struct simd8x64 { - static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); - const simd8<T> chunks[NUM_CHUNKS]; + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; - really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {} - really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {} + static int log_depth; // Not threadsafe. Log only. - really_inline void store(T ptr[64]) const { - this->chunks[0].store(ptr+sizeof(simd8<T>)*0); - this->chunks[1].store(ptr+sizeof(simd8<T>)*1); - this->chunks[2].store(ptr+sizeof(simd8<T>)*2); - this->chunks[3].store(ptr+sizeof(simd8<T>)*3); - } + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) + { + if (c >= 0x20) + { + return c; + } + else + { + return ' '; + } + } - really_inline void compress(uint64_t mask, T * output) const { - this->chunks[0].compress(mask, output); - this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF)); - this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF)); - this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); - } + // Print the header and set up log_start + static really_inline void log_start() + { + if (LOG_ENABLED) + { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN + 2, DASHES, LOG_BUFFER_LEN + 2, DASHES, 4 + 2, DASHES, 4 + 2, DASHES, 5 + 2, DASHES, 5 + 2, DASHES, LOG_DETAIL_LEN + 2, DASHES, LOG_INDEX_LEN + 2, DASHES); + } + } - template <typename F> - static really_inline void each_index(F const& each) { - each(0); - each(1); - each(2); - each(3); - } + static really_inline void log_string(const char *message) + { + if (LOG_ENABLED) + { + printf("%s\n", message); + } + } - template <typename F> - really_inline void each(F const& each_chunk) const + // Logs a single line of + template <typename S> + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) + { + if (LOG_ENABLED) + { + printf("| %*s%s%-*s ", log_depth * 2, "", title_prefix, LOG_EVENT_LEN - log_depth * 2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. 
+ for (int i = 0; i < LOG_BUFFER_LEN; i++) + { + printf("%c", printable_char(structurals.current()[i])); + } + printf(" "); + } + printf("| %c ", printable_char(structurals.current_char())); + printf("| %c ", printable_char(structurals.peek_next_char())); + printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural + 1)]); + printf("| %5u ", structurals.next_tape_index()); + printf("| %-*s ", LOG_DETAIL_LEN, detail); + printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural); + printf("|\n"); + } + } + } // namespace logger + + /* end file src/generic/stage2/logger.h */ + /* begin file src/generic/stage2/atomparsing.h */ + namespace stage2 { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - each_chunk(this->chunks[2]); - each_chunk(this->chunks[3]); - } + namespace atomparsing + { - template <typename F, typename R=bool> - really_inline simd8x64<R> map(F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]), - map_chunk(this->chunks[2]), - map_chunk(this->chunks[3]) - ); - } + really_inline uint32_t string_to_uint32(const char *str) { return *reinterpret_cast<const uint32_t *>(str); } - template <typename F, typename R=bool> - really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const { - return simd8x64<R>( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]), - map_chunk(this->chunks[2], b.chunks[2]), - map_chunk(this->chunks[3], b.chunks[3]) - ); - } + WARN_UNUSED + really_inline uint32_t str4ncmp(const uint8_t *src, const char *atom) + { + uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); + } - template <typename F> - really_inline simd8<T> reduce(F const& reduce_pair) const { - return reduce_pair( - reduce_pair(this->chunks[0], this->chunks[1]), - reduce_pair(this->chunks[2], this->chunks[3]) - ); - } + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src) + { + return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; + } - really_inline uint64_t to_bitmask() const { - uint64_t r0 = static_cast<uint32_t>(this->chunks[0].to_bitmask()); - uint64_t r1 = this->chunks[1].to_bitmask(); - uint64_t r2 = this->chunks[2].to_bitmask(); - uint64_t r3 = this->chunks[3].to_bitmask(); - return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); - } + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_true_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "true"); + } + else + { + return false; + } + } - really_inline simd8x64<T> bit_or(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a | mask; } ); - } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src) + { + return (str4ncmp(src + 1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; + } - really_inline uint64_t eq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); - } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) + { + if (len > 5) + { + return is_valid_false_atom(src); + } + else if (len == 5) + { + return !str4ncmp(src + 1, "alse"); + } + else + { + 
return false; + } + } - really_inline uint64_t lteq(const T m) const { - const simd8<T> mask = simd8<T>::splat(m); - return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); - } - }; // struct simd8x64<T> + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src) + { + return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; + } -} // namespace simdjson::westmere::simd -UNTARGET_REGION + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_null_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "null"); + } + else + { + return false; + } + } -#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H -/* end file src/westmere/bitmanipulation.h */ -/* westmere/bitmanipulation.h already included: #include "westmere/bitmanipulation.h" */ -/* westmere/implementation.h already included: #include "westmere/implementation.h" */ + } // namespace atomparsing + } // namespace stage2 + /* end file src/generic/stage2/atomparsing.h */ + /* begin file src/generic/stage2/structural_iterator.h */ + namespace stage2 + { -TARGET_WESTMERE -namespace simdjson::westmere { + class structural_iterator + { + public: + const uint8_t *const buf; + uint32_t *current_structural; + dom_parser_implementation &parser; -using namespace simd; + // Start a structural + really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index) + : buf{_parser.buf}, + current_structural{&_parser.structural_indexes[start_structural_index]}, + parser{_parser} + { + } + // Get the buffer position of the current structural character + really_inline const uint8_t *current() + { + return &buf[*current_structural]; + } + // Get the current structural character + really_inline char current_char() + { + return buf[*current_structural]; + } + // Get the next structural character without advancing + really_inline char peek_next_char() + { + return buf[*(current_structural + 1)]; + } + really_inline char advance_char() + { + current_structural++; + return buf[*current_structural]; + } + really_inline size_t remaining_len() + { + return parser.len - *current_structural; + } -struct json_character_block { - static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in); + really_inline bool past_end(uint32_t n_structural_indexes) + { + return current_structural >= &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_end(uint32_t n_structural_indexes) + { + return current_structural == &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_beginning() + { + return current_structural == parser.structural_indexes.get(); + } + }; - really_inline uint64_t whitespace() const { return _whitespace; } - really_inline uint64_t op() const { return _op; } - really_inline uint64_t scalar() { return ~(op() | whitespace()); } + } // namespace stage2 + /* end file src/generic/stage2/structural_iterator.h */ + /* begin file src/generic/stage2/structural_parser.h */ + // This file contains the common code every implementation uses for stage2 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "simdjson/stage2.h" (this simplifies amalgation) - uint64_t _whitespace; - uint64_t _op; -}; + namespace stage2 + { + namespace + { // Make everything here private -really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) { - 
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why - // we can't use the generic lookup_16. - auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); - auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); + /* begin file src/generic/stage2/tape_writer.h */ + struct tape_writer + { + /** The next place to write to tape */ + uint64_t *next_tape_loc; - // We compute whitespace and op separately. If the code later only use one or the - // other, given the fact that all functions are aggressively inlined, we can - // hope that useless computations will be omitted. This is namely case when - // minifying (we only need whitespace). + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; - uint64_t whitespace = in.map([&](simd8<uint8_t> _in) { - return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in)); - }).to_bitmask(); + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; - uint64_t op = in.map([&](simd8<uint8_t> _in) { - // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart - return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-',')); - }).to_bitmask(); - return { whitespace, op }; -} + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; -really_inline bool is_ascii(simd8x64<uint8_t> input) { - simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; }); - return !bits.any_bits_set_anywhere(0b10000000u); -} + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; -really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) { - simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 - simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 - simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. - return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); -} + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; -/* begin file src/generic/buf_block_reader.h */ -// Walks through a buffer in block-sized increments, loading the last part with spaces -template<size_t STEP_SIZE> -struct buf_block_reader { -public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text(const simd8x64<uint8_t> in) { - static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1); - in.store((uint8_t*)buf); - for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64<uint8_t>)] = '\0'; - return buf; -} + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; -UNUSED static char * format_mask(uint64_t mask) { - static char *buf = (char*)malloc(64 + 1); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; -} -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/json_string_scanner.h */ -namespace stage1 { + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; -struct json_string_block { - // Escaped characters (characters following an escape() character) - really_inline uint64_t escaped() const { return _escaped; } - // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \) - really_inline uint64_t escape() const { return _backslash & ~_escaped; } - // Real (non-backslashed) quotes - really_inline uint64_t quote() const { return _quote; } - // Start quotes of strings - really_inline uint64_t string_end() const { return _quote & _in_string; } - // End quotes of strings - really_inline uint64_t string_start() const { return _quote & ~_in_string; } - // Only characters inside the string (not including the quotes) - really_inline uint64_t string_content() const { return _in_string & ~_quote; } - // Return a mask of whether the given characters are inside a string (only works on non-quotes) - really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } - // Return a mask of whether the given characters are inside a string (only works on non-quotes) - really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } - // Tail of string (everything except the start quote) - really_inline uint64_t string_tail() const { return _in_string ^ _quote; } + private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. 
+ */ + template <typename T> + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; + }; // struct number_writer - // backslash characters - uint64_t _backslash; - // escaped characters (backslashed--does not include the hex characters after \u) - uint64_t _escaped; - // real quotes (non-backslashed ones) - uint64_t _quote; - // string characters (includes start quote but not end quote) - uint64_t _in_string; -}; + really_inline void tape_writer::append_s64(int64_t value) noexcept + { + append2(0, value, internal::tape_type::INT64); + } -// Scans blocks for string characters, storing the state necessary to do so -class json_string_scanner { -public: - really_inline json_string_block next(const simd::simd8x64<uint8_t> in); - really_inline error_code finish(bool streaming); + really_inline void tape_writer::append_u64(uint64_t value) noexcept + { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; + } -private: - really_inline uint64_t find_escaped(uint64_t escape); + /** Write a double value to tape. */ + really_inline void tape_writer::append_double(double value) noexcept + { + append2(0, value, internal::tape_type::DOUBLE); + } - // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). - uint64_t prev_in_string = 0ULL; - // Whether the first character of the next iteration is escaped. - uint64_t prev_escaped = 0ULL; -}; + really_inline void tape_writer::skip() noexcept + { + next_tape_loc++; + } -// -// Finds escaped characters (characters following \). -// -// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). -// -// Does this by: -// - Shift the escape mask to get potentially escaped characters (characters after backslashes). -// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) -// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) -// -// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all -// escape sequences, filters out the ones that start on even bits, and adds that to the mask of -// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since -// the start bit causes a carry), and leaves even-bit sequences alone. 
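// --- Illustrative sketch (editorial aside, not part of the diff) ------------
// A runnable, scalar version of the escaped-character trick described in the
// comment above (the original's worked text/bitmask table follows below). Bit
// i of the input mask marks a backslash at byte i; the result marks the bytes
// that are escaped, handling runs like \\\" correctly. The cross-block carry
// (prev_escaped) is omitted here for brevity.
#include <cstdint>
#include <cassert>

static uint64_t find_escaped_sketch(uint64_t backslash) {
  const uint64_t even_bits = 0x5555555555555555ULL;
  uint64_t follows_escape = backslash << 1;
  uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
  uint64_t sequences_starting_on_even_bits = odd_sequence_starts + backslash;
  uint64_t invert_mask = sequences_starting_on_even_bits << 1;
  return (even_bits ^ invert_mask) & follows_escape;
}

int main() {
  // Bytes: \ \ " ...  (bits 0 and 1 are backslashes): the second backslash is
  // escaped by the first, so the quote at bit 2 is NOT escaped.
  assert(find_escaped_sketch(0b011) == 0b010);
  // Bytes: \ " ...  (single backslash at bit 0): the quote at bit 1 IS escaped.
  assert(find_escaped_sketch(0b001) == 0b010);
  return 0;
}
// -----------------------------------------------------------------------------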
-// -// Example: -// -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape -// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape -// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later -// invert_mask | | cxxx c xx c| even_seq << 1 -// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit -// escaped | x | x x x x x x x x | -// desired | x | x x x x x x x x | -// text | \\\ | \\\"\\\" \\\" \\"\\" | -// -really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { - // If there was overflow, pretend the first character isn't a backslash - backslash &= ~prev_escaped; - uint64_t follows_escape = backslash << 1 | prev_escaped; + really_inline void tape_writer::skip_large_integer() noexcept + { + next_tape_loc += 2; + } - // Get sequences starting on even bits by clearing out the odd series using + - const uint64_t even_bits = 0x5555555555555555ULL; - uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; - uint64_t sequences_starting_on_even_bits; - prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); - uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + really_inline void tape_writer::skip_double() noexcept + { + next_tape_loc += 2; + } - // Mask every other backslashed character as an escaped character - // Flip the mask for sequences that start on even bits, to correct them - return (even_bits ^ invert_mask) & follows_escape; -} + really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept + { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; + } -// -// Return a mask of all string characters plus end quotes. -// -// prev_escaped is overflow saying whether the next character is escaped. -// prev_in_string is overflow saying whether we're still in a string. -// -// Backslash sequences outside of quotes will be detected in stage 2. -// -really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) { - const uint64_t backslash = in.eq('\\'); - const uint64_t escaped = find_escaped(backslash); - const uint64_t quote = in.eq('"') & ~escaped; - // prefix_xor flips on bits inside the string (and flips off the end quote). - // Then we xor with prev_in_string: if we were in a string already, its effect is flipped - // (characters inside strings are outside, and characters outside strings are inside). - const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; - // right shift of a signed value expected to be well-defined and standard - // compliant as of C++20, John Regher from Utah U. says this is fine code - prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63); - // Use ^ to turn the beginning quote off, and the end quote on. 
- return { - backslash, - escaped, - quote, - in_string - }; -} + template <typename T> + really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept + { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; + } -really_inline error_code json_string_scanner::finish(bool streaming) { - if (prev_in_string and (not streaming)) { - return UNCLOSED_STRING; + really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept + { + tape_loc = val | ((uint64_t(char(t))) << 56); + } + /* end file src/generic/stage2/tape_writer.h */ + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() \ + { \ + &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue \ } - return SUCCESS; -} +#define GOTO(address) \ + { \ + goto *(address); \ + } +#define CONTINUE(address) \ + { \ + goto *(address); \ + } +#else // SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() {'[', 'a', 'e', 'f', '{', 'o'}; +#define GOTO(address) \ + { \ + switch (address) \ + { \ + case '[': \ + goto array_begin; \ + case 'a': \ + goto array_continue; \ + case 'e': \ + goto error; \ + case 'f': \ + goto finish; \ + case '{': \ + goto object_begin; \ + case 'o': \ + goto object_continue; \ + } \ + } +// For the more constrained end_xxx() situation +#define CONTINUE(address) \ + { \ + switch (address) \ + { \ + case 'a': \ + goto array_continue; \ + case 'o': \ + goto object_continue; \ + case 'f': \ + goto finish; \ + } \ + } +#endif // SIMDJSON_USE_COMPUTED_GOTO -} // namespace stage1 -/* end file src/generic/json_string_scanner.h */ -/* begin file src/generic/json_scanner.h */ -namespace stage1 { + struct unified_machine_addresses + { + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; + }; -/** - * A block of scanned json, with information on operators and scalars. - */ -struct json_block { -public: - /** The start of structurals */ - really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } - /** All JSON whitespace (i.e. 
not in a string) */ - really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + return addresses.error; \ + } \ + } - // Helpers + struct structural_parser : structural_iterator + { + /** Lets you append to the tape */ + tape_writer tape; + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; - /** Whether the given characters are inside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } - /** Whether the given characters are outside a string (only works on non-quotes) */ - really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} + { + } - // string and escape characters - json_string_block _string; - // whitespace, operators, scalars - json_character_block _characters; - // whether the previous character was a scalar - uint64_t _follows_potential_scalar; -private: - // Potential structurals (i.e. disregarding strings) + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) + { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; + depth++; + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) + { + log_error("Exceeded max depth!"); + } + return exceeded_max_depth; + } - /** operators plus scalar starts like 123, true and "abc" */ - really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } - /** the start of non-operator runs, like 123, true and "abc" */ - really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } - /** whether the given character is immediately after a non-operator like 123, true or " */ - really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } -}; + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) + { + log_start_value("document"); + return start_scope(continue_state); + } -/** - * Scans JSON for important bits: operators, strings, and scalars. - * - * The scanner starts by calculating two distinct things: - * - string characters (taking \" into account) - * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") - * - * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: - * in particular, the operator/scalar bit will find plenty of things that are actually part of - * strings. When we're done, json_block will fuse the two together by masking out tokens that are - * part of a string. 
- */ -class json_scanner { -public: - really_inline json_block next(const simd::simd8x64<uint8_t> in); - really_inline error_code finish(bool streaming); + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) + { + log_start_value("object"); + return start_scope(continue_state); + } -private: - // Whether the last character of the previous iteration is part of a scalar token - // (anything except whitespace or an operator). - uint64_t prev_scalar = 0ULL; - json_string_scanner string_scanner; -}; + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) + { + log_start_value("array"); + return start_scope(continue_state); + } + // this function is responsible for annotating the start of the scope + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept + { + depth--; + // write our doc->tape location to the header scope + // The root scope gets written *at* the previous location. + tape.append(parser.containing_scope[depth].tape_index, end); + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } -// -// Check if the current character immediately follows a matching character. -// -// For example, this checks for quotes with backslashes in front of them: -// -// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); -// -really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { - const uint64_t result = match << 1 | overflow; - overflow = match >> 63; - return result; -} + really_inline uint32_t next_tape_index() + { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + } -// -// Check if the current character follows a matching character, with possible "filler" between. -// For example, this checks for empty curly braces, e.g. -// -// in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } -// -really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) { - uint64_t follows_match = follows(match, overflow); - uint64_t result; - overflow |= uint64_t(add_overflow(follows_match, filler, &result)); - return result; -} + really_inline void end_object() + { + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void end_array() + { + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + really_inline void end_document() + { + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + } -really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) { - json_string_block strings = string_scanner.next(in); - json_character_block characters = json_character_block::classify(in); - uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); - return { - strings, - characters, - follows_scalar - }; -} + // increment_count increments the count of keys in an object or values in an array. 
+ // Note that if you are at the level of the values or elements, the count + // must be increment in the preceding depth (depth-1) where the array or + // the object resides. + really_inline void increment_count() + { + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + } -really_inline error_code json_scanner::finish(bool streaming) { - return string_scanner.finish(streaming); -} + really_inline uint8_t *on_start_string() noexcept + { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); + } -} // namespace stage1 -/* end file src/generic/json_scanner.h */ + really_inline void on_end_string(uint8_t *dst) noexcept + { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; + } -/* begin file src/generic/json_minifier.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + WARN_UNUSED really_inline bool parse_string(bool key = false) + { + log_value(key ? "key" : "string"); + uint8_t *dst = on_start_string(); + dst = stringparsing::parse_string(current(), dst); + if (dst == nullptr) + { + log_error("Invalid escape in string"); + return true; + } + on_end_string(dst); + return false; + } -namespace stage1 { + WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) + { + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) + { + log_error("Invalid number"); + } + return !succeeded; + } + WARN_UNUSED really_inline bool parse_number(bool found_minus) + { + return parse_number(current(), found_minus); + } -class json_minifier { -public: - template<size_t STEP_SIZE> - static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) + { + /** + * We need to make a copy to make sure that the string is space terminated. + * This is not about padding the input, which should already padded up + * to len + SIMDJSON_PADDING. However, we have no control at this stage + * on how the padding was done. What if the input string was padded with nulls? + * It is quite common for an input string to have an extra null character (C string). + * We do not want to allow 9\0 (where \0 is the null character) inside a JSON + * document, but the string "9\0" by itself is fine. So we make a copy and + * pad the input with spaces when we know that there is just one input element. + * This copy is relatively expensive, but it will almost never be called in + * practice unless you are in the strange scenario where you have many JSON + * documents made of single atoms. 
+ */ + uint8_t *copy = static_cast<uint8_t *>(malloc(parser.len + SIMDJSON_PADDING)); + if (copy == nullptr) + { + return true; + } + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + size_t idx = *current_structural; + bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw + free(copy); + return result; + } + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) + { + switch (advance_char()) + { + case '"': + FAIL_IF(parse_string()); + return continue_state; + case 't': + log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(current())); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(current())); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(current())); + tape.append(0, internal::tape_type::NULL_VALUE); + return continue_state; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + FAIL_IF(parse_number(false)); + return continue_state; + case '-': + FAIL_IF(parse_number(true)); + return continue_state; + case '{': + FAIL_IF(start_object(continue_state)); + return addresses.object_begin; + case '[': + FAIL_IF(start_array(continue_state)); + return addresses.array_begin; + default: + log_error("Non-value found when value was expected!"); + return addresses.error; + } + } -private: - really_inline json_minifier(uint8_t *_dst) : dst{_dst} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block); - really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); - json_scanner scanner; - uint8_t *dst; -}; + WARN_UNUSED really_inline error_code finish() + { + end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); -really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) { - uint64_t mask = block.whitespace(); - in.compress(mask, dst); - dst += 64 - count_ones(mask); -} + if (depth != 0) + { + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; + } -really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { - *dst = '\0'; - error_code error = scanner.finish(false); - if (error) { dst_len = 0; return error; } - dst_len = dst - dst_start; - return SUCCESS; -} + return SUCCESS; + } -template<> -really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - simd::simd8x64<uint8_t> in_2(block_buf+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1); - this->next(in_2, block_2); - reader.advance(); -} + WARN_UNUSED really_inline error_code error() + { + /* We do not need the next line because this is done by parser.init_stage2(), + * pessimistically. + * parser.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. 
+ * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. */ + if (depth >= parser.max_depth()) + { + return parser.error = DEPTH_ERROR; + } + switch (current_char()) + { + case '"': + return parser.error = STRING_ERROR; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return parser.error = NUMBER_ERROR; + case 't': + return parser.error = T_ATOM_ERROR; + case 'n': + return parser.error = N_ATOM_ERROR; + case 'f': + return parser.error = F_ATOM_ERROR; + default: + return parser.error = TAPE_ERROR; + } + } -template<> -really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block_buf); - json_block block_1 = scanner.next(in_1); - this->next(block_buf, block_1); - reader.advance(); -} + really_inline void init() + { + log_start(); + parser.error = UNINITIALIZED; + } -template<size_t STEP_SIZE> -error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { - buf_block_reader<STEP_SIZE> reader(buf, len); - json_minifier minifier(dst); - while (reader.has_full_block()) { - minifier.step<STEP_SIZE>(reader.full_block(), reader); - } + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) + { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) + { + return parser.error = EMPTY; + } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - minifier.step<STEP_SIZE>(block, reader); + init(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_state)) + { + return parser.error = DEPTH_ERROR; + } + return SUCCESS; + } + + really_inline void log_value(const char *type) + { + logger::log_line(*this, "", type, ""); + } + + static really_inline void log_start() + { + logger::log_start(); + } + + really_inline void log_start_value(const char *type) + { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) + { + logger::log_depth++; + } + } + + really_inline void log_end_value(const char *type) + { + if (logger::LOG_ENABLED) + { + logger::log_depth--; + } + logger::log_line(*this, "-", type, ""); + } + + really_inline void log_error(const char *error) + { + logger::log_line(*this, "", "ERROR", error); + } + }; // struct structural_parser + +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + goto error; \ + } \ } - return minifier.finish(dst, dst_len); -} + template <bool STREAMING> + WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept + { + dom_parser.doc = &doc; + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::structural_parser parser(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); + if (result) + { + return result; + } -} // namespace stage1 -/* end file src/generic/json_minifier.h */ -WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { - return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); -} + // + // Read first value + // + switch (parser.current_char()) + { + case '{': + FAIL_IF(parser.start_object(addresses.finish)); + goto object_begin; + case '[': + FAIL_IF(parser.start_array(addresses.finish)); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) + { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') + { + goto error; + } + } + goto array_begin; + case '"': + FAIL_IF(parser.parse_string()); + goto finish; + case 't': + parser.log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::NULL_VALUE); + goto finish; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(false)) + { + goto error; + } + } + goto finish; + case '-': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(true)) + { + goto error; + } + } + goto finish; + default: + parser.log_error("Document starts with a non-value character"); + goto error; + } -/* begin file src/generic/utf8_lookup2_algorithm.h */ -// -// Detect Unicode errors. -// -// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic -// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits -// are straight up concatenated into the final value. The first byte of a multibyte character is a -// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte -// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just -// start with 0, because that's what ASCII looks like. 
Here's what each size looks like: -// -// - ASCII (7 bits): 0_______ -// - 2 byte character (11 bits): 110_____ 10______ -// - 3 byte character (17 bits): 1110____ 10______ 10______ -// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ -// - 5+ byte character (illegal): 11111___ <illegal> -// -// There are 5 classes of error that can happen in Unicode: -// -// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). -// We detect this by looking for new characters (lead bytes) inside the range of a multibyte -// character. -// -// e.g. 11000000 01100001 (2-byte character where second byte is ASCII) -// -// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). -// We detect this by requiring that the next byte after your multibyte character be a new -// character--so a continuation after your character is wrong. -// -// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) -// -// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. -// -// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). -// -// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have -// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is -// technically possible, but UTF-8 disallows it so that there is only one way to write an "a". -// -// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) -// -// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and -// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. -// -// e.g. 11101101 10100000 10000000 (U+D800) -// -// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not -// support values with more than 23 bits (which a 4-byte character supports). -// -// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// -// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// -// Code Points 1st 2s 3s 4s -// U+0000..U+007F 00..7F -// U+0080..U+07FF C2..DF 80..BF -// U+0800..U+0FFF E0 A0..BF 80..BF -// U+1000..U+CFFF E1..EC 80..BF 80..BF -// U+D000..U+D7FF ED 80..9F 80..BF -// U+E000..U+FFFF EE..EF 80..BF 80..BF -// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF -// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF -// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -// -using namespace simd; + // + // Object parser states + // + object_begin: + switch (parser.advance_char()) + { + case '"': + { + parser.increment_count(); + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + } + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("Object does not start with a key"); + goto error; + } -namespace utf8_validation { + object_key_state: + if (parser.advance_char() != ':') + { + parser.log_error("Missing colon after key in object"); + goto error; + } + GOTO(parser.parse_value(addresses, addresses.object_continue)); - // - // Find special case UTF-8 errors where the character is technically readable (has the right length) - // but the *value* is disallowed. - // - // This includes overlong encodings, surrogates and values too large for Unicode. - // - // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the - // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). 
This algorithm does a - // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together. - // If all 3 lookups detect the same error, it's an error. - // - really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { - // - // These are the errors we're going to match for bytes 1-2, by looking at the first three - // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> - // - static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) - static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ - static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ - static const int SURROGATE = 0x08; // 11101101 [101_]____ - static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ - static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ + object_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + if (parser.advance_char() != '"') + { + parser.log_error("Key string missing at beginning of field in object"); + goto error; + } + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("No comma between object fields"); + goto error; + } - // After processing the rest of byte 1 (the low bits), we're still not done--we have to check - // byte 2 to be sure which things are errors and which aren't. - // Since high_bits is byte 5, byte 2 is high_bits.prev<3> - static const int CARRY = OVERLONG_2 | TOO_LARGE_2; - const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // ASCII: ________ [0___]____ - CARRY, CARRY, CARRY, CARRY, - // Continuations: ________ [10__]____ - CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ - CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____ - CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____ - // Multibyte Leads: ________ [11__]____ - CARRY, CARRY, CARRY, CARRY - ); + scope_end: + CONTINUE(parser.parser.ret_address[parser.depth]); - const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( - // [0___]____ (ASCII) - 0, 0, 0, 0, - 0, 0, 0, 0, - // [10__]____ (continuation) - 0, 0, 0, 0, - // [11__]____ (2+-byte leads) - OVERLONG_2, 0, // [110_]____ (2-byte lead) - OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead) - OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead) - ); + // + // Array parser states + // + array_begin: + if (parser.peek_next_char() == ']') + { + parser.advance_char(); + parser.end_array(); + goto scope_end; + } + parser.increment_count(); - const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( - // ____[00__] ________ - OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________ - OVERLONG_2, // ____[0001] ________ - 0, 0, - // ____[01__] ________ - TOO_LARGE, // ____[0100] ________ - TOO_LARGE_2, - TOO_LARGE_2, - TOO_LARGE_2, - // ____[10__] ________ - TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, - // ____[11__] ________ - TOO_LARGE_2, - TOO_LARGE_2 | SURROGATE, // ____[1101] ________ - TOO_LARGE_2, TOO_LARGE_2 - ); + main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + 
GOTO(parser.parse_value(addresses, addresses.array_continue)); - return byte_1_high & byte_1_low & byte_2_high; - } + array_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + parser.log_error("Missing comma between array values"); + goto error; + } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. - // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. - // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). 
- // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. 
This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3 - // ``` - // - // ### Bringing It All Together: Detecting the Errors - // - // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`. - // All we have left to do is check if they match! - // - // ``` - // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation; - // ``` - // - // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in - // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do - // bitwise operations, and we're only using 1! - // - // Epilogue: Addition For Booleans - // ------------------------------- - // - // There is one big case the above code doesn't explicitly talk about--what if is_second_byte - // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next - // to each other (or any combination), and the continuation could be part of either of them! - // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic. - // - // Never fear, though. 
If that situation occurs, we'll already have detected that the second - // leading byte was an error, because it was supposed to be a part of the preceding multibyte - // character, but it *wasn't a continuation*. - // - // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and - // `&`, which is both interesting and possibly useful (even though we're not using it here). It - // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those - // comparisons were giving us numbers! - // - // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal - // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus, - // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if - // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because - // *any* nonzero value is treated as an error (not just -1), we're just fine here :) - // - // Further, if *more than one* multibyte character overlaps, - // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation` - // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be - // considered an error. - // - // One reason you might want to do this is parallelism. ^ and | are not associative, so - // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. - // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // - really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) { - simd8<uint8_t> prev2 = input.prev<2>(prev_input); - simd8<uint8_t> prev3 = input.prev<3>(prev_input); + finish: + return parser.finish(); - // Cont is 10000000-101111111 (-65...-128) - simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-64); - // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons - return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation); - } + error: + return parser.error(); + } - // - // Return nonzero if there are incomplete multibyte characters at the end of the block: - // e.g. if there is a 4-byte character, but it's 3 bytes from the end. 
- // - really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) { - // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = { - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 - }; - const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); - return input.gt_bits(max_value); - } + } // namespace + } // namespace stage2 - struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8<uint8_t> error; - // The last input we received - simd8<uint8_t> prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast path) - simd8<uint8_t> prev_incomplete; + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ + WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept + { + error_code result = stage2::parse_structurals<false>(*this, _doc); + if (result) + { + return result; + } - // - // Check whether the current bytes are valid UTF-8. - // - really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes - // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) - simd8<uint8_t> prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, prev1); + // If we didn't make it to the end, it's an error + if (next_structural_index != n_structural_indexes) + { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; + } + + return SUCCESS; } - // The only problem that can happen at EOF is that a multibyte character is too short. - really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. - this->error |= this->prev_incomplete; + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ + WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept + { + return stage2::parse_structurals<true>(*this, _doc); } + /* end file src/generic/stage2/tape_writer.h */ - really_inline void check_next_input(simd8x64<uint8_t> input) { - if (likely(is_ascii(input))) { - // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't - // possibly finish them. 
- this->error |= this->prev_incomplete; - } else { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) { - this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]); - } - this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]); - this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]; + WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept + { + error_code err = stage1(_buf, _len, false); + if (err) + { + return err; } + return stage2(_doc); } - really_inline error_code errors() { - return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; - } + } // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +/* begin file src/haswell/implementation.cpp */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ +/* begin file src/haswell/dom_parser_implementation.h */ +#ifndef SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H - }; // struct utf8_checker -} +/* isadetection.h already included: #include "isadetection.h" */ -using utf8_validation::utf8_checker; -/* end file src/generic/utf8_lookup2_algorithm.h */ -/* begin file src/generic/json_structural_indexer.h */ -// This file contains the common code every implementation uses in stage1 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is included already includes -// "simdjson/stage1_find_marks.h" (this simplifies amalgation) +namespace simdjson +{ + namespace haswell + { -namespace stage1 { + /* begin file src/generic/dom_parser_implementation.h */ + // expectation: sizeof(scope_descriptor) = 64/8. + struct scope_descriptor + { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope + }; // struct scope_descriptor -class bit_indexer { -public: - uint32_t *tail; +#ifdef SIMDJSON_USE_COMPUTED_GOTO + typedef void *ret_address_t; +#else + typedef char ret_address_t; +#endif - really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + class dom_parser_implementation final : public internal::dom_parser_implementation + { + public: + /** Tape location of each open { or [ */ + std::unique_ptr<scope_descriptor[]> containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr<ret_address_t[]> ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; - // flatten out values in 'bits' assuming that they are are to have values of idx - // plus their position in the bitvector, and store these indexes at - // base_ptr[base] incrementing base as we go - // will potentially store extra values beyond end of valid bits, so base_ptr - // needs to be large enough to handle this - really_inline void write(uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. 
- if (bits == 0) - return; - uint32_t cnt = count_ones(bits); + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; - // Do the first 8 all together - for (int i=0; i<8; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - } + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; + }; - // Do the next 8 all together (we hope in most cases it won't happen at all - // and the branch is easily predicted). - if (unlikely(cnt > 8)) { - for (int i=8; i<16; i++) { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - } + /* begin file src/generic/stage1/allocate.h */ + namespace stage1 + { + namespace allocate + { - // Most files don't have 16+ structurals per block, so we take several basically guaranteed - // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) - // or the start of a value ("abc" true 123) every four characters. - if (unlikely(cnt > 16)) { - uint32_t i = 16; - do { - this->tail[i] = idx + trailing_zeroes(bits); - bits = clear_lowest_bit(bits); - i++; - } while (i < cnt); - } - } + // + // Allocates stage 1 internal state and outputs in the parser + // + really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) + { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset(new (std::nothrow) uint32_t[max_structures]); + if (!parser.structural_indexes) + { + return MEMALLOC; + } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; + } - this->tail += cnt; - } -}; + } // namespace allocate + } // namespace stage1 + /* end file src/generic/stage1/allocate.h */ + /* begin file src/generic/stage2/allocate.h */ + namespace stage2 + { + namespace allocate + { -class json_structural_indexer { -public: - template<size_t STEP_SIZE> - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + // + // Allocates stage 2 internal state and outputs in the parser + // + really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) + { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); -private: - really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} - template<size_t STEP_SIZE> - really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; - really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + if (!parser.ret_address || !parser.containing_scope) + { + return MEMALLOC; + } + return SUCCESS; + } - json_scanner scanner; - utf8_checker checker{}; - bit_indexer indexer; - uint64_t prev_structurals = 
0; - uint64_t unescaped_chars_error = 0; -}; + } // namespace allocate + } // namespace stage2 + /* end file src/generic/stage2/allocate.h */ -really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(idx-64, prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} + really_inline dom_parser_implementation::dom_parser_implementation() {} -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals - indexer.write(idx-64, prev_structurals); + // Leaving these here so they can be inlined if so desired + WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept + { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) + { + _capacity = 0; + return err; + } + _capacity = capacity; + return SUCCESS; + } - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } + WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept + { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) + { + _max_depth = 0; + return err; + } + _max_depth = max_depth; + return SUCCESS; + } + /* end file src/generic/stage2/allocate.h */ - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } + } // namespace haswell +} // namespace simdjson - parser.n_structural_indexes = indexer.tail - parser.structural_indexes.get(); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. 
*/ - parser.structural_indexes[parser.n_structural_indexes++] = len; - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); -} +#endif // SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ -template<> -really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - simd::simd8x64<uint8_t> in_2(block+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1, reader.block_index()); - this->next(in_2, block_2, reader.block_index()+64); - reader.advance(); -} +TARGET_HASWELL -template<> -really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { - simd::simd8x64<uint8_t> in_1(block); - json_block block_1 = scanner.next(in_1); - this->next(in_1, block_1, reader.block_index()); - reader.advance(); -} +namespace simdjson +{ + namespace haswell + { + WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept + { + dst.reset(new (std::nothrow) dom_parser_implementation()); + if (!dst) + { + return MEMALLOC; + } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; + } + + } // namespace haswell +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/haswell/dom_parser_implementation.cpp */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ +/* haswell/dom_parser_implementation.h already included: #include "haswell/dom_parser_implementation.h" */ + // -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. +// Stage 1 // -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. 
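
Editorial aside (not part of either gem's copy of simdjson.cpp): the removed stage 1 code above turns a 64-bit mask of "structural" byte positions into an array of indexes by repeatedly taking count-trailing-zeroes and clearing the lowest set bit. A minimal scalar sketch of that flattening step follows; the names (scalar_write_indexes, trailing_zeroes64) are illustrative only, and it omits the SIMD classification and the branch-avoiding 8-at-a-time unrolling that bit_indexer::write performs.

#include <cstdint>
#include <cstdio>

// count trailing zero bits; v must be nonzero (the caller guarantees this)
static int trailing_zeroes64(uint64_t v) {
  int n = 0;
  while ((v & 1) == 0) { v >>= 1; n++; }
  return n;
}

// Write the position of every set bit in `bits`, offset by `idx` (the block's
// starting offset in the document), into `out`. Returns how many were written.
// This is the essence of what bit_indexer::write does for each 64-byte block.
static int scalar_write_indexes(uint32_t idx, uint64_t bits, uint32_t *out) {
  int count = 0;
  while (bits != 0) {
    out[count++] = idx + uint32_t(trailing_zeroes64(bits));
    bits &= bits - 1; // clear the lowest set bit (the _blsr_u64 idiom)
  }
  return count;
}

int main() {
  // Pretend the block covering document bytes 192..255 had structural
  // characters at offsets 0, 5 and 63 within the block.
  uint64_t mask = (1ULL << 0) | (1ULL << 5) | (1ULL << 63);
  uint32_t indexes[64];
  int n = scalar_write_indexes(192, mask, indexes);
  for (int i = 0; i < n; i++) {
    printf("%u\n", indexes[i]); // prints 192, 197, 255
  }
  return 0;
}
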
-template<size_t STEP_SIZE> -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } +/* begin file src/haswell/bitmask.h */ +#ifndef SIMDJSON_HASWELL_BITMASK_H +#define SIMDJSON_HASWELL_BITMASK_H - buf_block_reader<STEP_SIZE> reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); - while (reader.has_full_block()) { - indexer.step<STEP_SIZE>(reader.full_block(), reader); - } +/* begin file src/haswell/intrinsics.h */ +#ifndef SIMDJSON_HASWELL_INTRINSICS_H +#define SIMDJSON_HASWELL_INTRINSICS_H - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step<STEP_SIZE>(block, reader); - } +#ifdef SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include <x86intrin.h> +#include <intrin.h> // visual studio or clang +#else +#include <x86intrin.h> // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO - return indexer.finish(parser, reader.block_index(), len, streaming); +#ifdef SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdjson, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * <x86intrin.h> (or <intrin.h>) before, so the headers + * are fooled. + */ +#include <bmiintrin.h> // for _blsr_u64 +#include <lzcntintrin.h> // for __lzcnt64 +#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64) +#include <smmintrin.h> +#include <tmmintrin.h> +#include <avxintrin.h> +#include <avx2intrin.h> +#include <wmmintrin.h> // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. 
+#ifndef _blsr_u64 +// we roll our own +TARGET_HASWELL +static really_inline uint64_t simdjson_blsr_u64(uint64_t n) +{ + return (n - 1) & n; } +UNTARGET_REGION +#define _blsr_u64(a) (simdjson_blsr_u64((a))) +#endif // _blsr_u64 +#endif -} // namespace stage1 -/* end file src/generic/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); -} +#endif // SIMDJSON_HASWELL_INTRINSICS_H +/* end file src/haswell/intrinsics.h */ -} // namespace simdjson::westmere -UNTARGET_REGION +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { -#endif // SIMDJSON_WESTMERE_STAGE1_FIND_MARKS_H -/* end file src/generic/json_structural_indexer.h */ -#endif -/* end file src/generic/json_structural_indexer.h */ -/* begin file src/stage2_build_tape.cpp */ -#include <cassert> -#include <cstring> -/* begin file src/jsoncharutils.h */ -#ifndef SIMDJSON_JSONCHARUTILS_H -#define SIMDJSON_JSONCHARUTILS_H + // + // Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. + // + // For example, prefix_xor(00100100) == 00011100 + // + really_inline uint64_t prefix_xor(const uint64_t bitmask) + { + // There should be no such thing with a processor supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); + } + } // namespace haswell -namespace simdjson { -// structural chars here are -// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) -// we are also interested in the four whitespace characters -// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d +} // namespace simdjson +UNTARGET_REGION -// these are the chars that can follow a true/false/null or number atom -// and nothing else -const uint32_t structural_or_whitespace_or_null_negated[256] = { - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, +#endif // SIMDJSON_HASWELL_BITMASK_H +/* end file src/haswell/intrinsics.h */ +/* begin file src/haswell/simd.h */ +#ifndef SIMDJSON_HASWELL_SIMD_H +#define SIMDJSON_HASWELL_SIMD_H - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, +/* simdprune_tables.h already included: #include "simdprune_tables.h" */ +/* begin file src/haswell/bitmanipulation.h */ +#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H +#define SIMDJSON_HASWELL_BITMANIPULATION_H - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { -// return non-zero if not a structural or whitespace char -// zero otherwise -really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) { - return 
structural_or_whitespace_or_null_negated[c]; -} + // We sometimes call trailing_zero on inputs that are zero, + // but the algorithms do not end up using the returned value. + // Sadly, sanitizers are not smart enough to figure it out. + NO_SANITIZE_UNDEFINED + really_inline int trailing_zeroes(uint64_t input_num) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + } -const uint32_t structural_or_whitespace_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + /* result might be undefined when input_num is zero */ + really_inline uint64_t clear_lowest_bit(uint64_t input_num) + { + return _blsr_u64(input_num); + } - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + /* result might be undefined when input_num is zero */ + really_inline int leading_zeroes(uint64_t input_num) + { + return int(_lzcnt_u64(input_num)); + } - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + really_inline unsigned __int64 count_ones(uint64_t input_num) + { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores + } +#else + really_inline long long int count_ones(uint64_t input_num) + { + return _popcnt64(input_num); + } +#endif - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast<unsigned __int64 *>(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + (unsigned long long *)result); +#endif + } -// return non-zero if not a structural or whitespace char -// zero otherwise -really_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { - return structural_or_whitespace_negated[c]; -} +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#pragma intrinsic(_umul128) +#endif + really_inline bool mul_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint64_t high; + *result = _umul128(value1, value2, &high); + return high; +#else + return __builtin_umulll_overflow(value1, value2, + (unsigned long long *)result); +#endif + } -const uint32_t structural_or_whitespace_or_null[256] = { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + } // namespace haswell +} // namespace simdjson +UNTARGET_REGION -really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) { - return structural_or_whitespace_or_null[c]; -} +#endif // SIMDJSON_HASWELL_BITMANIPULATION_H +/* end file src/haswell/bitmanipulation.h */ +/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ -const uint32_t structural_or_whitespace[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { + namespace simd + { -really_inline uint32_t is_structural_or_whitespace(uint8_t c) { - return structural_or_whitespace[c]; -} + // Forward-declared so they can be used by splat and friends. 
+ template <typename Child> + struct base + { + __m256i value; -const uint32_t digit_to_val32[886] = { - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, - 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, - 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, - 0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, - 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, - 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, - 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, - 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, - 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, - 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, - 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, - 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, - 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; -// returns a value with the high 16 bits set if not valid -// otherwise returns the conversion of the 4 hex digits at 
src into the bottom -// 16 bits of the 32-bit return register -// -// see -// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ -static inline uint32_t hex_to_u32_nocheck( - const uint8_t *src) { // strictly speaking, static inline is a C-ism - uint32_t v1 = digit_to_val32[630 + src[0]]; - uint32_t v2 = digit_to_val32[420 + src[1]]; - uint32_t v3 = digit_to_val32[210 + src[2]]; - uint32_t v4 = digit_to_val32[0 + src[3]]; - return v1 | v2 | v3 | v4; -} + // Zero constructor + really_inline base() : value{__m256i()} {} -// returns true if the provided byte value is a -// "continuing" UTF-8 value, that is, if it starts with -// 0b10... -static inline bool is_utf8_continuing(char c) { - // in 2 complement's notation, values start at 0b10000 (-128)... and - // go up to 0b11111 (-1)... so we want all values from -128 to -65 (which is 0b10111111) - return ((signed char)c) <= -65; -} + // Conversion from SIMD register + really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + really_inline operator const __m256i &() const { return this->value; } + really_inline operator __m256i &() { return this->value; } + // Bit operations + really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + really_inline Child &operator|=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast | other; + return *this_cast; + } + really_inline Child &operator&=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast & other; + return *this_cast; + } + really_inline Child &operator^=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast ^ other; + return *this_cast; + } + }; -// given a code point cp, writes to c -// the utf-8 code, outputting the length in -// bytes, if the length is zero, the code point -// is invalid -// -// This can possibly be made faster using pdep -// and clz and table lookups, but JSON documents -// have few escaped code points, and the following -// function looks cheap. -// -// Note: we assume that surrogates are treated separately -// -inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { - if (cp <= 0x7F) { - c[0] = cp; - return 1; // ascii - } - if (cp <= 0x7FF) { - c[0] = (cp >> 6) + 192; - c[1] = (cp & 63) + 128; - return 2; // universal plane - // Surrogates are treated elsewhere... - //} //else if (0xd800 <= cp && cp <= 0xdfff) { - // return 0; // surrogates // could put assert here - } else if (cp <= 0xFFFF) { - c[0] = (cp >> 12) + 224; - c[1] = ((cp >> 6) & 63) + 128; - c[2] = (cp & 63) + 128; - return 3; - } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this - // is not needed - c[0] = (cp >> 18) + 240; - c[1] = ((cp >> 12) & 63) + 128; - c[2] = ((cp >> 6) & 63) + 128; - c[3] = (cp & 63) + 128; - return 4; - } - // will return 0 when the code point was too large. - return 0; // bad r -} + // Forward-declared so they can be used by splat and friends. 
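// Worked illustration of the two removed 0.2.0 helpers above (a sketch, not code from
// either version of the file; it assumes the surrounding simdjson namespace and the
// helpers above are in scope): decoding the escape sequence \u20AC.
static inline size_t decode_u_escape_example(uint8_t *utf8_out) {
  const uint8_t hex[4] = {'2', '0', 'A', 'C'};
  // Four staggered digit_to_val32 lookups; a non-hex digit would set the high 16 bits.
  uint32_t cp = simdjson::hex_to_u32_nocheck(hex);   // 0x2000 | 0x000 | 0xA0 | 0xC == 0x20AC
  // Expands the code point to three UTF-8 bytes, 0xE2 0x82 0xAC; returns 0 if invalid.
  return simdjson::codepoint_to_utf8(cp, utf8_out);  // == 3
}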
+ template <typename T> + struct simd8; + template <typename T, typename Mask = simd8<bool>> + struct base8 : base<simd8<T>> + { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; -//// -// The following code is used in number parsing. It is not -// properly "char utils" stuff, but we move it here so that -// it does not get copied multiple times in the binaries (once -// per instructin set). -/// + really_inline base8() : base<simd8<T>>() {} + really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {} + really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); } -constexpr int FASTFLOAT_SMALLEST_POWER = -325; -constexpr int FASTFLOAT_LARGEST_POWER = 308; + static const int SIZE = sizeof(base<T>::value); -struct value128 { - uint64_t low; - uint64_t high; -}; + template <int N = 1> + really_inline simd8<T> prev(const simd8<T> prev_chunk) const + { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; -really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { - value128 answer; -#ifdef _MSC_VER - // todo: this might fail under visual studio for ARM - answer.low = _umul128(value1, value2, &answer.high); -#else - __uint128_t r = ((__uint128_t)value1) * value2; - answer.low = r; - answer.high = r >> 64; -#endif - return answer; -} + // SIMD byte mask type (returned by things like eq and gt) + template <> + struct simd8<bool> : base8<bool> + { + static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } -// Precomputed powers of ten from 10^0 to 10^22. These -// can be represented exactly using the double type. -static const double power_of_ten[] = { - 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, - 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + really_inline simd8<bool>() : base8() {} + really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {} + // Splat constructor + really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} -// the mantissas of powers of ten from -308 to 308, extended out to sixty four -// bits -// This struct will likely get padded to 16 bytes. -typedef struct { - uint64_t mantissa; - int32_t exp; -} components; + really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } + really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + really_inline simd8<bool> operator~() const { return *this ^ true; } + }; -// The array power_of_ten_components contain the powers of ten approximated -// as a 64-bit mantissa, with an exponent part. It goes from 10^ -// FASTFLOAT_SMALLEST_POWER to -// 10^FASTFLOAT_LARGEST_POWER (inclusively). The mantissa is truncated, and -// never rounded up. -// Uses about 10KB. 
-static const components power_of_ten_components[] = { - {0xa5ced43b7e3e9188L, 7}, {0xcf42894a5dce35eaL, 10}, - {0x818995ce7aa0e1b2L, 14}, {0xa1ebfb4219491a1fL, 17}, - {0xca66fa129f9b60a6L, 20}, {0xfd00b897478238d0L, 23}, - {0x9e20735e8cb16382L, 27}, {0xc5a890362fddbc62L, 30}, - {0xf712b443bbd52b7bL, 33}, {0x9a6bb0aa55653b2dL, 37}, - {0xc1069cd4eabe89f8L, 40}, {0xf148440a256e2c76L, 43}, - {0x96cd2a865764dbcaL, 47}, {0xbc807527ed3e12bcL, 50}, - {0xeba09271e88d976bL, 53}, {0x93445b8731587ea3L, 57}, - {0xb8157268fdae9e4cL, 60}, {0xe61acf033d1a45dfL, 63}, - {0x8fd0c16206306babL, 67}, {0xb3c4f1ba87bc8696L, 70}, - {0xe0b62e2929aba83cL, 73}, {0x8c71dcd9ba0b4925L, 77}, - {0xaf8e5410288e1b6fL, 80}, {0xdb71e91432b1a24aL, 83}, - {0x892731ac9faf056eL, 87}, {0xab70fe17c79ac6caL, 90}, - {0xd64d3d9db981787dL, 93}, {0x85f0468293f0eb4eL, 97}, - {0xa76c582338ed2621L, 100}, {0xd1476e2c07286faaL, 103}, - {0x82cca4db847945caL, 107}, {0xa37fce126597973cL, 110}, - {0xcc5fc196fefd7d0cL, 113}, {0xff77b1fcbebcdc4fL, 116}, - {0x9faacf3df73609b1L, 120}, {0xc795830d75038c1dL, 123}, - {0xf97ae3d0d2446f25L, 126}, {0x9becce62836ac577L, 130}, - {0xc2e801fb244576d5L, 133}, {0xf3a20279ed56d48aL, 136}, - {0x9845418c345644d6L, 140}, {0xbe5691ef416bd60cL, 143}, - {0xedec366b11c6cb8fL, 146}, {0x94b3a202eb1c3f39L, 150}, - {0xb9e08a83a5e34f07L, 153}, {0xe858ad248f5c22c9L, 156}, - {0x91376c36d99995beL, 160}, {0xb58547448ffffb2dL, 163}, - {0xe2e69915b3fff9f9L, 166}, {0x8dd01fad907ffc3bL, 170}, - {0xb1442798f49ffb4aL, 173}, {0xdd95317f31c7fa1dL, 176}, - {0x8a7d3eef7f1cfc52L, 180}, {0xad1c8eab5ee43b66L, 183}, - {0xd863b256369d4a40L, 186}, {0x873e4f75e2224e68L, 190}, - {0xa90de3535aaae202L, 193}, {0xd3515c2831559a83L, 196}, - {0x8412d9991ed58091L, 200}, {0xa5178fff668ae0b6L, 203}, - {0xce5d73ff402d98e3L, 206}, {0x80fa687f881c7f8eL, 210}, - {0xa139029f6a239f72L, 213}, {0xc987434744ac874eL, 216}, - {0xfbe9141915d7a922L, 219}, {0x9d71ac8fada6c9b5L, 223}, - {0xc4ce17b399107c22L, 226}, {0xf6019da07f549b2bL, 229}, - {0x99c102844f94e0fbL, 233}, {0xc0314325637a1939L, 236}, - {0xf03d93eebc589f88L, 239}, {0x96267c7535b763b5L, 243}, - {0xbbb01b9283253ca2L, 246}, {0xea9c227723ee8bcbL, 249}, - {0x92a1958a7675175fL, 253}, {0xb749faed14125d36L, 256}, - {0xe51c79a85916f484L, 259}, {0x8f31cc0937ae58d2L, 263}, - {0xb2fe3f0b8599ef07L, 266}, {0xdfbdcece67006ac9L, 269}, - {0x8bd6a141006042bdL, 273}, {0xaecc49914078536dL, 276}, - {0xda7f5bf590966848L, 279}, {0x888f99797a5e012dL, 283}, - {0xaab37fd7d8f58178L, 286}, {0xd5605fcdcf32e1d6L, 289}, - {0x855c3be0a17fcd26L, 293}, {0xa6b34ad8c9dfc06fL, 296}, - {0xd0601d8efc57b08bL, 299}, {0x823c12795db6ce57L, 303}, - {0xa2cb1717b52481edL, 306}, {0xcb7ddcdda26da268L, 309}, - {0xfe5d54150b090b02L, 312}, {0x9efa548d26e5a6e1L, 316}, - {0xc6b8e9b0709f109aL, 319}, {0xf867241c8cc6d4c0L, 322}, - {0x9b407691d7fc44f8L, 326}, {0xc21094364dfb5636L, 329}, - {0xf294b943e17a2bc4L, 332}, {0x979cf3ca6cec5b5aL, 336}, - {0xbd8430bd08277231L, 339}, {0xece53cec4a314ebdL, 342}, - {0x940f4613ae5ed136L, 346}, {0xb913179899f68584L, 349}, - {0xe757dd7ec07426e5L, 352}, {0x9096ea6f3848984fL, 356}, - {0xb4bca50b065abe63L, 359}, {0xe1ebce4dc7f16dfbL, 362}, - {0x8d3360f09cf6e4bdL, 366}, {0xb080392cc4349decL, 369}, - {0xdca04777f541c567L, 372}, {0x89e42caaf9491b60L, 376}, - {0xac5d37d5b79b6239L, 379}, {0xd77485cb25823ac7L, 382}, - {0x86a8d39ef77164bcL, 386}, {0xa8530886b54dbdebL, 389}, - {0xd267caa862a12d66L, 392}, {0x8380dea93da4bc60L, 396}, - {0xa46116538d0deb78L, 399}, {0xcd795be870516656L, 402}, - {0x806bd9714632dff6L, 406}, 
{0xa086cfcd97bf97f3L, 409}, - {0xc8a883c0fdaf7df0L, 412}, {0xfad2a4b13d1b5d6cL, 415}, - {0x9cc3a6eec6311a63L, 419}, {0xc3f490aa77bd60fcL, 422}, - {0xf4f1b4d515acb93bL, 425}, {0x991711052d8bf3c5L, 429}, - {0xbf5cd54678eef0b6L, 432}, {0xef340a98172aace4L, 435}, - {0x9580869f0e7aac0eL, 439}, {0xbae0a846d2195712L, 442}, - {0xe998d258869facd7L, 445}, {0x91ff83775423cc06L, 449}, - {0xb67f6455292cbf08L, 452}, {0xe41f3d6a7377eecaL, 455}, - {0x8e938662882af53eL, 459}, {0xb23867fb2a35b28dL, 462}, - {0xdec681f9f4c31f31L, 465}, {0x8b3c113c38f9f37eL, 469}, - {0xae0b158b4738705eL, 472}, {0xd98ddaee19068c76L, 475}, - {0x87f8a8d4cfa417c9L, 479}, {0xa9f6d30a038d1dbcL, 482}, - {0xd47487cc8470652bL, 485}, {0x84c8d4dfd2c63f3bL, 489}, - {0xa5fb0a17c777cf09L, 492}, {0xcf79cc9db955c2ccL, 495}, - {0x81ac1fe293d599bfL, 499}, {0xa21727db38cb002fL, 502}, - {0xca9cf1d206fdc03bL, 505}, {0xfd442e4688bd304aL, 508}, - {0x9e4a9cec15763e2eL, 512}, {0xc5dd44271ad3cdbaL, 515}, - {0xf7549530e188c128L, 518}, {0x9a94dd3e8cf578b9L, 522}, - {0xc13a148e3032d6e7L, 525}, {0xf18899b1bc3f8ca1L, 528}, - {0x96f5600f15a7b7e5L, 532}, {0xbcb2b812db11a5deL, 535}, - {0xebdf661791d60f56L, 538}, {0x936b9fcebb25c995L, 542}, - {0xb84687c269ef3bfbL, 545}, {0xe65829b3046b0afaL, 548}, - {0x8ff71a0fe2c2e6dcL, 552}, {0xb3f4e093db73a093L, 555}, - {0xe0f218b8d25088b8L, 558}, {0x8c974f7383725573L, 562}, - {0xafbd2350644eeacfL, 565}, {0xdbac6c247d62a583L, 568}, - {0x894bc396ce5da772L, 572}, {0xab9eb47c81f5114fL, 575}, - {0xd686619ba27255a2L, 578}, {0x8613fd0145877585L, 582}, - {0xa798fc4196e952e7L, 585}, {0xd17f3b51fca3a7a0L, 588}, - {0x82ef85133de648c4L, 592}, {0xa3ab66580d5fdaf5L, 595}, - {0xcc963fee10b7d1b3L, 598}, {0xffbbcfe994e5c61fL, 601}, - {0x9fd561f1fd0f9bd3L, 605}, {0xc7caba6e7c5382c8L, 608}, - {0xf9bd690a1b68637bL, 611}, {0x9c1661a651213e2dL, 615}, - {0xc31bfa0fe5698db8L, 618}, {0xf3e2f893dec3f126L, 621}, - {0x986ddb5c6b3a76b7L, 625}, {0xbe89523386091465L, 628}, - {0xee2ba6c0678b597fL, 631}, {0x94db483840b717efL, 635}, - {0xba121a4650e4ddebL, 638}, {0xe896a0d7e51e1566L, 641}, - {0x915e2486ef32cd60L, 645}, {0xb5b5ada8aaff80b8L, 648}, - {0xe3231912d5bf60e6L, 651}, {0x8df5efabc5979c8fL, 655}, - {0xb1736b96b6fd83b3L, 658}, {0xddd0467c64bce4a0L, 661}, - {0x8aa22c0dbef60ee4L, 665}, {0xad4ab7112eb3929dL, 668}, - {0xd89d64d57a607744L, 671}, {0x87625f056c7c4a8bL, 675}, - {0xa93af6c6c79b5d2dL, 678}, {0xd389b47879823479L, 681}, - {0x843610cb4bf160cbL, 685}, {0xa54394fe1eedb8feL, 688}, - {0xce947a3da6a9273eL, 691}, {0x811ccc668829b887L, 695}, - {0xa163ff802a3426a8L, 698}, {0xc9bcff6034c13052L, 701}, - {0xfc2c3f3841f17c67L, 704}, {0x9d9ba7832936edc0L, 708}, - {0xc5029163f384a931L, 711}, {0xf64335bcf065d37dL, 714}, - {0x99ea0196163fa42eL, 718}, {0xc06481fb9bcf8d39L, 721}, - {0xf07da27a82c37088L, 724}, {0x964e858c91ba2655L, 728}, - {0xbbe226efb628afeaL, 731}, {0xeadab0aba3b2dbe5L, 734}, - {0x92c8ae6b464fc96fL, 738}, {0xb77ada0617e3bbcbL, 741}, - {0xe55990879ddcaabdL, 744}, {0x8f57fa54c2a9eab6L, 748}, - {0xb32df8e9f3546564L, 751}, {0xdff9772470297ebdL, 754}, - {0x8bfbea76c619ef36L, 758}, {0xaefae51477a06b03L, 761}, - {0xdab99e59958885c4L, 764}, {0x88b402f7fd75539bL, 768}, - {0xaae103b5fcd2a881L, 771}, {0xd59944a37c0752a2L, 774}, - {0x857fcae62d8493a5L, 778}, {0xa6dfbd9fb8e5b88eL, 781}, - {0xd097ad07a71f26b2L, 784}, {0x825ecc24c873782fL, 788}, - {0xa2f67f2dfa90563bL, 791}, {0xcbb41ef979346bcaL, 794}, - {0xfea126b7d78186bcL, 797}, {0x9f24b832e6b0f436L, 801}, - {0xc6ede63fa05d3143L, 804}, {0xf8a95fcf88747d94L, 807}, - {0x9b69dbe1b548ce7cL, 811}, 
{0xc24452da229b021bL, 814}, - {0xf2d56790ab41c2a2L, 817}, {0x97c560ba6b0919a5L, 821}, - {0xbdb6b8e905cb600fL, 824}, {0xed246723473e3813L, 827}, - {0x9436c0760c86e30bL, 831}, {0xb94470938fa89bceL, 834}, - {0xe7958cb87392c2c2L, 837}, {0x90bd77f3483bb9b9L, 841}, - {0xb4ecd5f01a4aa828L, 844}, {0xe2280b6c20dd5232L, 847}, - {0x8d590723948a535fL, 851}, {0xb0af48ec79ace837L, 854}, - {0xdcdb1b2798182244L, 857}, {0x8a08f0f8bf0f156bL, 861}, - {0xac8b2d36eed2dac5L, 864}, {0xd7adf884aa879177L, 867}, - {0x86ccbb52ea94baeaL, 871}, {0xa87fea27a539e9a5L, 874}, - {0xd29fe4b18e88640eL, 877}, {0x83a3eeeef9153e89L, 881}, - {0xa48ceaaab75a8e2bL, 884}, {0xcdb02555653131b6L, 887}, - {0x808e17555f3ebf11L, 891}, {0xa0b19d2ab70e6ed6L, 894}, - {0xc8de047564d20a8bL, 897}, {0xfb158592be068d2eL, 900}, - {0x9ced737bb6c4183dL, 904}, {0xc428d05aa4751e4cL, 907}, - {0xf53304714d9265dfL, 910}, {0x993fe2c6d07b7fabL, 914}, - {0xbf8fdb78849a5f96L, 917}, {0xef73d256a5c0f77cL, 920}, - {0x95a8637627989aadL, 924}, {0xbb127c53b17ec159L, 927}, - {0xe9d71b689dde71afL, 930}, {0x9226712162ab070dL, 934}, - {0xb6b00d69bb55c8d1L, 937}, {0xe45c10c42a2b3b05L, 940}, - {0x8eb98a7a9a5b04e3L, 944}, {0xb267ed1940f1c61cL, 947}, - {0xdf01e85f912e37a3L, 950}, {0x8b61313bbabce2c6L, 954}, - {0xae397d8aa96c1b77L, 957}, {0xd9c7dced53c72255L, 960}, - {0x881cea14545c7575L, 964}, {0xaa242499697392d2L, 967}, - {0xd4ad2dbfc3d07787L, 970}, {0x84ec3c97da624ab4L, 974}, - {0xa6274bbdd0fadd61L, 977}, {0xcfb11ead453994baL, 980}, - {0x81ceb32c4b43fcf4L, 984}, {0xa2425ff75e14fc31L, 987}, - {0xcad2f7f5359a3b3eL, 990}, {0xfd87b5f28300ca0dL, 993}, - {0x9e74d1b791e07e48L, 997}, {0xc612062576589ddaL, 1000}, - {0xf79687aed3eec551L, 1003}, {0x9abe14cd44753b52L, 1007}, - {0xc16d9a0095928a27L, 1010}, {0xf1c90080baf72cb1L, 1013}, - {0x971da05074da7beeL, 1017}, {0xbce5086492111aeaL, 1020}, - {0xec1e4a7db69561a5L, 1023}, {0x9392ee8e921d5d07L, 1027}, - {0xb877aa3236a4b449L, 1030}, {0xe69594bec44de15bL, 1033}, - {0x901d7cf73ab0acd9L, 1037}, {0xb424dc35095cd80fL, 1040}, - {0xe12e13424bb40e13L, 1043}, {0x8cbccc096f5088cbL, 1047}, - {0xafebff0bcb24aafeL, 1050}, {0xdbe6fecebdedd5beL, 1053}, - {0x89705f4136b4a597L, 1057}, {0xabcc77118461cefcL, 1060}, - {0xd6bf94d5e57a42bcL, 1063}, {0x8637bd05af6c69b5L, 1067}, - {0xa7c5ac471b478423L, 1070}, {0xd1b71758e219652bL, 1073}, - {0x83126e978d4fdf3bL, 1077}, {0xa3d70a3d70a3d70aL, 1080}, - {0xccccccccccccccccL, 1083}, {0x8000000000000000L, 1087}, - {0xa000000000000000L, 1090}, {0xc800000000000000L, 1093}, - {0xfa00000000000000L, 1096}, {0x9c40000000000000L, 1100}, - {0xc350000000000000L, 1103}, {0xf424000000000000L, 1106}, - {0x9896800000000000L, 1110}, {0xbebc200000000000L, 1113}, - {0xee6b280000000000L, 1116}, {0x9502f90000000000L, 1120}, - {0xba43b74000000000L, 1123}, {0xe8d4a51000000000L, 1126}, - {0x9184e72a00000000L, 1130}, {0xb5e620f480000000L, 1133}, - {0xe35fa931a0000000L, 1136}, {0x8e1bc9bf04000000L, 1140}, - {0xb1a2bc2ec5000000L, 1143}, {0xde0b6b3a76400000L, 1146}, - {0x8ac7230489e80000L, 1150}, {0xad78ebc5ac620000L, 1153}, - {0xd8d726b7177a8000L, 1156}, {0x878678326eac9000L, 1160}, - {0xa968163f0a57b400L, 1163}, {0xd3c21bcecceda100L, 1166}, - {0x84595161401484a0L, 1170}, {0xa56fa5b99019a5c8L, 1173}, - {0xcecb8f27f4200f3aL, 1176}, {0x813f3978f8940984L, 1180}, - {0xa18f07d736b90be5L, 1183}, {0xc9f2c9cd04674edeL, 1186}, - {0xfc6f7c4045812296L, 1189}, {0x9dc5ada82b70b59dL, 1193}, - {0xc5371912364ce305L, 1196}, {0xf684df56c3e01bc6L, 1199}, - {0x9a130b963a6c115cL, 1203}, {0xc097ce7bc90715b3L, 1206}, - {0xf0bdc21abb48db20L, 1209}, 
{0x96769950b50d88f4L, 1213}, - {0xbc143fa4e250eb31L, 1216}, {0xeb194f8e1ae525fdL, 1219}, - {0x92efd1b8d0cf37beL, 1223}, {0xb7abc627050305adL, 1226}, - {0xe596b7b0c643c719L, 1229}, {0x8f7e32ce7bea5c6fL, 1233}, - {0xb35dbf821ae4f38bL, 1236}, {0xe0352f62a19e306eL, 1239}, - {0x8c213d9da502de45L, 1243}, {0xaf298d050e4395d6L, 1246}, - {0xdaf3f04651d47b4cL, 1249}, {0x88d8762bf324cd0fL, 1253}, - {0xab0e93b6efee0053L, 1256}, {0xd5d238a4abe98068L, 1259}, - {0x85a36366eb71f041L, 1263}, {0xa70c3c40a64e6c51L, 1266}, - {0xd0cf4b50cfe20765L, 1269}, {0x82818f1281ed449fL, 1273}, - {0xa321f2d7226895c7L, 1276}, {0xcbea6f8ceb02bb39L, 1279}, - {0xfee50b7025c36a08L, 1282}, {0x9f4f2726179a2245L, 1286}, - {0xc722f0ef9d80aad6L, 1289}, {0xf8ebad2b84e0d58bL, 1292}, - {0x9b934c3b330c8577L, 1296}, {0xc2781f49ffcfa6d5L, 1299}, - {0xf316271c7fc3908aL, 1302}, {0x97edd871cfda3a56L, 1306}, - {0xbde94e8e43d0c8ecL, 1309}, {0xed63a231d4c4fb27L, 1312}, - {0x945e455f24fb1cf8L, 1316}, {0xb975d6b6ee39e436L, 1319}, - {0xe7d34c64a9c85d44L, 1322}, {0x90e40fbeea1d3a4aL, 1326}, - {0xb51d13aea4a488ddL, 1329}, {0xe264589a4dcdab14L, 1332}, - {0x8d7eb76070a08aecL, 1336}, {0xb0de65388cc8ada8L, 1339}, - {0xdd15fe86affad912L, 1342}, {0x8a2dbf142dfcc7abL, 1346}, - {0xacb92ed9397bf996L, 1349}, {0xd7e77a8f87daf7fbL, 1352}, - {0x86f0ac99b4e8dafdL, 1356}, {0xa8acd7c0222311bcL, 1359}, - {0xd2d80db02aabd62bL, 1362}, {0x83c7088e1aab65dbL, 1366}, - {0xa4b8cab1a1563f52L, 1369}, {0xcde6fd5e09abcf26L, 1372}, - {0x80b05e5ac60b6178L, 1376}, {0xa0dc75f1778e39d6L, 1379}, - {0xc913936dd571c84cL, 1382}, {0xfb5878494ace3a5fL, 1385}, - {0x9d174b2dcec0e47bL, 1389}, {0xc45d1df942711d9aL, 1392}, - {0xf5746577930d6500L, 1395}, {0x9968bf6abbe85f20L, 1399}, - {0xbfc2ef456ae276e8L, 1402}, {0xefb3ab16c59b14a2L, 1405}, - {0x95d04aee3b80ece5L, 1409}, {0xbb445da9ca61281fL, 1412}, - {0xea1575143cf97226L, 1415}, {0x924d692ca61be758L, 1419}, - {0xb6e0c377cfa2e12eL, 1422}, {0xe498f455c38b997aL, 1425}, - {0x8edf98b59a373fecL, 1429}, {0xb2977ee300c50fe7L, 1432}, - {0xdf3d5e9bc0f653e1L, 1435}, {0x8b865b215899f46cL, 1439}, - {0xae67f1e9aec07187L, 1442}, {0xda01ee641a708de9L, 1445}, - {0x884134fe908658b2L, 1449}, {0xaa51823e34a7eedeL, 1452}, - {0xd4e5e2cdc1d1ea96L, 1455}, {0x850fadc09923329eL, 1459}, - {0xa6539930bf6bff45L, 1462}, {0xcfe87f7cef46ff16L, 1465}, - {0x81f14fae158c5f6eL, 1469}, {0xa26da3999aef7749L, 1472}, - {0xcb090c8001ab551cL, 1475}, {0xfdcb4fa002162a63L, 1478}, - {0x9e9f11c4014dda7eL, 1482}, {0xc646d63501a1511dL, 1485}, - {0xf7d88bc24209a565L, 1488}, {0x9ae757596946075fL, 1492}, - {0xc1a12d2fc3978937L, 1495}, {0xf209787bb47d6b84L, 1498}, - {0x9745eb4d50ce6332L, 1502}, {0xbd176620a501fbffL, 1505}, - {0xec5d3fa8ce427affL, 1508}, {0x93ba47c980e98cdfL, 1512}, - {0xb8a8d9bbe123f017L, 1515}, {0xe6d3102ad96cec1dL, 1518}, - {0x9043ea1ac7e41392L, 1522}, {0xb454e4a179dd1877L, 1525}, - {0xe16a1dc9d8545e94L, 1528}, {0x8ce2529e2734bb1dL, 1532}, - {0xb01ae745b101e9e4L, 1535}, {0xdc21a1171d42645dL, 1538}, - {0x899504ae72497ebaL, 1542}, {0xabfa45da0edbde69L, 1545}, - {0xd6f8d7509292d603L, 1548}, {0x865b86925b9bc5c2L, 1552}, - {0xa7f26836f282b732L, 1555}, {0xd1ef0244af2364ffL, 1558}, - {0x8335616aed761f1fL, 1562}, {0xa402b9c5a8d3a6e7L, 1565}, - {0xcd036837130890a1L, 1568}, {0x802221226be55a64L, 1572}, - {0xa02aa96b06deb0fdL, 1575}, {0xc83553c5c8965d3dL, 1578}, - {0xfa42a8b73abbf48cL, 1581}, {0x9c69a97284b578d7L, 1585}, - {0xc38413cf25e2d70dL, 1588}, {0xf46518c2ef5b8cd1L, 1591}, - {0x98bf2f79d5993802L, 1595}, {0xbeeefb584aff8603L, 1598}, - {0xeeaaba2e5dbf6784L, 1601}, 
{0x952ab45cfa97a0b2L, 1605}, - {0xba756174393d88dfL, 1608}, {0xe912b9d1478ceb17L, 1611}, - {0x91abb422ccb812eeL, 1615}, {0xb616a12b7fe617aaL, 1618}, - {0xe39c49765fdf9d94L, 1621}, {0x8e41ade9fbebc27dL, 1625}, - {0xb1d219647ae6b31cL, 1628}, {0xde469fbd99a05fe3L, 1631}, - {0x8aec23d680043beeL, 1635}, {0xada72ccc20054ae9L, 1638}, - {0xd910f7ff28069da4L, 1641}, {0x87aa9aff79042286L, 1645}, - {0xa99541bf57452b28L, 1648}, {0xd3fa922f2d1675f2L, 1651}, - {0x847c9b5d7c2e09b7L, 1655}, {0xa59bc234db398c25L, 1658}, - {0xcf02b2c21207ef2eL, 1661}, {0x8161afb94b44f57dL, 1665}, - {0xa1ba1ba79e1632dcL, 1668}, {0xca28a291859bbf93L, 1671}, - {0xfcb2cb35e702af78L, 1674}, {0x9defbf01b061adabL, 1678}, - {0xc56baec21c7a1916L, 1681}, {0xf6c69a72a3989f5bL, 1684}, - {0x9a3c2087a63f6399L, 1688}, {0xc0cb28a98fcf3c7fL, 1691}, - {0xf0fdf2d3f3c30b9fL, 1694}, {0x969eb7c47859e743L, 1698}, - {0xbc4665b596706114L, 1701}, {0xeb57ff22fc0c7959L, 1704}, - {0x9316ff75dd87cbd8L, 1708}, {0xb7dcbf5354e9beceL, 1711}, - {0xe5d3ef282a242e81L, 1714}, {0x8fa475791a569d10L, 1718}, - {0xb38d92d760ec4455L, 1721}, {0xe070f78d3927556aL, 1724}, - {0x8c469ab843b89562L, 1728}, {0xaf58416654a6babbL, 1731}, - {0xdb2e51bfe9d0696aL, 1734}, {0x88fcf317f22241e2L, 1738}, - {0xab3c2fddeeaad25aL, 1741}, {0xd60b3bd56a5586f1L, 1744}, - {0x85c7056562757456L, 1748}, {0xa738c6bebb12d16cL, 1751}, - {0xd106f86e69d785c7L, 1754}, {0x82a45b450226b39cL, 1758}, - {0xa34d721642b06084L, 1761}, {0xcc20ce9bd35c78a5L, 1764}, - {0xff290242c83396ceL, 1767}, {0x9f79a169bd203e41L, 1771}, - {0xc75809c42c684dd1L, 1774}, {0xf92e0c3537826145L, 1777}, - {0x9bbcc7a142b17ccbL, 1781}, {0xc2abf989935ddbfeL, 1784}, - {0xf356f7ebf83552feL, 1787}, {0x98165af37b2153deL, 1791}, - {0xbe1bf1b059e9a8d6L, 1794}, {0xeda2ee1c7064130cL, 1797}, - {0x9485d4d1c63e8be7L, 1801}, {0xb9a74a0637ce2ee1L, 1804}, - {0xe8111c87c5c1ba99L, 1807}, {0x910ab1d4db9914a0L, 1811}, - {0xb54d5e4a127f59c8L, 1814}, {0xe2a0b5dc971f303aL, 1817}, - {0x8da471a9de737e24L, 1821}, {0xb10d8e1456105dadL, 1824}, - {0xdd50f1996b947518L, 1827}, {0x8a5296ffe33cc92fL, 1831}, - {0xace73cbfdc0bfb7bL, 1834}, {0xd8210befd30efa5aL, 1837}, - {0x8714a775e3e95c78L, 1841}, {0xa8d9d1535ce3b396L, 1844}, - {0xd31045a8341ca07cL, 1847}, {0x83ea2b892091e44dL, 1851}, - {0xa4e4b66b68b65d60L, 1854}, {0xce1de40642e3f4b9L, 1857}, - {0x80d2ae83e9ce78f3L, 1861}, {0xa1075a24e4421730L, 1864}, - {0xc94930ae1d529cfcL, 1867}, {0xfb9b7cd9a4a7443cL, 1870}, - {0x9d412e0806e88aa5L, 1874}, {0xc491798a08a2ad4eL, 1877}, - {0xf5b5d7ec8acb58a2L, 1880}, {0x9991a6f3d6bf1765L, 1884}, - {0xbff610b0cc6edd3fL, 1887}, {0xeff394dcff8a948eL, 1890}, - {0x95f83d0a1fb69cd9L, 1894}, {0xbb764c4ca7a4440fL, 1897}, - {0xea53df5fd18d5513L, 1900}, {0x92746b9be2f8552cL, 1904}, - {0xb7118682dbb66a77L, 1907}, {0xe4d5e82392a40515L, 1910}, - {0x8f05b1163ba6832dL, 1914}, {0xb2c71d5bca9023f8L, 1917}, - {0xdf78e4b2bd342cf6L, 1920}, {0x8bab8eefb6409c1aL, 1924}, - {0xae9672aba3d0c320L, 1927}, {0xda3c0f568cc4f3e8L, 1930}, - {0x8865899617fb1871L, 1934}, {0xaa7eebfb9df9de8dL, 1937}, - {0xd51ea6fa85785631L, 1940}, {0x8533285c936b35deL, 1944}, - {0xa67ff273b8460356L, 1947}, {0xd01fef10a657842cL, 1950}, - {0x8213f56a67f6b29bL, 1954}, {0xa298f2c501f45f42L, 1957}, - {0xcb3f2f7642717713L, 1960}, {0xfe0efb53d30dd4d7L, 1963}, - {0x9ec95d1463e8a506L, 1967}, {0xc67bb4597ce2ce48L, 1970}, - {0xf81aa16fdc1b81daL, 1973}, {0x9b10a4e5e9913128L, 1977}, - {0xc1d4ce1f63f57d72L, 1980}, {0xf24a01a73cf2dccfL, 1983}, - {0x976e41088617ca01L, 1987}, {0xbd49d14aa79dbc82L, 1990}, - {0xec9c459d51852ba2L, 1993}, 
{0x93e1ab8252f33b45L, 1997}, - {0xb8da1662e7b00a17L, 2000}, {0xe7109bfba19c0c9dL, 2003}, - {0x906a617d450187e2L, 2007}, {0xb484f9dc9641e9daL, 2010}, - {0xe1a63853bbd26451L, 2013}, {0x8d07e33455637eb2L, 2017}, - {0xb049dc016abc5e5fL, 2020}, {0xdc5c5301c56b75f7L, 2023}, - {0x89b9b3e11b6329baL, 2027}, {0xac2820d9623bf429L, 2030}, - {0xd732290fbacaf133L, 2033}, {0x867f59a9d4bed6c0L, 2037}, - {0xa81f301449ee8c70L, 2040}, {0xd226fc195c6a2f8cL, 2043}, - {0x83585d8fd9c25db7L, 2047}, {0xa42e74f3d032f525L, 2050}, - {0xcd3a1230c43fb26fL, 2053}, {0x80444b5e7aa7cf85L, 2057}, - {0xa0555e361951c366L, 2060}, {0xc86ab5c39fa63440L, 2063}, - {0xfa856334878fc150L, 2066}, {0x9c935e00d4b9d8d2L, 2070}, - {0xc3b8358109e84f07L, 2073}, {0xf4a642e14c6262c8L, 2076}, - {0x98e7e9cccfbd7dbdL, 2080}, {0xbf21e44003acdd2cL, 2083}, - {0xeeea5d5004981478L, 2086}, {0x95527a5202df0ccbL, 2090}, - {0xbaa718e68396cffdL, 2093}, {0xe950df20247c83fdL, 2096}, - {0x91d28b7416cdd27eL, 2100}, {0xb6472e511c81471dL, 2103}, - {0xe3d8f9e563a198e5L, 2106}, {0x8e679c2f5e44ff8fL, 2110}}; + template <typename T> + struct base8_numeric : base8<T> + { + static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); } + static really_inline simd8<T> zero() { return _mm256_setzero_si256(); } + static really_inline simd8<T> load(const T values[32]) + { + return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static really_inline simd8<T> repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) + { + return simd8<T>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } -// A complement from power_of_ten_components -// complete to a 128-bit mantissa. 
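// The removed 0.2.0 number parser uses these tables in two tiers: the exact doubles in
// power_of_ten (10^0..10^22) for small decimal exponents, and the truncated 64-bit
// mantissa/exponent pairs above for the full 10^-325..10^308 range. A minimal sketch of
// the exact-power fast path those doubles make possible (an illustration under stated
// assumptions, not the code from either version): when |e| <= 22 and the parsed integer
// i fits in 53 bits, both factors are exactly representable, so the single rounding in
// the multiply or divide below already yields the correctly rounded double.
static inline double exact_fast_path_example(uint64_t i, int e) {
  double d = double(i);                              // exact: i < 2^53
  return e >= 0 ? d * simdjson::power_of_ten[e]      // one correctly rounded product
                : d / simdjson::power_of_ten[-e];    // one correctly rounded quotient
}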
-const uint64_t mantissa_128[] = {0x419ea3bd35385e2d, - 0x52064cac828675b9, - 0x7343efebd1940993, - 0x1014ebe6c5f90bf8, - 0xd41a26e077774ef6, - 0x8920b098955522b4, - 0x55b46e5f5d5535b0, - 0xeb2189f734aa831d, - 0xa5e9ec7501d523e4, - 0x47b233c92125366e, - 0x999ec0bb696e840a, - 0xc00670ea43ca250d, - 0x380406926a5e5728, - 0xc605083704f5ecf2, - 0xf7864a44c633682e, - 0x7ab3ee6afbe0211d, - 0x5960ea05bad82964, - 0x6fb92487298e33bd, - 0xa5d3b6d479f8e056, - 0x8f48a4899877186c, - 0x331acdabfe94de87, - 0x9ff0c08b7f1d0b14, - 0x7ecf0ae5ee44dd9, - 0xc9e82cd9f69d6150, - 0xbe311c083a225cd2, - 0x6dbd630a48aaf406, - 0x92cbbccdad5b108, - 0x25bbf56008c58ea5, - 0xaf2af2b80af6f24e, - 0x1af5af660db4aee1, - 0x50d98d9fc890ed4d, - 0xe50ff107bab528a0, - 0x1e53ed49a96272c8, - 0x25e8e89c13bb0f7a, - 0x77b191618c54e9ac, - 0xd59df5b9ef6a2417, - 0x4b0573286b44ad1d, - 0x4ee367f9430aec32, - 0x229c41f793cda73f, - 0x6b43527578c1110f, - 0x830a13896b78aaa9, - 0x23cc986bc656d553, - 0x2cbfbe86b7ec8aa8, - 0x7bf7d71432f3d6a9, - 0xdaf5ccd93fb0cc53, - 0xd1b3400f8f9cff68, - 0x23100809b9c21fa1, - 0xabd40a0c2832a78a, - 0x16c90c8f323f516c, - 0xae3da7d97f6792e3, - 0x99cd11cfdf41779c, - 0x40405643d711d583, - 0x482835ea666b2572, - 0xda3243650005eecf, - 0x90bed43e40076a82, - 0x5a7744a6e804a291, - 0x711515d0a205cb36, - 0xd5a5b44ca873e03, - 0xe858790afe9486c2, - 0x626e974dbe39a872, - 0xfb0a3d212dc8128f, - 0x7ce66634bc9d0b99, - 0x1c1fffc1ebc44e80, - 0xa327ffb266b56220, - 0x4bf1ff9f0062baa8, - 0x6f773fc3603db4a9, - 0xcb550fb4384d21d3, - 0x7e2a53a146606a48, - 0x2eda7444cbfc426d, - 0xfa911155fefb5308, - 0x793555ab7eba27ca, - 0x4bc1558b2f3458de, - 0x9eb1aaedfb016f16, - 0x465e15a979c1cadc, - 0xbfacd89ec191ec9, - 0xcef980ec671f667b, - 0x82b7e12780e7401a, - 0xd1b2ecb8b0908810, - 0x861fa7e6dcb4aa15, - 0x67a791e093e1d49a, - 0xe0c8bb2c5c6d24e0, - 0x58fae9f773886e18, - 0xaf39a475506a899e, - 0x6d8406c952429603, - 0xc8e5087ba6d33b83, - 0xfb1e4a9a90880a64, - 0x5cf2eea09a55067f, - 0xf42faa48c0ea481e, - 0xf13b94daf124da26, - 0x76c53d08d6b70858, - 0x54768c4b0c64ca6e, - 0xa9942f5dcf7dfd09, - 0xd3f93b35435d7c4c, - 0xc47bc5014a1a6daf, - 0x359ab6419ca1091b, - 0xc30163d203c94b62, - 0x79e0de63425dcf1d, - 0x985915fc12f542e4, - 0x3e6f5b7b17b2939d, - 0xa705992ceecf9c42, - 0x50c6ff782a838353, - 0xa4f8bf5635246428, - 0x871b7795e136be99, - 0x28e2557b59846e3f, - 0x331aeada2fe589cf, - 0x3ff0d2c85def7621, - 0xfed077a756b53a9, - 0xd3e8495912c62894, - 0x64712dd7abbbd95c, - 0xbd8d794d96aacfb3, - 0xecf0d7a0fc5583a0, - 0xf41686c49db57244, - 0x311c2875c522ced5, - 0x7d633293366b828b, - 0xae5dff9c02033197, - 0xd9f57f830283fdfc, - 0xd072df63c324fd7b, - 0x4247cb9e59f71e6d, - 0x52d9be85f074e608, - 0x67902e276c921f8b, - 0xba1cd8a3db53b6, - 0x80e8a40eccd228a4, - 0x6122cd128006b2cd, - 0x796b805720085f81, - 0xcbe3303674053bb0, - 0xbedbfc4411068a9c, - 0xee92fb5515482d44, - 0x751bdd152d4d1c4a, - 0xd262d45a78a0635d, - 0x86fb897116c87c34, - 0xd45d35e6ae3d4da0, - 0x8974836059cca109, - 0x2bd1a438703fc94b, - 0x7b6306a34627ddcf, - 0x1a3bc84c17b1d542, - 0x20caba5f1d9e4a93, - 0x547eb47b7282ee9c, - 0xe99e619a4f23aa43, - 0x6405fa00e2ec94d4, - 0xde83bc408dd3dd04, - 0x9624ab50b148d445, - 0x3badd624dd9b0957, - 0xe54ca5d70a80e5d6, - 0x5e9fcf4ccd211f4c, - 0x7647c3200069671f, - 0x29ecd9f40041e073, - 0xf468107100525890, - 0x7182148d4066eeb4, - 0xc6f14cd848405530, - 0xb8ada00e5a506a7c, - 0xa6d90811f0e4851c, - 0x908f4a166d1da663, - 0x9a598e4e043287fe, - 0x40eff1e1853f29fd, - 0xd12bee59e68ef47c, - 0x82bb74f8301958ce, - 0xe36a52363c1faf01, - 0xdc44e6c3cb279ac1, - 0x29ab103a5ef8c0b9, - 0x7415d448f6b6f0e7, - 
0x111b495b3464ad21, - 0xcab10dd900beec34, - 0x3d5d514f40eea742, - 0xcb4a5a3112a5112, - 0x47f0e785eaba72ab, - 0x59ed216765690f56, - 0x306869c13ec3532c, - 0x1e414218c73a13fb, - 0xe5d1929ef90898fa, - 0xdf45f746b74abf39, - 0x6b8bba8c328eb783, - 0x66ea92f3f326564, - 0xc80a537b0efefebd, - 0xbd06742ce95f5f36, - 0x2c48113823b73704, - 0xf75a15862ca504c5, - 0x9a984d73dbe722fb, - 0xc13e60d0d2e0ebba, - 0x318df905079926a8, - 0xfdf17746497f7052, - 0xfeb6ea8bedefa633, - 0xfe64a52ee96b8fc0, - 0x3dfdce7aa3c673b0, - 0x6bea10ca65c084e, - 0x486e494fcff30a62, - 0x5a89dba3c3efccfa, - 0xf89629465a75e01c, - 0xf6bbb397f1135823, - 0x746aa07ded582e2c, - 0xa8c2a44eb4571cdc, - 0x92f34d62616ce413, - 0x77b020baf9c81d17, - 0xace1474dc1d122e, - 0xd819992132456ba, - 0x10e1fff697ed6c69, - 0xca8d3ffa1ef463c1, - 0xbd308ff8a6b17cb2, - 0xac7cb3f6d05ddbde, - 0x6bcdf07a423aa96b, - 0x86c16c98d2c953c6, - 0xe871c7bf077ba8b7, - 0x11471cd764ad4972, - 0xd598e40d3dd89bcf, - 0x4aff1d108d4ec2c3, - 0xcedf722a585139ba, - 0xc2974eb4ee658828, - 0x733d226229feea32, - 0x806357d5a3f525f, - 0xca07c2dcb0cf26f7, - 0xfc89b393dd02f0b5, - 0xbbac2078d443ace2, - 0xd54b944b84aa4c0d, - 0xa9e795e65d4df11, - 0x4d4617b5ff4a16d5, - 0x504bced1bf8e4e45, - 0xe45ec2862f71e1d6, - 0x5d767327bb4e5a4c, - 0x3a6a07f8d510f86f, - 0x890489f70a55368b, - 0x2b45ac74ccea842e, - 0x3b0b8bc90012929d, - 0x9ce6ebb40173744, - 0xcc420a6a101d0515, - 0x9fa946824a12232d, - 0x47939822dc96abf9, - 0x59787e2b93bc56f7, - 0x57eb4edb3c55b65a, - 0xede622920b6b23f1, - 0xe95fab368e45eced, - 0x11dbcb0218ebb414, - 0xd652bdc29f26a119, - 0x4be76d3346f0495f, - 0x6f70a4400c562ddb, - 0xcb4ccd500f6bb952, - 0x7e2000a41346a7a7, - 0x8ed400668c0c28c8, - 0x728900802f0f32fa, - 0x4f2b40a03ad2ffb9, - 0xe2f610c84987bfa8, - 0xdd9ca7d2df4d7c9, - 0x91503d1c79720dbb, - 0x75a44c6397ce912a, - 0xc986afbe3ee11aba, - 0xfbe85badce996168, - 0xfae27299423fb9c3, - 0xdccd879fc967d41a, - 0x5400e987bbc1c920, - 0x290123e9aab23b68, - 0xf9a0b6720aaf6521, - 0xf808e40e8d5b3e69, - 0xb60b1d1230b20e04, - 0xb1c6f22b5e6f48c2, - 0x1e38aeb6360b1af3, - 0x25c6da63c38de1b0, - 0x579c487e5a38ad0e, - 0x2d835a9df0c6d851, - 0xf8e431456cf88e65, - 0x1b8e9ecb641b58ff, - 0xe272467e3d222f3f, - 0x5b0ed81dcc6abb0f, - 0x98e947129fc2b4e9, - 0x3f2398d747b36224, - 0x8eec7f0d19a03aad, - 0x1953cf68300424ac, - 0x5fa8c3423c052dd7, - 0x3792f412cb06794d, - 0xe2bbd88bbee40bd0, - 0x5b6aceaeae9d0ec4, - 0xf245825a5a445275, - 0xeed6e2f0f0d56712, - 0x55464dd69685606b, - 0xaa97e14c3c26b886, - 0xd53dd99f4b3066a8, - 0xe546a8038efe4029, - 0xde98520472bdd033, - 0x963e66858f6d4440, - 0xdde7001379a44aa8, - 0x5560c018580d5d52, - 0xaab8f01e6e10b4a6, - 0xcab3961304ca70e8, - 0x3d607b97c5fd0d22, - 0x8cb89a7db77c506a, - 0x77f3608e92adb242, - 0x55f038b237591ed3, - 0x6b6c46dec52f6688, - 0x2323ac4b3b3da015, - 0xabec975e0a0d081a, - 0x96e7bd358c904a21, - 0x7e50d64177da2e54, - 0xdde50bd1d5d0b9e9, - 0x955e4ec64b44e864, - 0xbd5af13bef0b113e, - 0xecb1ad8aeacdd58e, - 0x67de18eda5814af2, - 0x80eacf948770ced7, - 0xa1258379a94d028d, - 0x96ee45813a04330, - 0x8bca9d6e188853fc, - 0x775ea264cf55347d, - 0x95364afe032a819d, - 0x3a83ddbd83f52204, - 0xc4926a9672793542, - 0x75b7053c0f178293, - 0x5324c68b12dd6338, - 0xd3f6fc16ebca5e03, - 0x88f4bb1ca6bcf584, - 0x2b31e9e3d06c32e5, - 0x3aff322e62439fcf, - 0x9befeb9fad487c2, - 0x4c2ebe687989a9b3, - 0xf9d37014bf60a10, - 0x538484c19ef38c94, - 0x2865a5f206b06fb9, - 0xf93f87b7442e45d3, - 0xf78f69a51539d748, - 0xb573440e5a884d1b, - 0x31680a88f8953030, - 0xfdc20d2b36ba7c3d, - 0x3d32907604691b4c, - 0xa63f9a49c2c1b10f, - 0xfcf80dc33721d53, - 0xd3c36113404ea4a8, - 
0x645a1cac083126e9, - 0x3d70a3d70a3d70a3, - 0xcccccccccccccccc, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 0x4000000000000000, - 0x5000000000000000, - 0xa400000000000000, - 0x4d00000000000000, - 0xf020000000000000, - 0x6c28000000000000, - 0xc732000000000000, - 0x3c7f400000000000, - 0x4b9f100000000000, - 0x1e86d40000000000, - 0x1314448000000000, - 0x17d955a000000000, - 0x5dcfab0800000000, - 0x5aa1cae500000000, - 0xf14a3d9e40000000, - 0x6d9ccd05d0000000, - 0xe4820023a2000000, - 0xdda2802c8a800000, - 0xd50b2037ad200000, - 0x4526f422cc340000, - 0x9670b12b7f410000, - 0x3c0cdd765f114000, - 0xa5880a69fb6ac800, - 0x8eea0d047a457a00, - 0x72a4904598d6d880, - 0x47a6da2b7f864750, - 0x999090b65f67d924, - 0xfff4b4e3f741cf6d, - 0xbff8f10e7a8921a4, - 0xaff72d52192b6a0d, - 0x9bf4f8a69f764490, - 0x2f236d04753d5b4, - 0x1d762422c946590, - 0x424d3ad2b7b97ef5, - 0xd2e0898765a7deb2, - 0x63cc55f49f88eb2f, - 0x3cbf6b71c76b25fb, - 0x8bef464e3945ef7a, - 0x97758bf0e3cbb5ac, - 0x3d52eeed1cbea317, - 0x4ca7aaa863ee4bdd, - 0x8fe8caa93e74ef6a, - 0xb3e2fd538e122b44, - 0x60dbbca87196b616, - 0xbc8955e946fe31cd, - 0x6babab6398bdbe41, - 0xc696963c7eed2dd1, - 0xfc1e1de5cf543ca2, - 0x3b25a55f43294bcb, - 0x49ef0eb713f39ebe, - 0x6e3569326c784337, - 0x49c2c37f07965404, - 0xdc33745ec97be906, - 0x69a028bb3ded71a3, - 0xc40832ea0d68ce0c, - 0xf50a3fa490c30190, - 0x792667c6da79e0fa, - 0x577001b891185938, - 0xed4c0226b55e6f86, - 0x544f8158315b05b4, - 0x696361ae3db1c721, - 0x3bc3a19cd1e38e9, - 0x4ab48a04065c723, - 0x62eb0d64283f9c76, - 0x3ba5d0bd324f8394, - 0xca8f44ec7ee36479, - 0x7e998b13cf4e1ecb, - 0x9e3fedd8c321a67e, - 0xc5cfe94ef3ea101e, - 0xbba1f1d158724a12, - 0x2a8a6e45ae8edc97, - 0xf52d09d71a3293bd, - 0x593c2626705f9c56, - 0x6f8b2fb00c77836c, - 0xb6dfb9c0f956447, - 0x4724bd4189bd5eac, - 0x58edec91ec2cb657, - 0x2f2967b66737e3ed, - 0xbd79e0d20082ee74, - 0xecd8590680a3aa11, - 0xe80e6f4820cc9495, - 0x3109058d147fdcdd, - 0xbd4b46f0599fd415, - 0x6c9e18ac7007c91a, - 0x3e2cf6bc604ddb0, - 0x84db8346b786151c, - 0xe612641865679a63, - 0x4fcb7e8f3f60c07e, - 0xe3be5e330f38f09d, - 0x5cadf5bfd3072cc5, - 0x73d9732fc7c8f7f6, - 0x2867e7fddcdd9afa, - 0xb281e1fd541501b8, - 0x1f225a7ca91a4226, - 0x3375788de9b06958, - 0x52d6b1641c83ae, - 0xc0678c5dbd23a49a, - 0xf840b7ba963646e0, - 0xb650e5a93bc3d898, - 0xa3e51f138ab4cebe, - 0xc66f336c36b10137, - 0xb80b0047445d4184, - 0xa60dc059157491e5, - 0x87c89837ad68db2f, - 0x29babe4598c311fb, - 0xf4296dd6fef3d67a, - 0x1899e4a65f58660c, - 0x5ec05dcff72e7f8f, - 0x76707543f4fa1f73, - 0x6a06494a791c53a8, - 0x487db9d17636892, - 0x45a9d2845d3c42b6, - 0xb8a2392ba45a9b2, - 0x8e6cac7768d7141e, - 0x3207d795430cd926, - 0x7f44e6bd49e807b8, - 0x5f16206c9c6209a6, - 0x36dba887c37a8c0f, - 0xc2494954da2c9789, - 0xf2db9baa10b7bd6c, - 0x6f92829494e5acc7, - 0xcb772339ba1f17f9, - 0xff2a760414536efb, - 0xfef5138519684aba, - 0x7eb258665fc25d69, - 0xef2f773ffbd97a61, - 0xaafb550ffacfd8fa, - 0x95ba2a53f983cf38, - 0xdd945a747bf26183, - 0x94f971119aeef9e4, - 0x7a37cd5601aab85d, - 0xac62e055c10ab33a, - 0x577b986b314d6009, - 0xed5a7e85fda0b80b, - 0x14588f13be847307, - 0x596eb2d8ae258fc8, - 0x6fca5f8ed9aef3bb, - 0x25de7bb9480d5854, - 0xaf561aa79a10ae6a, - 0x1b2ba1518094da04, - 0x90fb44d2f05d0842, - 0x353a1607ac744a53, - 0x42889b8997915ce8, - 0x69956135febada11, - 0x43fab9837e699095, - 0x94f967e45e03f4bb, - 0x1d1be0eebac278f5, - 0x6462d92a69731732, - 0x7d7b8f7503cfdcfe, - 0x5cda735244c3d43e, - 
0x3a0888136afa64a7, - 0x88aaa1845b8fdd0, - 0x8aad549e57273d45, - 0x36ac54e2f678864b, - 0x84576a1bb416a7dd, - 0x656d44a2a11c51d5, - 0x9f644ae5a4b1b325, - 0x873d5d9f0dde1fee, - 0xa90cb506d155a7ea, - 0x9a7f12442d588f2, - 0xc11ed6d538aeb2f, - 0x8f1668c8a86da5fa, - 0xf96e017d694487bc, - 0x37c981dcc395a9ac, - 0x85bbe253f47b1417, - 0x93956d7478ccec8e, - 0x387ac8d1970027b2, - 0x6997b05fcc0319e, - 0x441fece3bdf81f03, - 0xd527e81cad7626c3, - 0x8a71e223d8d3b074, - 0xf6872d5667844e49, - 0xb428f8ac016561db, - 0xe13336d701beba52, - 0xecc0024661173473, - 0x27f002d7f95d0190, - 0x31ec038df7b441f4, - 0x7e67047175a15271, - 0xf0062c6e984d386, - 0x52c07b78a3e60868, - 0xa7709a56ccdf8a82, - 0x88a66076400bb691, - 0x6acff893d00ea435, - 0x583f6b8c4124d43, - 0xc3727a337a8b704a, - 0x744f18c0592e4c5c, - 0x1162def06f79df73, - 0x8addcb5645ac2ba8, - 0x6d953e2bd7173692, - 0xc8fa8db6ccdd0437, - 0x1d9c9892400a22a2, - 0x2503beb6d00cab4b, - 0x2e44ae64840fd61d, - 0x5ceaecfed289e5d2, - 0x7425a83e872c5f47, - 0xd12f124e28f77719, - 0x82bd6b70d99aaa6f, - 0x636cc64d1001550b, - 0x3c47f7e05401aa4e, - 0x65acfaec34810a71, - 0x7f1839a741a14d0d, - 0x1ede48111209a050, - 0x934aed0aab460432, - 0xf81da84d5617853f, - 0x36251260ab9d668e, - 0xc1d72b7c6b426019, - 0xb24cf65b8612f81f, - 0xdee033f26797b627, - 0x169840ef017da3b1, - 0x8e1f289560ee864e, - 0xf1a6f2bab92a27e2, - 0xae10af696774b1db, - 0xacca6da1e0a8ef29, - 0x17fd090a58d32af3, - 0xddfc4b4cef07f5b0, - 0x4abdaf101564f98e, - 0x9d6d1ad41abe37f1, - 0x84c86189216dc5ed, - 0x32fd3cf5b4e49bb4, - 0x3fbc8c33221dc2a1, - 0xfabaf3feaa5334a, - 0x29cb4d87f2a7400e, - 0x743e20e9ef511012, - 0x914da9246b255416, - 0x1ad089b6c2f7548e, - 0xa184ac2473b529b1, - 0xc9e5d72d90a2741e, - 0x7e2fa67c7a658892, - 0xddbb901b98feeab7, - 0x552a74227f3ea565, - 0xd53a88958f87275f, - 0x8a892abaf368f137, - 0x2d2b7569b0432d85, - 0x9c3b29620e29fc73, - 0x8349f3ba91b47b8f, - 0x241c70a936219a73, - 0xed238cd383aa0110, - 0xf4363804324a40aa, - 0xb143c6053edcd0d5, - 0xdd94b7868e94050a, - 0xca7cf2b4191c8326, - 0xfd1c2f611f63a3f0, - 0xbc633b39673c8cec, - 0xd5be0503e085d813, - 0x4b2d8644d8a74e18, - 0xddf8e7d60ed1219e, - 0xcabb90e5c942b503, - 0x3d6a751f3b936243, - 0xcc512670a783ad4, - 0x27fb2b80668b24c5, - 0xb1f9f660802dedf6, - 0x5e7873f8a0396973, - 0xdb0b487b6423e1e8, - 0x91ce1a9a3d2cda62, - 0x7641a140cc7810fb, - 0xa9e904c87fcb0a9d, - 0x546345fa9fbdcd44, - 0xa97c177947ad4095, - 0x49ed8eabcccc485d, - 0x5c68f256bfff5a74, - 0x73832eec6fff3111, - 0xc831fd53c5ff7eab, - 0xba3e7ca8b77f5e55, - 0x28ce1bd2e55f35eb, - 0x7980d163cf5b81b3, - 0xd7e105bcc332621f, - 0x8dd9472bf3fefaa7, - 0xb14f98f6f0feb951, - 0x6ed1bf9a569f33d3, - 0xa862f80ec4700c8, - 0xcd27bb612758c0fa, - 0x8038d51cb897789c, - 0xe0470a63e6bd56c3, - 0x1858ccfce06cac74, - 0xf37801e0c43ebc8, - 0xd30560258f54e6ba, - 0x47c6b82ef32a2069, - 0x4cdc331d57fa5441, - 0xe0133fe4adf8e952, - 0x58180fddd97723a6, - 0x570f09eaa7ea7648}; + really_inline base8_numeric() : base8<T>() {} + really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {} + // Store to array + really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } -} // namespace simdjson + // Addition/subtraction are the same for signed and unsigned + really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); } + really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); } + really_inline simd8<T> &operator+=(const simd8<T> other) + { + *this = *this + other; + return *(simd8<T> *)this; + } + 
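// full_multiplication (defined earlier in this removed block) is the primitive that
// consumes these mantissa tables: the parsed significand is widened against the 64-bit
// truncated power-of-ten mantissa, and the mantissa_128 word above supplies extra
// low-order bits when that first product is not decisive. A small self-check of the
// primitive itself (a sketch, not code from either version of the file):
static inline bool full_multiplication_example() {
  simdjson::value128 p =
      simdjson::full_multiplication(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL);
  // (2^64 - 1)^2 == 2^128 - 2^65 + 1: high word is 2^64 - 2, low word is 1.
  return p.high == 0xFFFFFFFFFFFFFFFEULL && p.low == 1;
}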
really_inline simd8<T> &operator-=(const simd8<T> other) + { + *this = *this - other; + return *(simd8<T> *)this; + } -#endif -/* end file src/jsoncharutils.h */ -/* begin file src/document_parser_callbacks.h */ -#ifndef SIMDJSON_DOCUMENT_PARSER_CALLBACKS_H -#define SIMDJSON_DOCUMENT_PARSER_CALLBACKS_H + // Override to distinguish from bool version + really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template <typename L> + really_inline simd8<L> lookup_16(simd8<L> lookup_table) const + { + return _mm256_shuffle_epi8(lookup_table, *this); + } -namespace simdjson::dom { + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8<L> compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template <typename L> + really_inline void compress(uint32_t mask, L *output) const + { + // this particular implementation was inspired by work done by @animetosho + // we do it in four steps, first 8 bytes and then second 8 bytes... + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits + uint8_t mask3 = uint8_t(mask >> 16); // ... + uint8_t mask4 = uint8_t(mask >> 24); // ... + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3], + thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask and so forth + shufmask = + _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818, + 0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m256i pruned = _mm256_shuffle_epi8(*this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + int pop3 = BitsSetTable256mul2[mask3]; -// -// Parser callbacks -// + // then load the corresponding mask + // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic. + __m256i v256 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8))); + __m256i compactmask = _mm256_insertf128_si256(v256, + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1); + __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask); + // We just need to write out the result. + // This is the tricky bit that is hard to do + // if we want to return a SIMD register, since there + // is no single-instruction approach to recombine + // the two 128-bit lanes with an offset. 
+ __m128i v128; + v128 = _mm256_castsi256_si128(almostthere); + _mm_storeu_si128((__m128i *)output, v128); + v128 = _mm256_extractf128_si256(almostthere, 1); + _mm_storeu_si128((__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128); + } -inline void parser::init_stage2() noexcept { - current_string_buf_loc = doc.string_buf.get(); - current_loc = 0; - valid = false; - error = UNINITIALIZED; -} + template <typename L> + really_inline simd8<L> lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const + { + return lookup_16(simd8<L>::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15)); + } + }; -really_inline error_code parser::on_error(error_code new_error_code) noexcept { - error = new_error_code; - return new_error_code; -} -really_inline error_code parser::on_success(error_code success_code) noexcept { - error = success_code; - valid = true; - return success_code; -} -really_inline bool parser::on_start_document(uint32_t depth) noexcept { - containing_scope_offset[depth] = current_loc; - write_tape(0, internal::tape_type::ROOT); - return true; -} -really_inline bool parser::on_start_object(uint32_t depth) noexcept { - containing_scope_offset[depth] = current_loc; - write_tape(0, internal::tape_type::START_OBJECT); - return true; -} -really_inline bool parser::on_start_array(uint32_t depth) noexcept { - containing_scope_offset[depth] = current_loc; - write_tape(0, internal::tape_type::START_ARRAY); - return true; -} -// TODO we're not checking this bool -really_inline bool parser::on_end_document(uint32_t depth) noexcept { - // write our doc.tape location to the header scope - // The root scope gets written *at* the previous location. 
- annotate_previous_loc(containing_scope_offset[depth], current_loc); - write_tape(containing_scope_offset[depth], internal::tape_type::ROOT); - return true; -} -really_inline bool parser::on_end_object(uint32_t depth) noexcept { - // write our doc.tape location to the header scope - write_tape(containing_scope_offset[depth], internal::tape_type::END_OBJECT); - annotate_previous_loc(containing_scope_offset[depth], current_loc); - return true; -} -really_inline bool parser::on_end_array(uint32_t depth) noexcept { - // write our doc.tape location to the header scope - write_tape(containing_scope_offset[depth], internal::tape_type::END_ARRAY); - annotate_previous_loc(containing_scope_offset[depth], current_loc); - return true; -} + // Signed bytes + template <> + struct simd8<int8_t> : base8_numeric<int8_t> + { + really_inline simd8() : base8_numeric<int8_t>() {} + really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31) : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<int8_t> repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) + { + return simd8<int8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } -really_inline bool parser::on_true_atom() noexcept { - write_tape(0, internal::tape_type::TRUE_VALUE); - return true; -} -really_inline bool parser::on_false_atom() noexcept { - write_tape(0, internal::tape_type::FALSE_VALUE); - return true; -} -really_inline bool parser::on_null_atom() noexcept { - write_tape(0, internal::tape_type::NULL_VALUE); - return true; -} + // Order-sensitive comparisons + really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); } + really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); } + really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); } + really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); } + }; -really_inline uint8_t *parser::on_start_string() noexcept { - /* we advance the point, accounting for the fact that we have a NULL - * termination */ - write_tape(current_string_buf_loc - doc.string_buf.get(), internal::tape_type::STRING); - return current_string_buf_loc + sizeof(uint32_t); -} + // Unsigned bytes + template <> + struct simd8<uint8_t> : base8_numeric<uint8_t> + { + really_inline simd8() : base8_numeric<uint8_t>() {} + 
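// In the removed 0.2.0 callbacks, on_start_string (just above) records the current
// string_buf offset on the tape and reserves four bytes that on_end_string (further
// below) fills with the string length, followed by the bytes and a NUL terminator.
// A hypothetical reader for that layout (an illustrative sketch, not simdjson API;
// read_string_example and its parameters are assumptions; needs <cstring>, <string_view>):
static inline std::string_view read_string_example(const uint8_t *string_buf,
                                                    uint64_t tape_offset) {
  uint32_t len;
  std::memcpy(&len, string_buf + tape_offset, sizeof(len));   // 4-byte length prefix
  const char *start =
      reinterpret_cast<const char *>(string_buf + tape_offset + sizeof(uint32_t));
  return std::string_view(start, len);                        // bytes follow the prefix
}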
really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31) : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<uint8_t> repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + { + return simd8<uint8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } -really_inline bool parser::on_end_string(uint8_t *dst) noexcept { - uint32_t str_length = dst - (current_string_buf_loc + sizeof(uint32_t)); - // TODO check for overflow in case someone has a crazy string (>=4GB?) - // But only add the overflow check when the document itself exceeds 4GB - // Currently unneeded because we refuse to parse docs larger or equal to 4GB. - memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); - // NULL termination is still handy if you expect all your strings to - // be NULL terminated? 
It comes at a small cost - *dst = 0; - current_string_buf_loc = dst + 1; - return true; -} + // Saturated math + really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); } + really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); } -really_inline bool parser::on_number_s64(int64_t value) noexcept { - write_tape(0, internal::tape_type::INT64); - std::memcpy(&doc.tape[current_loc], &value, sizeof(value)); - ++current_loc; - return true; -} -really_inline bool parser::on_number_u64(uint64_t value) noexcept { - write_tape(0, internal::tape_type::UINT64); - doc.tape[current_loc++] = value; - return true; -} -really_inline bool parser::on_number_double(double value) noexcept { - write_tape(0, internal::tape_type::DOUBLE); - static_assert(sizeof(value) == sizeof(doc.tape[current_loc]), "mismatch size"); - memcpy(&doc.tape[current_loc++], &value, sizeof(double)); - // doc.tape[doc.current_loc++] = *((uint64_t *)&d); - return true; -} + // Order-specific operations + really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); } + really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } + really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } + really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } + really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } + really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); } -really_inline void parser::write_tape(uint64_t val, internal::tape_type t) noexcept { - doc.tape[current_loc++] = val | ((static_cast<uint64_t>(static_cast<char>(t))) << 56); -} + // Bit-specific operations + really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } + really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } + really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } + really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } + really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); } + really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } + template <int N> + really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template <int N> + really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template <int N> + really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7 - N)); } + }; -really_inline void parser::annotate_previous_loc(uint32_t saved_loc, uint64_t val) noexcept { - doc.tape[saved_loc] |= val; -} + template <typename T> + struct simd8x64 + { + static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); + const simd8<T> chunks[NUM_CHUNKS]; -} // namespace simdjson::dom + really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {} + really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 32)} {} -#endif // SIMDJSON_DOCUMENT_PARSER_CALLBACKS_H -/* end file src/document_parser_callbacks.h */ + template <typename F> + static really_inline void each_index(F const &each) + { + each(0); + each(1); + } -using namespace simdjson; + really_inline void compress(uint64_t mask, T *output) const + { + uint32_t mask1 = uint32_t(mask); + uint32_t mask2 = uint32_t(mask >> 32); + this->chunks[0].compress(mask1, output); + this->chunks[1].compress(mask2, output + 32 - count_ones(mask1)); + } -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif + really_inline void store(T ptr[64]) const + { + this->chunks[0].store(ptr + sizeof(simd8<T>) * 0); + this->chunks[1].store(ptr + sizeof(simd8<T>) * 1); + } -#if SIMDJSON_IMPLEMENTATION_ARM64 -/* begin file src/arm64/stage2_build_tape.h */ -#ifndef SIMDJSON_ARM64_STAGE2_BUILD_TAPE_H -#define SIMDJSON_ARM64_STAGE2_BUILD_TAPE_H + really_inline uint64_t to_bitmask() const + { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ -/* begin file src/arm64/stringparsing.h */ -#ifndef SIMDJSON_ARM64_STRINGPARSING_H -#define SIMDJSON_ARM64_STRINGPARSING_H + really_inline simd8x64<T> bit_or(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<T>( + this->chunks[0] | mask, + this->chunks[1] | mask); + } -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -/* arm64/simd.h already included: #include "arm64/simd.h" */ -/* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ -/* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ + really_inline uint64_t eq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] == mask, + this->chunks[1] == mask) + .to_bitmask(); + } -namespace simdjson::arm64 { + really_inline uint64_t lteq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] <= mask, + this->chunks[1] <= mask) + .to_bitmask(); + } + }; // struct simd8x64<T> -using namespace simd; + } // namespace simd -// Holds backslashes and quotes locations. 
-struct backslash_and_quote { -public: - static constexpr uint32_t BYTES_PROCESSED = 32; - really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + } // namespace haswell +} // namespace simdjson +UNTARGET_REGION - really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } - really_inline bool has_backslash() { return bs_bits != 0; } - really_inline int quote_index() { return trailing_zeroes(quote_bits); } - really_inline int backslash_index() { return trailing_zeroes(bs_bits); } +#endif // SIMDJSON_HASWELL_SIMD_H +/* end file src/haswell/bitmanipulation.h */ +/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ - uint32_t bs_bits; - uint32_t quote_bits; -}; // struct backslash_and_quote +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { -really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); - simd8<uint8_t> v0(src); - simd8<uint8_t> v1(src + sizeof(v0)); - v0.store(dst); - v1.store(dst + sizeof(v0)); + using namespace simd; - // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we - // smash them together into a 64-byte mask and get the bitmask from there. - uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); - return { - static_cast<uint32_t>(bs_and_quote), // bs_bits - static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits - }; -} + struct json_character_block + { + static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in); -/* begin file src/generic/stringparsing.h */ -// This file contains the common code every implementation uses -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "stringparsing.h" (this simplifies amalgation) + really_inline uint64_t whitespace() const { return _whitespace; } + really_inline uint64_t op() const { return _op; } + really_inline uint64_t scalar() { return ~(op() | whitespace()); } -namespace stringparsing { + uint64_t _whitespace; + uint64_t _op; + }; -// begin copypasta -// These chars yield themselves: " \ / -// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab -// u not handled in this table as it's complex -static const uint8_t escape_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) + { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. - 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. - 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. 
+ // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + uint64_t whitespace = simd8x64<bool>( + in.chunks[0] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[0])), + in.chunks[1] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[1]))) + .to_bitmask(); - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; + uint64_t op = simd8x64<bool>( + (in.chunks[0] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[0] - ',')), + (in.chunks[1] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[1] - ','))) + .to_bitmask(); + return {whitespace, op}; + } -// handle a unicode codepoint -// write appropriate values into dest -// src will advance 6 bytes or 12 bytes -// dest will advance a variable amount (return via pointer) -// return true if the unicode codepoint was valid -// We work in little-endian then swap at write time -WARN_UNUSED -really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, - uint8_t **dst_ptr) { - // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the - // conversion isn't valid; we defer the check for this to inside the - // multilingual plane check - uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); - *src_ptr += 6; - // check for low surrogate for characters outside the Basic - // Multilingual Plane. - if (code_point >= 0xd800 && code_point < 0xdc00) { - if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { - return false; + really_inline bool is_ascii(simd8x64<uint8_t> input) + { + simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]); + return !bits.any_bits_set_anywhere(0b10000000u); } - uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); - // if the first code point is invalid we will get here, as we will go past - // the check for being outside the Basic Multilingual plane. If we don't - // find a \u immediately afterwards we fail out anyhow, but if we do, - // this check catches both the case of the first code point being invalid - // or the second code point being invalid. - if ((code_point | code_point_2) >> 16) { - return false; + really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 + simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
+ return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); } - code_point = - (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; - *src_ptr += 6; - } - size_t offset = codepoint_to_utf8(code_point, *dst_ptr); - *dst_ptr += offset; - return offset > 0; -} + really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0); + } -WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { - src++; - while (1) { - // Copy the next n bytes, and find the backslash and quote in them. - auto bs_quote = backslash_and_quote::copy_and_find(src, dst); - // If the next thing is the end quote, copy and return - if (bs_quote.has_quote_first()) { - // we encountered quotes first. Move dst to point to quotes and exit - return dst + bs_quote.quote_index(); + /* begin file src/generic/stage1/buf_block_reader.h */ + // Walks through a buffer in block-sized increments, loading the last part with spaces + template <size_t STEP_SIZE> + struct buf_block_reader + { + public: + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); + + private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; + }; + + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text_64(const uint8_t *text) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; } - if (bs_quote.has_backslash()) { - /* find out where the backspace is */ - auto bs_dist = bs_quote.backslash_index(); - uint8_t escape_char = src[bs_dist + 1]; - /* we encountered backslash first. Handle backslash */ - if (escape_char == 'u') { - /* move src/dst up to the start; they will be further adjusted - within the unicode codepoint handling code. */ - src += bs_dist; - dst += bs_dist; - if (!handle_unicode_codepoint(&src, &dst)) { - return nullptr; + + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text(const simd8x64<uint8_t> in) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + in.store((uint8_t *)buf); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + if (buf[i] < ' ') + { + buf[i] = '_'; } - } else { - /* simple 1:1 conversion. 
Will eat bs_dist+2 characters in input and - * write bs_dist+1 characters to output - * note this may reach beyond the part of the buffer we've actually - * seen. I think this is ok */ - uint8_t escape_result = escape_map[escape_char]; - if (escape_result == 0u) { - return nullptr; /* bogus escape value is an error */ - } - dst[bs_dist] = escape_result; - src += bs_dist + 2; - dst += bs_dist + 1; } - } else { - /* they are the same. Since they can't co-occur, it means we - * encountered neither. */ - src += backslash_and_quote::BYTES_PROCESSED; - dst += backslash_and_quote::BYTES_PROCESSED; + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; } - } - /* can't be reached */ - return nullptr; -} -} // namespace stringparsing -/* end file src/generic/stringparsing.h */ + UNUSED static char *format_mask(uint64_t mask) + { + static char *buf = (char *)malloc(64 + 1); + for (size_t i = 0; i < 64; i++) + { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; + } -} -// namespace simdjson::amd64 + template <size_t STEP_SIZE> + really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} -#endif // SIMDJSON_ARM64_STRINGPARSING_H -/* end file src/generic/stringparsing.h */ -/* begin file src/arm64/numberparsing.h */ -#ifndef SIMDJSON_ARM64_NUMBERPARSING_H -#define SIMDJSON_ARM64_NUMBERPARSING_H + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; } -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -/* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */ -/* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ -#include <cmath> -#include <limits> + template <size_t STEP_SIZE> + really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const + { + return idx < lenminusstep; + } + template <size_t STEP_SIZE> + really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const + { + return &buf[idx]; + } -#ifdef JSON_TEST_NUMBERS // for unit testing -void found_invalid_number(const uint8_t *buf); -void found_integer(int64_t result, const uint8_t *buf); -void found_unsigned_integer(uint64_t result, const uint8_t *buf); -void found_float(double result, const uint8_t *buf); -#endif + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const + { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
+ memcpy(dst, buf + idx, len - idx); + return len - idx; + } -namespace simdjson::arm64 { + template <size_t STEP_SIZE> + really_inline void buf_block_reader<STEP_SIZE>::advance() + { + idx += STEP_SIZE; + } + /* end file src/generic/stage1/buf_block_reader.h */ + /* begin file src/generic/stage1/json_string_scanner.h */ + namespace stage1 + { -// we don't have SSE, so let us use a scalar function -// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ -static inline uint32_t parse_eight_digits_unrolled(const char *chars) { - uint64_t val; - memcpy(&val, chars, sizeof(uint64_t)); - val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; - val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; - return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; -} + struct json_string_block + { + // Escaped characters (characters following an escape() character) + really_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \) + really_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + really_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + really_inline uint64_t string_end() const { return _quote & _in_string; } + // End quotes of strings + really_inline uint64_t string_start() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + really_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + really_inline uint64_t string_tail() const { return _in_string ^ _quote; } -#define SWAR_NUMBER_PARSING + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; + }; -/* begin file src/generic/numberparsing.h */ -namespace numberparsing { + // Scans blocks for string characters, storing the state necessary to do so + class json_string_scanner + { + public: + really_inline json_string_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); + private: + // Intended to be defined by the implementation + really_inline uint64_t find_escaped(uint64_t escape); + really_inline uint64_t find_escaped_branchless(uint64_t escape); -// Attempts to compute i * 10^(power) exactly; and if "negative" is -// true, negate the result. -// This function will only work in some cases, when it does not work, success is -// set to false. This should work *most of the time* (like 99% of the time). -// We assume that power is in the [FASTFLOAT_SMALLEST_POWER, -// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. -really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, - bool *success) { - // we start with a fast path - // It was described in - // Clinger WD. How to read floating point numbers accurately. 
- // ACM SIGPLAN Notices. 1990 - if (-22 <= power && power <= 22 && i <= 9007199254740991) { - // convert the integer into a double. This is lossless since - // 0 <= i <= 2^53 - 1. - double d = i; - // - // The general idea is as follows. - // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then - // 1) Both s and p can be represented exactly as 64-bit floating-point - // values - // (binary64). - // 2) Because s and p can be represented exactly as floating-point values, - // then s * p - // and s / p will produce correctly rounded values. - // - if (power < 0) { - d = d / power_of_ten[-power]; - } else { - d = d * power_of_ten[power]; - } - if (negative) { - d = -d; - } - *success = true; - return d; - } - // When 22 < power && power < 22 + 16, we could - // hope for another, secondary fast path. It wa - // described by David M. Gay in "Correctly rounded - // binary-decimal and decimal-binary conversions." (1990) - // If you need to compute i * 10^(22 + x) for x < 16, - // first compute i * 10^x, if you know that result is exact - // (e.g., when i * 10^x < 2^53), - // then you can still proceed and do (i * 10^x) * 10^22. - // Is this worth your time? - // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) - // for this second fast path to work. - // If you you have 22 < power *and* power < 22 + 16, and then you - // optimistically compute "i * 10^(x-22)", there is still a chance that you - // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of - // this optimization maybe less common than we would like. Source: - // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ - // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; + }; - // The fast path has now failed, so we are failing back on the slower path. + // + // Finds escaped characters (characters following \). + // + // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). + // + // Does this by: + // - Shift the escape mask to get potentially escaped characters (characters after backslashes). + // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) + // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) + // + // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all + // escape sequences, filters out the ones that start on even bits, and adds that to the mask of + // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since + // the start bit causes a carry), and leaves even-bit sequences alone. 
+ // + // Example: + // + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape + // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape + // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later + // invert_mask | | cxxx c xx c| even_seq << 1 + // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit + // escaped | x | x x x x x x x x | + // desired | x | x x x x x x x x | + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // + really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) + { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; - // In the slow path, we need to adjust i so that it is > 1<<63 which is always - // possible, except if i == 0, so we handle i == 0 separately. - if(i == 0) { - return 0.0; - } + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. - // We are going to need to do some 64-bit arithmetic to get a more precise product. - // We use a table lookup approach. - components c = - power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; - // safe because - // power >= FASTFLOAT_SMALLEST_POWER - // and power <= FASTFLOAT_LARGEST_POWER - // we recover the mantissa of the power, it has a leading 1. It is always - // rounded down. - uint64_t factor_mantissa = c.mantissa; + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; + } - // We want the most significant bit of i to be 1. Shift if needed. - int lz = leading_zeroes(i); - i <<= lz; - // We want the most significant 64 bits of the product. We know - // this will be non-zero because the most significant bit of i is - // 1. - value128 product = full_multiplication(i, factor_mantissa); - uint64_t lower = product.low; - uint64_t upper = product.high; + // + // Return a mask of all string characters plus end quotes. + // + // prev_escaped is overflow saying whether the next character is escaped. + // prev_in_string is overflow saying whether we're still in a string. + // + // Backslash sequences outside of quotes will be detected in stage 2. + // + really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) + { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; - // We know that upper has at most one leading zero because - // both i and factor_mantissa have a leading one. This means - // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). 
+ // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; - // As long as the first 9 bits of "upper" are not "1", then we - // know that we have an exact computed value for the leading - // 55 bits because any imprecision would play out as a +1, in - // the worst case. - if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) { - uint64_t factor_mantissa_low = - mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; - // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit - // result (three 64-bit values) - product = full_multiplication(i, factor_mantissa_low); - uint64_t product_low = product.low; - uint64_t product_middle2 = product.high; - uint64_t product_middle1 = lower; - uint64_t product_high = upper; - uint64_t product_middle = product_middle1 + product_middle2; - if (product_middle < product_middle1) { - product_high++; // overflow carry - } - // We want to check whether mantissa *i + i would affect our result. - // This does happen, e.g. with 7.3177701707893310e+15. - if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && - (product_low + i < product_low))) { // let us be prudent and bail out. - *success = false; - return 0; - } - upper = product_high; - lower = product_middle; - } - // The final mantissa should be 53 bits with a leading 1. - // We shift it so that it occupies 54 bits with a leading 1. - /////// - uint64_t upperbit = upper >> 63; - uint64_t mantissa = upper >> (upperbit + 9); - lz += 1 ^ upperbit; + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63); - // Here we have mantissa < (1<<54). + // Use ^ to turn the beginning quote off, and the end quote on. + return { + backslash, + escaped, + quote, + in_string}; + } - // We have to round to even. The "to even" part - // is only a problem when we are right in between two floats - // which we guard against. - // If we have lots of trailing zeros, we may fall right between two - // floating-point values. - if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && - ((mantissa & 3) == 1))) { - // if mantissa & 1 == 1 we might need to round up. + really_inline error_code json_string_scanner::finish(bool streaming) + { + if (prev_in_string and (not streaming)) + { + return UNCLOSED_STRING; + } + return SUCCESS; + } + + } // namespace stage1 + /* end file src/generic/stage1/json_string_scanner.h */ + /* begin file src/generic/stage1/json_scanner.h */ + namespace stage1 + { + + /** + * A block of scanned json, with information on operators and scalars. + */ + struct json_block + { + public: + /** The start of structurals */ + really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. 
not in a string) */ + really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, operators, scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_scalar; + + private: + // Potential structurals (i.e. disregarding strings) + + /** operators plus scalar starts like 123, true and "abc" */ + really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } + /** the start of non-operator runs, like 123, true and "abc" */ + really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } + /** whether the given character is immediately after a non-operator like 123, true or " */ + really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } + }; + + /** + * Scans JSON for important bits: operators, strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ + class json_scanner + { + public: + json_scanner() {} + really_inline json_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); + + private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or an operator). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; + }; + // - // Scenarios: - // 1. We are not in the middle. Then we should round up. + // Check if the current character immediately follows a matching character. // - // 2. We are right in the middle. Whether we round up depends - // on the last significant bit: if it is "one" then we round - // up (round to even) otherwise, we do not. + // For example, this checks for quotes with backslashes in front of them: // - // So if the last significant bit is 1, we can safely round up. - // Hence we only need to bail out if (mantissa & 3) == 1. - // Otherwise we may need more accuracy or analysis to determine whether - // we are exactly between two floating-point numbers. - // It can be triggered with 1e23. - // Note: because the factor_mantissa and factor_mantissa_low are - // almost always rounded down (except for small positive powers), - // almost always should round up. 
- *success = false; - return 0; - } + // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); + // + really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) + { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; + } - mantissa += mantissa & 1; - mantissa >>= 1; + // + // Check if the current character follows a matching character, with possible "filler" between. + // For example, this checks for empty curly braces, e.g. + // + // in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } + // + really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) + { + uint64_t follows_match = follows(match, overflow); + uint64_t result; + overflow |= uint64_t(add_overflow(follows_match, filler, &result)); + return result; + } - // Here we have mantissa < (1<<53), unless there was an overflow - if (mantissa >= (1ULL << 53)) { - ////////// - // This will happen when parsing values such as 7.2057594037927933e+16 - //////// - mantissa = (1ULL << 52); - lz--; // undo previous addition - } - mantissa &= ~(1ULL << 52); - uint64_t real_exponent = c.exp - lz; - // we have to check that real_exponent is in range, otherwise we bail out - if (unlikely((real_exponent < 1) || (real_exponent > 2046))) { - *success = false; - return 0; - } - mantissa |= real_exponent << 52; - mantissa |= (((uint64_t)negative) << 63); - double d; - memcpy(&d, &mantissa, sizeof(d)); - *success = true; - return d; -} + really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) + { + json_string_block strings = string_scanner.next(in); + json_character_block characters = json_character_block::classify(in); + uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); + return { + strings, + characters, + follows_scalar}; + } -static bool parse_float_strtod(const char *ptr, double *outDouble) { - char *endptr; - *outDouble = strtod(ptr, &endptr); - // Some libraries will set errno = ERANGE when the value is subnormal, - // yet we may want to be able to parse subnormal values. - // However, we do not want to tolerate NAN or infinite values. - // - // Values like infinity or NaN are not allowed in the JSON specification. - // If you consume a large value and you map it to "infinity", you will no - // longer be able to serialize back a standard-compliant JSON. And there is - // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × - // 10^308 It is an unimaginable large number. There will never be any piece of - // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number - // of electrons is similar. Using a double-precision floating-point value, we - // can represent easily the number of atoms in the universe. We could also - // represent the number of ways you can pick any three individual atoms at - // random in the universe. If you ever encounter a number much larger than - // 10^308, you know that you have a bug. RapidJSON will reject a document with - // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) - // will flat out throw an exception. 
- // - if ((endptr == ptr) || (!std::isfinite(*outDouble))) { - return false; - } - return true; -} + really_inline error_code json_scanner::finish(bool streaming) + { + return string_scanner.finish(streaming); + } -really_inline bool is_integer(char c) { - return (c >= '0' && c <= '9'); - // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers -} + } // namespace stage1 + /* end file src/generic/stage1/json_scanner.h */ -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + namespace stage1 + { + really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) + { + if (!backslash) + { + uint64_t escaped = prev_escaped; + prev_escaped = 0; + return escaped; + } + return find_escaped_branchless(backslash); + } + } // namespace stage1 -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} + /* begin file src/generic/stage1/json_minifier.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -// check quickly whether the next 8 chars are made of digits -// at a glance, it looks better than Mula's -// http://0x80.pl/articles/swar-digits-validate.html -really_inline bool is_made_of_eight_digits_fast(const char *chars) { - uint64_t val; - // this can read up to 7 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(7 <= SIMDJSON_PADDING); - memcpy(&val, chars, 8); - // a branchy method might be faster: - // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) - // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == - // 0x3030303030303030); - return (((val & 0xF0F0F0F0F0F0F0F0) | - (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == - 0x3333333333333333); -} + namespace stage1 + { -// called by parse_number when we know that the output is an integer, -// but where there might be some integer overflow. -// we want to catch overflows! -// Do not call this function directly as it skips some of the checks from -// parse_number -// -// This function will almost never be called!!! 
-// -never_inline bool parse_large_integer(const uint8_t *const src, - parser &parser, - bool found_minus) { - const char *p = reinterpret_cast<const char *>(src); + class json_minifier + { + public: + template <size_t STEP_SIZE> + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; - bool negative = false; - if (found_minus) { - ++p; - negative = true; - } - uint64_t i; - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - i = 0; - } else { - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - if (mul_overflow(i, 10, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow + private: + really_inline json_minifier(uint8_t *_dst) + : dst{_dst} + { + } + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block); + really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; + }; + + really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) + { + uint64_t mask = block.whitespace(); + in.compress(mask, dst); + dst += 64 - count_ones(mask); } - if (add_overflow(i, digit, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow + + really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) + { + *dst = '\0'; + error_code error = scanner.finish(false); + if (error) + { + dst_len = 0; + return error; + } + dst_len = dst - dst_start; + return SUCCESS; } - ++p; - } - } - if (negative) { - if (i > 0x8000000000000000) { - // overflows! -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } else if (i == 0x8000000000000000) { - // In two's complement, we cannot represent 0x8000000000000000 - // as a positive signed integer, but the negative version is - // possible. - constexpr int64_t signed_answer = INT64_MIN; - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } else { - // we can negate safely - int64_t signed_answer = -static_cast<int64_t>(i); - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } - } else { - // we have a positive integer, the contract is that - // we try to represent it as a signed integer and only - // fallback on unsigned integers if absolutely necessary. 
- if (i < 0x8000000000000000) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - parser.on_number_s64(i); - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_unsigned_integer(i, src); -#endif - parser.on_number_u64(i); - } - } - return is_structural_or_whitespace(*p); -} -bool slow_float_parsing(UNUSED const char * src, parser &parser) { - double d; - if (parse_float_strtod(src, &d)) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, (const uint8_t *)src); -#endif - return true; - } -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number((const uint8_t *)src); -#endif - return false; -} + template <> + really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + simd::simd8x64<uint8_t> in_2(block_buf + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); + } -// parse the number at src -// define JSON_TEST_NUMBERS for unit testing -// -// It is assumed that the number is followed by a structural ({,},],[) character -// or a white space character. If that is not the case (e.g., when the JSON -// document is made of a single number), then it is necessary to copy the -// content and append a space before calling this function. -// -// Our objective is accurate parsing (ULP of 0) at high speed. -really_inline bool parse_number(UNUSED const uint8_t *const src, - UNUSED bool found_minus, - parser &parser) { -#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes - // useful to skip parsing - parser.on_number_s64(0); // always write zero - return true; // always succeeds -#else - const char *p = reinterpret_cast<const char *>(src); - bool negative = false; - if (found_minus) { - ++p; - negative = true; - if (!is_integer(*p)) { // a negative sign must be followed by an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } - const char *const start_digits = p; + template <> + really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); + } - uint64_t i; // an unsigned int avoids signed overflows (which are bad) - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + template <size_t STEP_SIZE> + error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept + { + buf_block_reader<STEP_SIZE> reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) + { + minifier.step<STEP_SIZE>(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) + { + minifier.step<STEP_SIZE>(block, reader); + } + + return minifier.finish(dst, dst_len); + } + + } // namespace stage1 + /* end file src/generic/stage1/json_minifier.h */ + WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept + { + 
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); } - i = 0; - } else { - if (!(is_integer(*p))) { // must start with an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + digit; // might overflow, we will handle the overflow later - ++p; - } - } - int64_t exponent = 0; - bool is_float = false; - if ('.' == *p) { - is_float = true; // At this point we know that we have a float - // we continue with the fiction that we have an integer. If the - // floating point number is representable as x * 10^z for some integer - // z that fits in 53 bits, then we will be able to convert back the - // the integer into a float in a lossless manner. - ++p; - const char *const first_after_period = p; - if (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // might overflow + multiplication by 10 is likely - // cheaper than arbitrary mult. - // we will handle the overflow later - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } -#ifdef SWAR_NUMBER_PARSING - // this helps if we have lots of decimals! - // this turns out to be frequent enough. - if (is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); - p += 8; - } -#endif - while (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok - // because we have parse_highprecision_float later. - } - exponent = first_after_period - p; - } - int digit_count = - p - start_digits - 1; // used later to guard against overflows - int64_t exp_number = 0; // exponential part - if (('e' == *p) || ('E' == *p)) { - is_float = true; - ++p; - bool neg_exp = false; - if ('-' == *p) { - neg_exp = true; - ++p; - } else if ('+' == *p) { - ++p; - } - if (!is_integer(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - exp_number = digit; - p++; - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - while (is_integer(*p)) { - if (exp_number > 0x100000000) { // we need to check for overflows - // we refuse to parse this -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + + /* begin file src/generic/stage1/find_next_document_index.h */ + /** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. 
We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ + really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) + { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) + { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) + { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) + { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) + { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; } - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; + return 0; } - exponent += (neg_exp ? -exp_number : exp_number); - } - if (is_float) { - // If we frequently had to deal with long strings of digits, - // we could extend our code by using a 128-bit integer instead - // of a 64-bit integer. However, this is uncommon in practice. - if (unlikely((digit_count >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. - // We have to handle the case where we have 0.0000somenumber. - const char *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; + + // Skip the last character if it is partial + really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) + { + if (unlikely(len < 3)) + { + switch (len) + { + case 2: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } } - // we over-decrement by one when there is a '.' - digit_count -= (start - start_digits); - if (digit_count >= 19) { - // Ok, chances are good that we had an overflow! - // this is almost never going to get called!!! - // we start anew, going slowly!!! 
- // This will happen in the following examples: - // 10000000000000000000000000000000000000000000e+308 - // 3.1415926535897932384626433832795028841971693993751 - // - return slow_float_parsing((const char *) src, parser); - } + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 1 byte left + if (buf[len - 3] >= 0b11110000) + { + return len - 3; + } // 4-byte characters with only 3 bytes left + return len; } - if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || - (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, parser); - } - bool success = true; - double d = compute_float_64(exponent, i, negative, &success); - if (!success) { - // we are almost never going to get here. - success = parse_float_strtod((const char *)src, &d); - } - if (success) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, src); -#endif - return true; - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } else { - if (unlikely(digit_count >= 18)) { // this is uncommon!!! - // there is a good chance that we had an overflow, so we need - // need to recover: we parse the whole thing again. - return parse_large_integer(src, parser, found_minus); - } - i = negative ? 0 - i : i; - parser.on_number_s64(i); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - } - return is_structural_or_whitespace(*p); -#endif // SIMDJSON_SKIPNUMBERPARSING -} + /* end file src/generic/stage1/find_next_document_index.h */ + /* begin file src/generic/stage1/utf8_lookup3_algorithm.h */ + // + // Detect Unicode errors. + // + // UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic + // encoding that uses the first few bits on each byte to denote a "byte type", and all other bits + // are straight up concatenated into the final value. The first byte of a multibyte character is a + // "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte + // lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just + // start with 0, because that's what ASCII looks like. Here's what each size looks like: + // + // - ASCII (7 bits): 0_______ + // - 2 byte character (11 bits): 110_____ 10______ + // - 3 byte character (17 bits): 1110____ 10______ 10______ + // - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ + // - 5+ byte character (illegal): 11111___ <illegal> + // + // There are 5 classes of error that can happen in Unicode: + // + // - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). + // We detect this by looking for new characters (lead bytes) inside the range of a multibyte + // character. + // + // e.g. 11000000 01100001 (2-byte character where second byte is ASCII) + // + // - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). + // We detect this by requiring that the next byte after your multibyte character be a new + // character--so a continuation after your character is wrong. + // + // e.g. 
11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) + // + // - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. + // + // e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). + // + // - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have + // used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is + // technically possible, but UTF-8 disallows it so that there is only one way to write an "a". + // + // e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) + // + // - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and + // WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. + // + // e.g. 11101101 10100000 10000000 (U+D800) + // + // - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not + // support values with more than 23 bits (which a 4-byte character supports). + // + // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) + // + // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: + // + // Code Points 1st 2s 3s 4s + // U+0000..U+007F 00..7F + // U+0080..U+07FF C2..DF 80..BF + // U+0800..U+0FFF E0 A0..BF 80..BF + // U+1000..U+CFFF E1..EC 80..BF 80..BF + // U+D000..U+D7FF ED 80..9F 80..BF + // U+E000..U+FFFF EE..EF 80..BF 80..BF + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + // + using namespace simd; -} // namespace numberparsing -/* end file src/generic/numberparsing.h */ + namespace utf8_validation + { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". -} // namespace simdjson::arm64 + // + // Find special case UTF-8 errors where the character is technically readable (has the right length) + // but the *value* is disallowed. + // + // This includes overlong encodings, surrogates and values too large for Unicode. + // + // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the + // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a + // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together. + // If all 3 lookups detect the same error, it's an error. + // + really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) + { + // + // These are the errors we're going to match for bytes 1-2, by looking at the first three + // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> + // + static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) + static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ + static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ + static const int SURROGATE = 0x08; // 11101101 [101_]____ + static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ + static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ -#endif // SIMDJSON_ARM64_NUMBERPARSING_H -/* end file src/generic/numberparsing.h */ + // New with lookup3. 
We want to catch the case where an non-continuation + // follows a leading byte + static const int TOO_SHORT_2_3_4 = 0x40; // (110_|1110|1111) ____ (0___|110_|1111) ____ + // We also want to catch a continuation that is preceded by an ASCII byte + static const int LONELY_CONTINUATION = 0x80; // 0___ ____ 01__ ____ -namespace simdjson::arm64 { + // After processing the rest of byte 1 (the low bits), we're still not done--we have to check + // byte 2 to be sure which things are errors and which aren't. + // Since high_bits is byte 5, byte 2 is high_bits.prev<3> + static const int CARRY = OVERLONG_2 | TOO_LARGE_2; + const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // Continuations: ________ [10__]____ + CARRY | OVERLONG_3 | OVERLONG_4 | LONELY_CONTINUATION, // ________ [1000]____ + CARRY | OVERLONG_3 | TOO_LARGE | LONELY_CONTINUATION, // ________ [1001]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1010]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1011]____ + // Multibyte Leads: ________ [11__]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, // 110_ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4); + const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( + // [0___]____ (ASCII) + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + // [10__]____ (continuation) + 0, 0, 0, 0, + // [11__]____ (2+-byte leads) + OVERLONG_2 | TOO_SHORT_2_3_4, TOO_SHORT_2_3_4, // [110_]____ (2-byte lead) + OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4, // [1110]____ (3-byte lead) + OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 | TOO_SHORT_2_3_4 // [1111]____ (4+-byte lead) + ); + const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( + // ____[00__] ________ + OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0000] ________ + OVERLONG_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0001] ________ + TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[01__] ________ + TOO_LARGE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0100] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[10__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[11__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[1101] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION); + return byte_1_high & byte_1_low & byte_2_high; + } -/* begin file src/generic/atomparsing.h */ -namespace atomparsing { + really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, + simd8<uint8_t> prev1) + { + simd8<uint8_t> prev2 = input.prev<2>(prev_input); + simd8<uint8_t> prev3 = input.prev<3>(prev_input); + 
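+        // Worked example (illustrative bytes only): if the previous block ends with the
+        // 4-byte lead 0xF0 and this block starts with its continuations 0x9F 0x92 0xA9
+        // (U+1F4A9), then at the position of 0x92, prev1 is 0x9F and prev2 is 0xF0,
+        // while prev3 still reaches back into prev_input; that is why the shifted
+        // copies of the previous block are needed here.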
// is_2_3_continuation uses one more instruction than lookup2 + simd8<bool> is_2_3_continuation = (simd8<int8_t>(input).max(simd8<int8_t>(prev1))) < int8_t(-64); + // must_be_2_3_continuation has two fewer instructions than lookup 2 + return simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3) ^ is_2_3_continuation); + } -really_inline uint32_t string_to_uint32(const char* str) { return *reinterpret_cast<const uint32_t *>(str); } + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) + { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1}; + const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); + return input.gt_bits(max_value); + } -WARN_UNUSED -really_inline bool str4ncmp(const uint8_t *src, const char* atom) { - uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING); - std::memcpy(&srcval, src, sizeof(uint32_t)); - return srcval ^ string_to_uint32(atom); -} + struct utf8_checker + { + // If this is nonzero, there has been a UTF-8 error. + simd8<uint8_t> error; + // The last input we received + simd8<uint8_t> prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8<uint8_t> prev_incomplete; -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src) { - return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; -} + // + // Check whether the current bytes are valid UTF-8. + // + really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) + { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8<uint8_t> prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, prev1); + } -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_true_atom(src); } - else if (len == 4) { return !str4ncmp(src, "true"); } - else { return false; } -} + // The only problem that can happen at EOF is that a multibyte character is too short. + really_inline void check_eof() + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src) { - return (str4ncmp(src+1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; -} + really_inline void check_next_input(simd8x64<uint8_t> input) + { + if (likely(is_ascii(input))) + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. 
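+          // Concrete case (illustrative): if the previous 64-byte block ended with a lone
+          // 0xE2 lead (a 3-byte character missing both continuation bytes), prev_incomplete
+          // is nonzero, and since this block is pure ASCII it cannot complete that
+          // character, so the pending bits are folded into the error below.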
+ this->error |= this->prev_incomplete; + } + else + { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + for (int i = 1; i < simd8x64<uint8_t>::NUM_CHUNKS; i++) + { + this->check_utf8_bytes(input.chunks[i], input.chunks[i - 1]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]; + } + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { - if (len > 5) { return is_valid_false_atom(src); } - else if (len == 5) { return !str4ncmp(src+1, "alse"); } - else { return false; } -} + really_inline error_code errors() + { + return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + } -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src) { - return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; -} + }; // struct utf8_checker + } // namespace utf8_validation -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_null_atom(src); } - else if (len == 4) { return !str4ncmp(src, "null"); } - else { return false; } -} + using utf8_validation::utf8_checker; + /* end file src/generic/stage1/utf8_lookup3_algorithm.h */ + /* begin file src/generic/stage1/json_structural_indexer.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -} // namespace atomparsing -/* end file src/generic/atomparsing.h */ -/* begin file src/generic/stage2_build_tape.h */ -// This file contains the common code every implementation uses for stage2 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + namespace stage1 + { -namespace stage2 { + class bit_indexer + { + public: + uint32_t *tail; -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address; -#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } -#define GOTO(address) { goto *(address); } -#define CONTINUE(address) { goto *(address); } -#else -typedef char ret_address; -#define INIT_ADDRESSES() { '[', 'a', 'e', 'f', '{', 'o' }; -#define GOTO(address) \ - { \ - switch(address) { \ - case '[': goto array_begin; \ - case 'a': goto array_continue; \ - case 'e': goto error; \ - case 'f': goto finish; \ - case '{': goto object_begin; \ - case 'o': goto object_continue; \ - } \ - } -// For the more constrained end_xxx() situation -#define CONTINUE(address) \ - { \ - switch(address) { \ - case 'a': goto array_continue; \ - case 'o': goto object_continue; \ - case 'f': goto finish; \ - } \ - } -#endif + really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} -struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; -}; + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs 
to be large enough to handle this + really_inline void write(uint32_t idx, uint64_t bits) + { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + int cnt = static_cast<int>(count_ones(bits)); -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } + // Do the first 8 all together + for (int i = 0; i < 8; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } -class structural_iterator { -public: - really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {} - really_inline char advance_char() { - idx = structural_indexes[next_structural]; - next_structural++; - c = *current(); - return c; - } - really_inline char current_char() { - return c; - } - really_inline const uint8_t* current() { - return &buf[idx]; - } - really_inline size_t remaining_len() { - return len - idx; - } - template<typename F> - really_inline bool with_space_terminated_copy(const F& f) { - /** - * We need to make a copy to make sure that the string is space terminated. - * This is not about padding the input, which should already padded up - * to len + SIMDJSON_PADDING. However, we have no control at this stage - * on how the padding was done. What if the input string was padded with nulls? - * It is quite common for an input string to have an extra null character (C string). - * We do not want to allow 9\0 (where \0 is the null character) inside a JSON - * document, but the string "9\0" by itself is fine. So we make a copy and - * pad the input with spaces when we know that there is just one input element. - * This copy is relatively expensive, but it will almost never be called in - * practice unless you are in the strange scenario where you have many JSON - * documents made of single atoms. - */ - char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - return true; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast<const uint8_t*>(copy), idx); - free(copy); - return result; - } - really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; - } - really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; - } - really_inline size_t next_structural_index() { - return next_structural; - } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) + { + for (int i = 8; i < 16; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx; // location of the structural character in the input (buf) - uint8_t c; // used to track the (structural) character we are looking at -}; + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. 
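+            // Worked example of the overall scheme (illustrative values): with idx = 64 and
+            // bits = 0b101, the unrolled loop above stores 64 and 66 into tail[0] and
+            // tail[1]; the slots written past cnt hold garbage but are harmless because
+            // tail is only advanced by cnt below, so the next call overwrites them.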
+ if (unlikely(cnt > 16)) + { + int i = 16; + do + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; - uint32_t depth; + this->tail += cnt; + } + }; - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + class json_structural_indexer + { + public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template <size_t STEP_SIZE> + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - doc_parser.on_start_document(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + private: + really_inline json_structural_indexer(uint32_t *structural_indexes); + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - doc_parser.on_start_object(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; + }; - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - doc_parser.on_start_array(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} - really_inline bool end_object() { - depth--; - doc_parser.on_end_object(depth); - return false; - } - really_inline bool end_array() { - depth--; - doc_parser.on_end_array(depth); - return false; - } - really_inline bool end_document() { - depth--; - doc_parser.on_end_document(depth); - return false; - } + // + // PERF NOTES: + // We pipe 2 inputs through these stages: + // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load + // 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. + // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. + // The output of step 1 depends entirely on this information. These functions don't quite use + // up enough CPU: the second half of the functions is highly serial, only using 1 execution core + // at a time. The second input's scans has some dependency on the first ones finishing it, but + // they can make a lot of progress before they need that information. + // 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that + // to finish: utf-8 checks and generating the output from the last iteration. + // + // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all + // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough + // workout. + // + template <size_t STEP_SIZE> + error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept + { + if (unlikely(len > parser.capacity())) + { + return CAPACITY; + } + if (partial) + { + len = trim_partial_utf8(buf, len); + } - WARN_UNUSED really_inline bool parse_string() { - uint8_t *dst = doc_parser.on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); - if (dst == nullptr) { - return true; - } - return !doc_parser.on_end_string(dst); - } + buf_block_reader<STEP_SIZE> reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); - WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - return !numberparsing::parse_number(src, found_minus, doc_parser); - } - WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } + // Read all but the last block + while (reader.has_full_block()) + { + indexer.step<STEP_SIZE>(reader.full_block(), reader); + } - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) + { + return EMPTY; + } + indexer.step<STEP_SIZE>(block, reader); - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + return indexer.finish(parser, reader.block_index(), len, partial); + } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { - case '"': - FAIL_IF( parse_string() ); - return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); - return continue_state; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( parse_number(false) ); - return continue_state; - case '-': - FAIL_IF( parse_number(true) ); - return continue_state; - case '{': - FAIL_IF( start_object(continue_state) ); - return 
addresses.object_begin; - case '[': - FAIL_IF( start_array(continue_state) ); - return addresses.array_begin; - default: - return addresses.error; - } - } + template <> + really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + simd::simd8x64<uint8_t> in_2(block + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index() + 64); + reader.advance(); + } - WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } + template <> + really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); + } - return doc_parser.on_success(SUCCESS); - } + really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) + { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx - 64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); + } - WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), - * pessimistically. - * doc_parser.is_valid = false; - * At this point in the code, we have all the time in the world. - * Note that we know exactly where we are in the document so we could, - * without any overhead on the processing code, report a specific - * location. - * We could even trigger special code paths to assess what happened - * carefully, - * all without any added cost. 
*/ - if (depth >= doc_parser.max_depth()) { - return doc_parser.on_error(DEPTH_ERROR); - } - switch (structurals.current_char()) { - case '"': - return doc_parser.on_error(STRING_ERROR); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - return doc_parser.on_error(NUMBER_ERROR); - case 't': - return doc_parser.on_error(T_ATOM_ERROR); - case 'n': - return doc_parser.on_error(N_ATOM_ERROR); - case 'f': - return doc_parser.on_error(F_ATOM_ERROR); - default: - return doc_parser.on_error(TAPE_ERROR); - } - } + really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) + { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx - 64), prev_structurals); - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - doc_parser.init_stage2(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; - } - // Advance to the first character as soon as possible - structurals.advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_state)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) + { + return error; + } - really_inline char advance_char() { - return structurals.advance_char(); - } -}; + if (unescaped_chars_error) + { + return UNESCAPED_CHARS; + } -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) + { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) + { + return UNEXPECTED_ERROR; + } + if (partial) + { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) + { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. 
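+        // Rough illustration of the partial path: for a batch ending in
+        //   {"a":1} {"b":2} {"c
+        // find_next_document_index() stops after the last complete document, so the
+        // truncated {"c is left for a later batch; only when not even the first
+        // document is complete do we bail out with CAPACITY above.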
+ } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); + } -} // namespace stage2 + } // namespace stage1 + /* end file src/generic/stage1/json_structural_indexer.h */ + WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept + { + this->buf = _buf; + this->len = _len; + return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); + } + /* begin file src/generic/stage1/utf8_validator.h */ + namespace stage1 + { + /** + * Validates that the string is actual UTF-8. + */ + template <class checker> + bool generic_validate_utf8(const uint8_t *input, size_t length) + { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) + { + simd::simd8x64<uint8_t> in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64<uint8_t> in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; + } -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } + bool generic_validate_utf8(const char *input, size_t length) + { + return generic_validate_utf8<utf8_checker>((const uint8_t *)input, length); + } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + } // namespace stage1 + /* end file src/generic/stage1/utf8_validator.h */ + WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept + { + return simdjson::haswell::stage1::generic_validate_utf8(buf, len); + } + } // namespace haswell +} // namespace simdjson +UNTARGET_REGION // -// Object parser states +// Stage 2 // -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_state; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } +/* begin file src/haswell/stringparsing.h */ +#ifndef SIMDJSON_HASWELL_STRINGPARSING_H +#define SIMDJSON_HASWELL_STRINGPARSING_H -object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ +/* haswell/simd.h already included: #include "haswell/simd.h" */ +/* 
haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ +/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + using namespace simd; -// -// Array parser states -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } + // Holds backslashes and quotes locations. + struct backslash_and_quote + { + public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + uint32_t bs_bits; + uint32_t quote_bits; + }; // struct backslash_and_quote -finish: - return parser.finish(); + really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) + { + // this can read up to 15 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8<uint8_t> v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); + return { + (uint32_t)(v == '\\').to_bitmask(), // bs_bits + (uint32_t)(v == '"').to_bitmask(), // quote_bits + }; + } -error: - return parser.error(); -} + /* begin file src/generic/stage2/stringparsing.h */ + // This file contains the common code every implementation uses + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "stringparsing.h" (this simplifies amalgation) -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2_build_tape.h */ -/* begin file src/generic/stage2_streaming_build_tape.h */ -namespace stage2 { + namespace stage2 + { + namespace stringparsing + { -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {} + // begin copypasta + // These chars yield themselves: " \ / + // b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab + // u not handled in this 
table as it's complex + static const uint8_t escape_map[256] = { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x0. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x22, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x2f, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - doc_parser.init_stage2(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x4. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x5c, + 0, + 0, + 0, // 0x5. + 0, + 0, + 0x08, + 0, + 0, + 0, + 0x0c, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x0a, + 0, // 0x6. + 0, + 0, + 0x0d, + 0, + 0x09, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x7. - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, -} // namespace stage2 + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. 
- ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + // handle a unicode codepoint + // write appropriate values into dest + // src will advance 6 bytes or 12 bytes + // dest will advance a variable amount (return via pointer) + // return true if the unicode codepoint was valid + // We work in little-endian then swap at write time + WARN_UNUSED + really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) + { + // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + // check for low surrogate for characters outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) + { + if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') + { + return false; + } + uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + // if the first code point is invalid we will get here, as we will go past + // the check for being outside the Basic Multilingual plane. If we don't + // find a \u immediately afterwards we fail out anyhow, but if we do, + // this check catches both the case of the first code point being invalid + // or the second code point being invalid. 
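+          // Worked example: "\uD83D\uDE00" (U+1F600). code_point = 0xD83D lies in
+          // [0xD800, 0xDC00), code_point_2 = 0xDE00, both fit in 16 bits, and
+          // ((0xD83D - 0xD800) << 10 | (0xDE00 - 0xDC00)) + 0x10000 = 0x1F600.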
+ if ((code_point | code_point_2) >> 16) + { + return false; + } -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + code_point = + (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; + *src_ptr += 6; + } + size_t offset = codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; + } -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) + { + src++; + while (1) + { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) + { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) + { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') + { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) + { + return nullptr; + } + } + else + { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) + { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } + else + { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. 
*/ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; + } -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + } // namespace stringparsing + } // namespace stage2 + /* end file src/generic/stage2/stringparsing.h */ -// -// Array parser parsers -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } + } // namespace haswell +} // namespace simdjson +UNTARGET_REGION -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); +#endif // SIMDJSON_HASWELL_STRINGPARSING_H +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/haswell/numberparsing.h */ +#ifndef SIMDJSON_HASWELL_NUMBERPARSING_H +#define SIMDJSON_HASWELL_NUMBERPARSING_H -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } +/* jsoncharutils.h already included: #include "jsoncharutils.h" */ +/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ +/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ +#include <cmath> +#include <limits> -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); +#endif -error: - return parser.error(); -} -/* end file src/generic/stage2_streaming_build_tape.h */ +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { + static inline uint32_t parse_eight_digits_unrolled(const char *chars) + { + // this actually computes *16* values so we are being wasteful. 
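+      // Worked example: for chars = "12345678", subtracting '0' leaves the bytes
+      // 1..8; the maddubs step pairs them into 12, 34, 56, 78; the first madd into
+      // 1234 and 5678; and the final madd returns 1234 * 10000 + 5678 = 12345678.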
+ const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest + } -} // namespace simdjson::arm64 +#define SWAR_NUMBER_PARSING -#endif // SIMDJSON_ARM64_STAGE2_BUILD_TAPE_H -/* end file src/generic/stage2_streaming_build_tape.h */ + /* begin file src/generic/stage2/numberparsing.h */ + namespace stage2 + { + namespace numberparsing + { + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE)) #endif -#if SIMDJSON_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/stage2_build_tape.h */ -#ifndef SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H -#define SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H + // Attempts to compute i * 10^(power) exactly; and if "negative" is + // true, negate the result. + // This function will only work in some cases, when it does not work, success is + // set to false. This should work *most of the time* (like 99% of the time). + // We assume that power is in the [FASTFLOAT_SMALLEST_POWER, + // FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. + really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) + { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) + { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) + { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + double d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. 
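+          // Worked example: for the input "2.5" we have i = 25 and power = -1;
+          // both 25 and 10 are exact binary64 values, so 25.0 / 10.0 is the
+          // correctly rounded result and no fallback is required.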
+ // + if (power < 0) + { + d = d / power_of_ten[-power]; + } + else + { + d = d * power_of_ten[power]; + } + if (negative) + { + d = -d; + } + *success = true; + return d; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It wa + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html -/* fallback/implementation.h already included: #include "fallback/implementation.h" */ -/* begin file src/fallback/stringparsing.h */ -#ifndef SIMDJSON_FALLBACK_STRINGPARSING_H -#define SIMDJSON_FALLBACK_STRINGPARSING_H + // The fast path has now failed, so we are failing back on the slower path. -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if (i == 0) + { + return 0.0; + } -namespace simdjson::fallback { + // We are going to need to do some 64-bit arithmetic to get a more precise product. + // We use a table lookup approach. + components c = + power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; + // safe because + // power >= FASTFLOAT_SMALLEST_POWER + // and power <= FASTFLOAT_LARGEST_POWER + // we recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + uint64_t factor_mantissa = c.mantissa; -// Holds backslashes and quotes locations. -struct backslash_and_quote { -public: - static constexpr uint32_t BYTES_PROCESSED = 1; - really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + value128 product = full_multiplication(i, factor_mantissa); + uint64_t lower = product.low; + uint64_t upper = product.high; - really_inline bool has_quote_first() { return c == '"'; } - really_inline bool has_backslash() { return c == '\\'; } - really_inline int quote_index() { return c == '"' ? 0 : 1; } - really_inline int backslash_index() { return c == '\\' ? 0 : 1; } + // We know that upper has at most one leading zero because + // both i and factor_mantissa have a leading one. This means + // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). - uint8_t c; -}; // struct backslash_and_quote + // As long as the first 9 bits of "upper" are not "1", then we + // know that we have an exact computed value for the leading + // 55 bits because any imprecision would play out as a +1, in + // the worst case. 
+ if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) + { + uint64_t factor_mantissa_low = + mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; + // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit + // result (three 64-bit values) + product = full_multiplication(i, factor_mantissa_low); + uint64_t product_low = product.low; + uint64_t product_middle2 = product.high; + uint64_t product_middle1 = lower; + uint64_t product_high = upper; + uint64_t product_middle = product_middle1 + product_middle2; + if (product_middle < product_middle1) + { + product_high++; // overflow carry + } + // We want to check whether mantissa *i + i would affect our result. + // This does happen, e.g. with 7.3177701707893310e+15. + if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && + (product_low + i < product_low))) + { // let us be prudent and bail out. + *success = false; + return 0; + } + upper = product_high; + lower = product_middle; + } + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); -really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { - // store to dest unconditionally - we can overwrite the bits we don't like later - dst[0] = src[0]; - return { src[0] }; -} + // Here we have mantissa < (1<<54). -/* begin file src/generic/stringparsing.h */ -// This file contains the common code every implementation uses -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "stringparsing.h" (this simplifies amalgation) + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && + ((mantissa & 3) == 1))) + { + // if mantissa & 1 == 1 we might need to round up. + // + // Scenarios: + // 1. We are not in the middle. Then we should round up. + // + // 2. We are right in the middle. Whether we round up depends + // on the last significant bit: if it is "one" then we round + // up (round to even) otherwise, we do not. + // + // So if the last significant bit is 1, we can safely round up. + // Hence we only need to bail out if (mantissa & 3) == 1. + // Otherwise we may need more accuracy or analysis to determine whether + // we are exactly between two floating-point numbers. + // It can be triggered with 1e23. + // Note: because the factor_mantissa and factor_mantissa_low are + // almost always rounded down (except for small positive powers), + // almost always should round up. + *success = false; + return 0; + } -namespace stringparsing { + mantissa += mantissa & 1; + mantissa >>= 1; -// begin copypasta -// These chars yield themselves: " \ / -// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab -// u not handled in this table as it's complex -static const uint8_t escape_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. 
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) + { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + lz--; // undo previous addition + } + mantissa &= ~(1ULL << 52); + uint64_t real_exponent = c.exp - lz; + // we have to check that real_exponent is in range, otherwise we bail out + if (unlikely((real_exponent < 1) || (real_exponent > 2046))) + { + *success = false; + return 0; + } + mantissa |= real_exponent << 52; + mantissa |= (((uint64_t)negative) << 63); + double d; + memcpy(&d, &mantissa, sizeof(d)); + *success = true; + return d; + } // namespace numberparsing - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. - 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. - 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + static bool parse_float_strtod(const char *ptr, double *outDouble) + { + char *endptr; + *outDouble = strtod(ptr, &endptr); + // Some libraries will set errno = ERANGE when the value is subnormal, + // yet we may want to be able to parse subnormal values. + // However, we do not want to tolerate NAN or infinite values. + // + // Values like infinity or NaN are not allowed in the JSON specification. + // If you consume a large value and you map it to "infinity", you will no + // longer be able to serialize back a standard-compliant JSON. And there is + // no realistic application where you might need values so large than they + // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // 10^308 It is an unimaginable large number. There will never be any piece of + // engineering involving as many as 10^308 parts. It is estimated that there + // are about 10^80 atoms in the universe. The estimate for the total number + // of electrons is similar. Using a double-precision floating-point value, we + // can represent easily the number of atoms in the universe. We could also + // represent the number of ways you can pick any three individual atoms at + // random in the universe. If you ever encounter a number much larger than + // 10^308, you know that you have a bug. RapidJSON will reject a document with + // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) + // will flat out throw an exception. 
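// ---- editor's note: hedged sketch, not part of the simdjson sources ----
// Example of the kind of input the isfinite() check just below screens out,
// assuming an IEEE-754 double and standard <cstdlib>/<cmath> behaviour
// (the sketch_* names are illustrative only):
//   char *sketch_end;
//   double sketch_d = strtod("1e400", &sketch_end);  // overflows binary64 -> HUGE_VAL
//   bool sketch_rejected = !std::isfinite(sketch_d); // true, so the parse is refused
// ---- end editor's note ----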
+ // + if ((endptr == ptr) || (!std::isfinite(*outDouble))) + { + return false; + } + return true; + } - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + really_inline bool is_integer(char c) + { + return (c >= '0' && c <= '9'); + // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers + } - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; + // check quickly whether the next 8 chars are made of digits + // at a glance, it looks better than Mula's + // http://0x80.pl/articles/swar-digits-validate.html + really_inline bool is_made_of_eight_digits_fast(const char *chars) + { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); + } -// handle a unicode codepoint -// write appropriate values into dest -// src will advance 6 bytes or 12 bytes -// dest will advance a variable amount (return via pointer) -// return true if the unicode codepoint was valid -// We work in little-endian then swap at write time -WARN_UNUSED -really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, - uint8_t **dst_ptr) { - // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the - // conversion isn't valid; we defer the check for this to inside the - // multilingual plane check - uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); - *src_ptr += 6; - // check for low surrogate for characters outside the Basic - // Multilingual Plane. - if (code_point >= 0xd800 && code_point < 0xdc00) { - if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { - return false; - } - uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); + template <typename W> + bool slow_float_parsing(UNUSED const char *src, W writer) + { + double d; + if (parse_float_strtod(src, &d)) + { + WRITE_DOUBLE(d, (const uint8_t *)src, writer); + return true; + } + return INVALID_NUMBER((const uint8_t *)src); + } - // if the first code point is invalid we will get here, as we will go past - // the check for being outside the Basic Multilingual plane. If we don't - // find a \u immediately afterwards we fail out anyhow, but if we do, - // this check catches both the case of the first code point being invalid - // or the second code point being invalid. - if ((code_point | code_point_2) >> 16) { - return false; - } + really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) + { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. 
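// ---- editor's note: hedged sketch, not part of the simdjson sources ----
// The "integer fiction" in concrete terms: for the input "123.456" the digits
// are accumulated as if the '.' were absent, and the decimal point is
// remembered as a negative power of ten, so the value is i * 10^exponent:
//   i        = 123456  // "123" from the integer part, then "456" added here
//   exponent = -3      // three digits were read after the period
// 123456 * 10^-3 == 123.456, recovered losslessly by compute_float_64's fast
// path as long as i fits in 53 bits.
// ---- end editor's note ----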
+ const char *const first_after_period = p; + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // There must be at least one digit after the . - code_point = - (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; - *src_ptr += 6; - } - size_t offset = codepoint_to_utf8(code_point, *dst_ptr); - *dst_ptr += offset; - return offset > 0; -} - -WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { - src++; - while (1) { - // Copy the next n bytes, and find the backslash and quote in them. - auto bs_quote = backslash_and_quote::copy_and_find(src, dst); - // If the next thing is the end quote, copy and return - if (bs_quote.has_quote_first()) { - // we encountered quotes first. Move dst to point to quotes and exit - return dst + bs_quote.quote_index(); - } - if (bs_quote.has_backslash()) { - /* find out where the backspace is */ - auto bs_dist = bs_quote.backslash_index(); - uint8_t escape_char = src[bs_dist + 1]; - /* we encountered backslash first. Handle backslash */ - if (escape_char == 'u') { - /* move src/dst up to the start; they will be further adjusted - within the unicode codepoint handling code. */ - src += bs_dist; - dst += bs_dist; - if (!handle_unicode_codepoint(&src, &dst)) { - return nullptr; + unsigned char digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // might overflow + multiplication by 10 is likely + // cheaper than arbitrary mult. + // we will handle the overflow later +#ifdef SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + // because we have parse_highprecision_float later. + } + exponent = first_after_period - p; + return true; } - } else { - /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and - * write bs_dist+1 characters to output - * note this may reach beyond the part of the buffer we've actually - * seen. I think this is ok */ - uint8_t escape_result = escape_map[escape_char]; - if (escape_result == 0u) { - return nullptr; /* bogus escape value is an error */ - } - dst[bs_dist] = escape_result; - src += bs_dist + 2; - dst += bs_dist + 1; - } - } else { - /* they are the same. Since they can't co-occur, it means we - * encountered neither. 
*/ - src += backslash_and_quote::BYTES_PROCESSED; - dst += backslash_and_quote::BYTES_PROCESSED; - } - } - /* can't be reached */ - return nullptr; -} -} // namespace stringparsing -/* end file src/generic/stringparsing.h */ + really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) + { + bool neg_exp = false; + if ('-' == *p) + { + neg_exp = true; + ++p; + } + else if ('+' == *p) + { + ++p; + } -} // namespace simdjson::fallback + // e[+-] must be followed by a number + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + unsigned char digit = static_cast<unsigned char>(*p - '0'); + int64_t exp_number = digit; + p++; + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + while (is_integer(*p)) + { + // we need to check for overflows; we refuse to parse this + if (exp_number > 0x100000000) + { + return INVALID_NUMBER(src); + } + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + exponent += (neg_exp ? -exp_number : exp_number); + return true; + } -#endif // SIMDJSON_FALLBACK_STRINGPARSING_H -/* end file src/generic/stringparsing.h */ -/* begin file src/fallback/numberparsing.h */ -#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_H -#define SIMDJSON_FALLBACK_NUMBERPARSING_H + template <typename W> + really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char *start_digits, int digit_count, int64_t exponent, W &writer) + { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // digit count is off by 1 because of the decimal (assuming there was one). + if (unlikely((digit_count - 1 >= 19))) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const char *start = start_digits; + while ((*start == '0') || (*start == '.')) + { + start++; + } + // we over-decrement by one when there is a '.' + digit_count -= int(start - start_digits); + if (digit_count >= 19) + { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; + } + } + // NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) + { + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
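// ---- editor's note: hedged sketch, not part of the simdjson sources ----
// Inputs reaching this branch have a decimal exponent outside the
// precomputed power table (bounded by FASTFLOAT_SMALLEST_POWER and
// FASTFLOAT_LARGEST_POWER, roughly -325 and 308 in this version), e.g.:
//   "1e-400"  -> below the smallest power; strtod() underflows to 0.0,
//                which is finite and therefore accepted.
//   "9e+400"  -> above the largest power; strtod() returns an infinity,
//                which parse_float_strtod() rejects as invalid JSON.
// ---- end editor's note ----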
+ bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; + } + bool success = true; + double d = compute_float_64(exponent, i, negative, &success); + if (!success) + { + // we are almost never going to get here. + if (!parse_float_strtod((const char *)src, &d)) + { + return INVALID_NUMBER(src); + } + } + WRITE_DOUBLE(d, src, writer); + return true; + } -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -/* begin file src/fallback/bitmanipulation.h */ -#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H -#define SIMDJSON_FALLBACK_BITMANIPULATION_H + // parse the number at src + // define JSON_TEST_NUMBERS for unit testing + // + // It is assumed that the number is followed by a structural ({,},],[) character + // or a white space character. If that is not the case (e.g., when the JSON + // document is made of a single number), then it is necessary to copy the + // content and append a space before calling this function. + // + // Our objective is accurate parsing (ULP of 0) at high speed. + template <typename W> + really_inline bool parse_number(UNUSED const uint8_t *const src, + UNUSED bool found_minus, + W &writer) + { +#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes \ + // useful to skip parsing + writer.append_s64(0); // always write zero + return true; // always succeeds +#else + const char *p = reinterpret_cast<const char *>(src); + bool negative = false; + if (found_minus) + { + ++p; + negative = true; + // a negative sign must be followed by an integer + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + } + const char *const start_digits = p; -#include <limits> + uint64_t i; // an unsigned int avoids signed overflows (which are bad) + if (*p == '0') + { + ++p; + if (is_integer(*p)) + { + return INVALID_NUMBER(src); + } // 0 cannot be followed by an integer + i = 0; + } + else + { + // NOTE: This is a redundant check--either we're negative, in which case we checked whether this + // is a digit above, or the caller already determined we start with a digit. But removing this + // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448 + // Please do try yourself, or think of ways to explain it--we'd love to understand :) + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // must start with an integer + unsigned char digit = static_cast<unsigned char>(*p - '0'); + i = digit; + p++; + // the is_made_of_eight_digits_fast routine is unlikely to help here because + // we rarely see large integer parts like 123456789 + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + } -namespace simdjson::fallback { + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' 
== *p) + { + is_float = true; + ++p; + if (!parse_decimal(src, p, i, exponent)) + { + return false; + } + } + int digit_count = int(p - start_digits); // used later to guard against overflows + if (('e' == *p) || ('E' == *p)) + { + is_float = true; + ++p; + if (!parse_exponent(src, p, exponent)) + { + return false; + } + } + if (is_float) + { + return write_float(src, negative, i, start_digits, digit_count, exponent, writer); + } -#ifndef _MSC_VER -// We sometimes call trailing_zero on inputs that are zero, -// but the algorithms do not end up using the returned value. -// Sadly, sanitizers are not smart enough to figure it out. -__attribute__((no_sanitize("undefined"))) // this is deliberate -#endif // _MSC_VER -/* result might be undefined when input_num is zero */ -really_inline int trailing_zeroes(uint64_t input_num) { + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + int longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) + { + return INVALID_NUMBER(src); + } + if (digit_count == longest_digit_count) + { + // Anything negative above INT64_MAX is either invalid or INT64_MIN. + if (negative && i > uint64_t(INT64_MAX)) + { + // If the number is negative and can't fit in a signed integer, it's invalid. + if (i > uint64_t(INT64_MAX) + 1) + { + return INVALID_NUMBER(src); + } -#ifdef _MSC_VER - unsigned long ret; - // Search the mask data from least significant bit (LSB) - // to the most significant bit (MSB) for a set bit (1). - _BitScanForward64(&ret, input_num); - return (int)ret; -#else - return __builtin_ctzll(input_num); -#endif // _MSC_VER + // If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN). + // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it. + WRITE_INTEGER(INT64_MIN, src, writer); + return is_structural_or_whitespace(*p); + } -} // namespace simdjson::arm64 + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) + { + return INVALID_NUMBER(src); + } + } -/* result might be undefined when input_num is zero */ -really_inline uint64_t clear_lowest_bit(uint64_t input_num) { - return input_num & (input_num-1); -} + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) + { + WRITE_UNSIGNED(i, src, writer); + } + else + { + WRITE_INTEGER(negative ? 0 - i : i, src, writer); + } + return is_structural_or_whitespace(*p); -/* result might be undefined when input_num is zero */ -really_inline int leading_zeroes(uint64_t input_num) { -#ifdef _MSC_VER - unsigned long leading_zero = 0; - // Search the mask data from most significant bit (MSB) - // to least significant bit (LSB) for a set bit (1). 
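// ---- editor's note: hedged worked examples, not part of the simdjson
// sources; they refer to the 19/20-digit integer boundary logic a few lines
// above (in the new parse_number), not to the deleted code around this spot
// in the diff:
//   "-9223372036854775808" -> i == uint64_t(INT64_MAX) + 1, special-cased
//                             and written as INT64_MIN.
//   "18446744073709551615" -> 20 digits, i > INT64_MAX, written with
//                             WRITE_UNSIGNED as a uint64_t.
//   "18446744073709551616" -> 2^64 wraps the 64-bit accumulator and is
//                             caught by the 20-digit overflow check, so
//                             INVALID_NUMBER is reported.
// ---- end editor's note ----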
- if (_BitScanReverse64(&leading_zero, input_num)) - return (int)(63 - leading_zero); - else - return 64; -#else - return __builtin_clzll(input_num); -#endif// _MSC_VER -} +#endif // SIMDJSON_SKIPNUMBERPARSING + } -really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - *result = value1 + value2; - return *result < value1; -} + } // namespace numberparsing + } // namespace stage2 + /* end file src/generic/stage2/numberparsing.h */ -really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - *result = value1 * value2; - // TODO there must be a faster way - return value2 > 0 && value1 > std::numeric_limits<uint64_t>::max() / value2; -} + } // namespace haswell -} // namespace simdjson::fallback +} // namespace simdjson +UNTARGET_REGION -#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H -/* end file src/fallback/bitmanipulation.h */ -#include <cmath> -#include <limits> +#endif // SIMDJSON_HASWELL_NUMBERPARSING_H +/* end file src/generic/stage2/numberparsing.h */ -#ifdef JSON_TEST_NUMBERS // for unit testing -void found_invalid_number(const uint8_t *buf); -void found_integer(int64_t result, const uint8_t *buf); -void found_unsigned_integer(uint64_t result, const uint8_t *buf); -void found_float(double result, const uint8_t *buf); -#endif +TARGET_HASWELL +namespace simdjson +{ + namespace haswell + { -namespace simdjson::fallback { -static inline uint32_t parse_eight_digits_unrolled(const char *chars) { - uint32_t result = 0; - for (int i=0;i<8;i++) { - result = result*10 + (chars[i] - '0'); - } - return result; -} + /* begin file src/generic/stage2/logger.h */ + // This is for an internal-only stage 2 specific logger. + // Set LOG_ENABLED = true to log what stage 2 is doing! + namespace logger + { + static constexpr const char *DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; -#define SWAR_NUMBER_PARSING + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; -/* begin file src/generic/numberparsing.h */ -namespace numberparsing { + static int log_depth; // Not threadsafe. Log only. + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) + { + if (c >= 0x20) + { + return c; + } + else + { + return ' '; + } + } -// Attempts to compute i * 10^(power) exactly; and if "negative" is -// true, negate the result. -// This function will only work in some cases, when it does not work, success is -// set to false. This should work *most of the time* (like 99% of the time). -// We assume that power is in the [FASTFLOAT_SMALLEST_POWER, -// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. -really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, - bool *success) { - // we start with a fast path - // It was described in - // Clinger WD. How to read floating point numbers accurately. - // ACM SIGPLAN Notices. 1990 - if (-22 <= power && power <= 22 && i <= 9007199254740991) { - // convert the integer into a double. This is lossless since - // 0 <= i <= 2^53 - 1. - double d = i; - // - // The general idea is as follows. 
- // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then - // 1) Both s and p can be represented exactly as 64-bit floating-point - // values - // (binary64). - // 2) Because s and p can be represented exactly as floating-point values, - // then s * p - // and s / p will produce correctly rounded values. - // - if (power < 0) { - d = d / power_of_ten[-power]; - } else { - d = d * power_of_ten[power]; - } - if (negative) { - d = -d; - } - *success = true; - return d; - } - // When 22 < power && power < 22 + 16, we could - // hope for another, secondary fast path. It wa - // described by David M. Gay in "Correctly rounded - // binary-decimal and decimal-binary conversions." (1990) - // If you need to compute i * 10^(22 + x) for x < 16, - // first compute i * 10^x, if you know that result is exact - // (e.g., when i * 10^x < 2^53), - // then you can still proceed and do (i * 10^x) * 10^22. - // Is this worth your time? - // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) - // for this second fast path to work. - // If you you have 22 < power *and* power < 22 + 16, and then you - // optimistically compute "i * 10^(x-22)", there is still a chance that you - // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of - // this optimization maybe less common than we would like. Source: - // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ - // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + // Print the header and set up log_start + static really_inline void log_start() + { + if (LOG_ENABLED) + { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN + 2, DASHES, LOG_BUFFER_LEN + 2, DASHES, 4 + 2, DASHES, 4 + 2, DASHES, 5 + 2, DASHES, 5 + 2, DASHES, LOG_DETAIL_LEN + 2, DASHES, LOG_INDEX_LEN + 2, DASHES); + } + } - // The fast path has now failed, so we are failing back on the slower path. + static really_inline void log_string(const char *message) + { + if (LOG_ENABLED) + { + printf("%s\n", message); + } + } - // In the slow path, we need to adjust i so that it is > 1<<63 which is always - // possible, except if i == 0, so we handle i == 0 separately. - if(i == 0) { - return 0.0; - } + // Logs a single line of + template <typename S> + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) + { + if (LOG_ENABLED) + { + printf("| %*s%s%-*s ", log_depth * 2, "", title_prefix, LOG_EVENT_LEN - log_depth * 2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. 
+ for (int i = 0; i < LOG_BUFFER_LEN; i++) + { + printf("%c", printable_char(structurals.current()[i])); + } + printf(" "); + } + printf("| %c ", printable_char(structurals.current_char())); + printf("| %c ", printable_char(structurals.peek_next_char())); + printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural + 1)]); + printf("| %5u ", structurals.next_tape_index()); + printf("| %-*s ", LOG_DETAIL_LEN, detail); + printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural); + printf("|\n"); + } + } + } // namespace logger - // We are going to need to do some 64-bit arithmetic to get a more precise product. - // We use a table lookup approach. - components c = - power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; - // safe because - // power >= FASTFLOAT_SMALLEST_POWER - // and power <= FASTFLOAT_LARGEST_POWER - // we recover the mantissa of the power, it has a leading 1. It is always - // rounded down. - uint64_t factor_mantissa = c.mantissa; + /* end file src/generic/stage2/logger.h */ + /* begin file src/generic/stage2/atomparsing.h */ + namespace stage2 + { + namespace atomparsing + { - // We want the most significant bit of i to be 1. Shift if needed. - int lz = leading_zeroes(i); - i <<= lz; - // We want the most significant 64 bits of the product. We know - // this will be non-zero because the most significant bit of i is - // 1. - value128 product = full_multiplication(i, factor_mantissa); - uint64_t lower = product.low; - uint64_t upper = product.high; + really_inline uint32_t string_to_uint32(const char *str) { return *reinterpret_cast<const uint32_t *>(str); } - // We know that upper has at most one leading zero because - // both i and factor_mantissa have a leading one. This means - // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). + WARN_UNUSED + really_inline uint32_t str4ncmp(const uint8_t *src, const char *atom) + { + uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); + } - // As long as the first 9 bits of "upper" are not "1", then we - // know that we have an exact computed value for the leading - // 55 bits because any imprecision would play out as a +1, in - // the worst case. - if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) { - uint64_t factor_mantissa_low = - mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; - // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit - // result (three 64-bit values) - product = full_multiplication(i, factor_mantissa_low); - uint64_t product_low = product.low; - uint64_t product_middle2 = product.high; - uint64_t product_middle1 = lower; - uint64_t product_high = upper; - uint64_t product_middle = product_middle1 + product_middle2; - if (product_middle < product_middle1) { - product_high++; // overflow carry - } - // We want to check whether mantissa *i + i would affect our result. - // This does happen, e.g. with 7.3177701707893310e+15. - if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && - (product_low + i < product_low))) { // let us be prudent and bail out. - *success = false; - return 0; - } - upper = product_high; - lower = product_middle; - } - // The final mantissa should be 53 bits with a leading 1. - // We shift it so that it occupies 54 bits with a leading 1. 
- /////// - uint64_t upperbit = upper >> 63; - uint64_t mantissa = upper >> (upperbit + 9); - lz += 1 ^ upperbit; + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src) + { + return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; + } - // Here we have mantissa < (1<<54). + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_true_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "true"); + } + else + { + return false; + } + } - // We have to round to even. The "to even" part - // is only a problem when we are right in between two floats - // which we guard against. - // If we have lots of trailing zeros, we may fall right between two - // floating-point values. - if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && - ((mantissa & 3) == 1))) { - // if mantissa & 1 == 1 we might need to round up. - // - // Scenarios: - // 1. We are not in the middle. Then we should round up. - // - // 2. We are right in the middle. Whether we round up depends - // on the last significant bit: if it is "one" then we round - // up (round to even) otherwise, we do not. - // - // So if the last significant bit is 1, we can safely round up. - // Hence we only need to bail out if (mantissa & 3) == 1. - // Otherwise we may need more accuracy or analysis to determine whether - // we are exactly between two floating-point numbers. - // It can be triggered with 1e23. - // Note: because the factor_mantissa and factor_mantissa_low are - // almost always rounded down (except for small positive powers), - // almost always should round up. - *success = false; - return 0; - } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src) + { + return (str4ncmp(src + 1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; + } - mantissa += mantissa & 1; - mantissa >>= 1; + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) + { + if (len > 5) + { + return is_valid_false_atom(src); + } + else if (len == 5) + { + return !str4ncmp(src + 1, "alse"); + } + else + { + return false; + } + } - // Here we have mantissa < (1<<53), unless there was an overflow - if (mantissa >= (1ULL << 53)) { - ////////// - // This will happen when parsing values such as 7.2057594037927933e+16 - //////// - mantissa = (1ULL << 52); - lz--; // undo previous addition - } - mantissa &= ~(1ULL << 52); - uint64_t real_exponent = c.exp - lz; - // we have to check that real_exponent is in range, otherwise we bail out - if (unlikely((real_exponent < 1) || (real_exponent > 2046))) { - *success = false; - return 0; - } - mantissa |= real_exponent << 52; - mantissa |= (((uint64_t)negative) << 63); - double d; - memcpy(&d, &mantissa, sizeof(d)); - *success = true; - return d; -} + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src) + { + return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; + } -static bool parse_float_strtod(const char *ptr, double *outDouble) { - char *endptr; - *outDouble = strtod(ptr, &endptr); - // Some libraries will set errno = ERANGE when the value is subnormal, - // yet we may want to be able to parse subnormal values. - // However, we do not want to tolerate NAN or infinite values. - // - // Values like infinity or NaN are not allowed in the JSON specification. - // If you consume a large value and you map it to "infinity", you will no - // longer be able to serialize back a standard-compliant JSON. 
And there is - // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × - // 10^308 It is an unimaginable large number. There will never be any piece of - // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number - // of electrons is similar. Using a double-precision floating-point value, we - // can represent easily the number of atoms in the universe. We could also - // represent the number of ways you can pick any three individual atoms at - // random in the universe. If you ever encounter a number much larger than - // 10^308, you know that you have a bug. RapidJSON will reject a document with - // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) - // will flat out throw an exception. - // - if ((endptr == ptr) || (!std::isfinite(*outDouble))) { - return false; - } - return true; -} + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_null_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "null"); + } + else + { + return false; + } + } -really_inline bool is_integer(char c) { - return (c >= '0' && c <= '9'); - // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers -} + } // namespace atomparsing + } // namespace stage2 + /* end file src/generic/stage2/atomparsing.h */ + /* begin file src/generic/stage2/structural_iterator.h */ + namespace stage2 + { -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... 
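// ---- editor's note: hedged sketch, not part of the simdjson sources; it
// refers to the new structural_iterator defined a little above in this hunk.
// structural_indexes holds byte offsets recorded by stage 1, and the
// iterator simply walks them, e.g. (offsets illustrative only):
//   input:                {"key":123}
//   structural_indexes ~  0, 1, 6, 7, 10    // '{', '"', ':', '1', '}'
//   current()      -> &buf[*current_structural]    // pointer into the input
//   advance_char() ->  buf[*++current_structural]  // character at next offset
// The exact set of recorded positions comes from stage 1, outside this hunk.
// ---- end editor's note ----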
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + class structural_iterator + { + public: + const uint8_t *const buf; + uint32_t *current_structural; + dom_parser_implementation &parser; -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} + // Start a structural + really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index) + : buf{_parser.buf}, + current_structural{&_parser.structural_indexes[start_structural_index]}, + parser{_parser} + { + } + // Get the buffer position of the current structural character + really_inline const uint8_t *current() + { + return &buf[*current_structural]; + } + // Get the current structural character + really_inline char current_char() + { + return buf[*current_structural]; + } + // Get the next structural character without advancing + really_inline char peek_next_char() + { + return buf[*(current_structural + 1)]; + } + really_inline char advance_char() + { + current_structural++; + return buf[*current_structural]; + } + really_inline size_t remaining_len() + { + return parser.len - *current_structural; + } -// check quickly whether the next 8 chars are made of digits -// at a glance, it looks better than Mula's -// http://0x80.pl/articles/swar-digits-validate.html -really_inline bool is_made_of_eight_digits_fast(const char *chars) { - uint64_t val; - // this can read up to 7 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(7 <= SIMDJSON_PADDING); - memcpy(&val, chars, 8); - // a branchy method might be faster: - // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) - // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == - // 0x3030303030303030); - return (((val & 0xF0F0F0F0F0F0F0F0) | - (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == - 0x3333333333333333); -} + really_inline bool past_end(uint32_t n_structural_indexes) + { + return current_structural >= &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_end(uint32_t n_structural_indexes) + { + return current_structural == &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_beginning() + { + return current_structural == parser.structural_indexes.get(); + } + }; -// called by parse_number when we know that the output is an integer, -// but where there might be some integer overflow. -// we want to catch overflows! -// Do not call this function directly as it skips some of the checks from -// parse_number -// -// This function will almost never be called!!! 
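// ---- editor's note: hedged sketch, not part of the simdjson sources; it
// summarizes the tape layout implied by the new tape_writer declared above
// and implemented further below (append()/append2()):
//   word 0: [ tape_type : 8 bits ][ payload or 0 : 56 bits ]
//   word 1: [ raw 64-bit value ]   // only for INT64, UINT64 and DOUBLE
// A full 64-bit value cannot share a word with its type byte, which is why
// append_s64/append_u64/append_double emit two words and skip_double()
// advances next_tape_loc by two.
// ---- end editor's note ----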
-// -never_inline bool parse_large_integer(const uint8_t *const src, - parser &parser, - bool found_minus) { - const char *p = reinterpret_cast<const char *>(src); + } // namespace stage2 + /* end file src/generic/stage2/structural_iterator.h */ + /* begin file src/generic/stage2/structural_parser.h */ + // This file contains the common code every implementation uses for stage2 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "simdjson/stage2.h" (this simplifies amalgation) - bool negative = false; - if (found_minus) { - ++p; - negative = true; - } - uint64_t i; - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - i = 0; - } else { - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - if (mul_overflow(i, 10, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } - if (add_overflow(i, digit, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } - ++p; - } - } - if (negative) { - if (i > 0x8000000000000000) { - // overflows! -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } else if (i == 0x8000000000000000) { - // In two's complement, we cannot represent 0x8000000000000000 - // as a positive signed integer, but the negative version is - // possible. - constexpr int64_t signed_answer = INT64_MIN; - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } else { - // we can negate safely - int64_t signed_answer = -static_cast<int64_t>(i); - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } - } else { - // we have a positive integer, the contract is that - // we try to represent it as a signed integer and only - // fallback on unsigned integers if absolutely necessary. - if (i < 0x8000000000000000) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - parser.on_number_s64(i); - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_unsigned_integer(i, src); -#endif - parser.on_number_u64(i); - } - } - return is_structural_or_whitespace(*p); -} + namespace stage2 + { + namespace + { // Make everything here private -bool slow_float_parsing(UNUSED const char * src, parser &parser) { - double d; - if (parse_float_strtod(src, &d)) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, (const uint8_t *)src); -#endif - return true; - } -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number((const uint8_t *)src); -#endif - return false; -} + /* begin file src/generic/stage2/tape_writer.h */ + struct tape_writer + { + /** The next place to write to tape */ + uint64_t *next_tape_loc; -// parse the number at src -// define JSON_TEST_NUMBERS for unit testing -// -// It is assumed that the number is followed by a structural ({,},],[) character -// or a white space character. If that is not the case (e.g., when the JSON -// document is made of a single number), then it is necessary to copy the -// content and append a space before calling this function. 
-// -// Our objective is accurate parsing (ULP of 0) at high speed. -really_inline bool parse_number(UNUSED const uint8_t *const src, - UNUSED bool found_minus, - parser &parser) { -#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes - // useful to skip parsing - parser.on_number_s64(0); // always write zero - return true; // always succeeds -#else - const char *p = reinterpret_cast<const char *>(src); - bool negative = false; - if (found_minus) { - ++p; - negative = true; - if (!is_integer(*p)) { // a negative sign must be followed by an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } - const char *const start_digits = p; + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; - uint64_t i; // an unsigned int avoids signed overflows (which are bad) - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - i = 0; - } else { - if (!(is_integer(*p))) { // must start with an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + digit; // might overflow, we will handle the overflow later - ++p; - } - } - int64_t exponent = 0; - bool is_float = false; - if ('.' == *p) { - is_float = true; // At this point we know that we have a float - // we continue with the fiction that we have an integer. If the - // floating point number is representable as x * 10^z for some integer - // z that fits in 53 bits, then we will be able to convert back the - // the integer into a float in a lossless manner. - ++p; - const char *const first_after_period = p; - if (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // might overflow + multiplication by 10 is likely - // cheaper than arbitrary mult. - // we will handle the overflow later - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } -#ifdef SWAR_NUMBER_PARSING - // this helps if we have lots of decimals! - // this turns out to be frequent enough. - if (is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); - p += 8; - } -#endif - while (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok - // because we have parse_highprecision_float later. 
- } - exponent = first_after_period - p; - } - int digit_count = - p - start_digits - 1; // used later to guard against overflows - int64_t exp_number = 0; // exponential part - if (('e' == *p) || ('E' == *p)) { - is_float = true; - ++p; - bool neg_exp = false; - if ('-' == *p) { - neg_exp = true; - ++p; - } else if ('+' == *p) { - ++p; - } - if (!is_integer(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - exp_number = digit; - p++; - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - while (is_integer(*p)) { - if (exp_number > 0x100000000) { // we need to check for overflows - // we refuse to parse this -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - exponent += (neg_exp ? -exp_number : exp_number); - } - if (is_float) { - // If we frequently had to deal with long strings of digits, - // we could extend our code by using a 128-bit integer instead - // of a 64-bit integer. However, this is uncommon in practice. - if (unlikely((digit_count >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. - // We have to handle the case where we have 0.0000somenumber. - const char *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } - // we over-decrement by one when there is a '.' - digit_count -= (start - start_digits); - if (digit_count >= 19) { - // Ok, chances are good that we had an overflow! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - // This will happen in the following examples: - // 10000000000000000000000000000000000000000000e+308 - // 3.1415926535897932384626433832795028841971693993751 - // - return slow_float_parsing((const char *) src, parser); - } - } - if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || - (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, parser); - } - bool success = true; - double d = compute_float_64(exponent, i, negative, &success); - if (!success) { - // we are almost never going to get here. - success = parse_float_strtod((const char *)src, &d); - } - if (success) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, src); -#endif - return true; - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } else { - if (unlikely(digit_count >= 18)) { // this is uncommon!!! - // there is a good chance that we had an overflow, so we need - // need to recover: we parse the whole thing again. - return parse_large_integer(src, parser, found_minus); - } - i = negative ? 0 - i : i; - parser.on_number_s64(i); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - } - return is_structural_or_whitespace(*p); -#endif // SIMDJSON_SKIPNUMBERPARSING -} + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; -} // namespace numberparsing -/* end file src/generic/numberparsing.h */ + /** Write a double value to tape. 
*/ + really_inline void append_double(double value) noexcept; -} // namespace simdjson::fallback + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; -#endif // SIMDJSON_FALLBACK_NUMBERPARSING_H -/* end file src/generic/numberparsing.h */ + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; -namespace simdjson::fallback { + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; -/* begin file src/generic/atomparsing.h */ -namespace atomparsing { + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; -really_inline uint32_t string_to_uint32(const char* str) { return *reinterpret_cast<const uint32_t *>(str); } + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; -WARN_UNUSED -really_inline bool str4ncmp(const uint8_t *src, const char* atom) { - uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING); - std::memcpy(&srcval, src, sizeof(uint32_t)); - return srcval ^ string_to_uint32(atom); -} + private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template <typename T> + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; + }; // struct number_writer -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src) { - return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; -} + really_inline void tape_writer::append_s64(int64_t value) noexcept + { + append2(0, value, internal::tape_type::INT64); + } -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_true_atom(src); } - else if (len == 4) { return !str4ncmp(src, "true"); } - else { return false; } -} + really_inline void tape_writer::append_u64(uint64_t value) noexcept + { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src) { - return (str4ncmp(src+1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; -} + /** Write a double value to tape. 
*/ + really_inline void tape_writer::append_double(double value) noexcept + { + append2(0, value, internal::tape_type::DOUBLE); + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { - if (len > 5) { return is_valid_false_atom(src); } - else if (len == 5) { return !str4ncmp(src+1, "alse"); } - else { return false; } -} + really_inline void tape_writer::skip() noexcept + { + next_tape_loc++; + } -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src) { - return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; -} + really_inline void tape_writer::skip_large_integer() noexcept + { + next_tape_loc += 2; + } -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_null_atom(src); } - else if (len == 4) { return !str4ncmp(src, "null"); } - else { return false; } -} + really_inline void tape_writer::skip_double() noexcept + { + next_tape_loc += 2; + } -} // namespace atomparsing -/* end file src/generic/atomparsing.h */ -/* begin file src/generic/stage2_build_tape.h */ -// This file contains the common code every implementation uses for stage2 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept + { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; + } -namespace stage2 { + template <typename T> + really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept + { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; + } + really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept + { + tape_loc = val | ((uint64_t(char(t))) << 56); + } + /* end file src/generic/stage2/tape_writer.h */ + #ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address; -#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } -#define GOTO(address) { goto *(address); } -#define CONTINUE(address) { goto *(address); } -#else -typedef char ret_address; -#define INIT_ADDRESSES() { '[', 'a', 'e', 'f', '{', 'o' }; -#define GOTO(address) \ - { \ - switch(address) { \ - case '[': goto array_begin; \ - case 'a': goto array_continue; \ - case 'e': goto error; \ - case 'f': goto finish; \ - case '{': goto object_begin; \ - case 'o': goto object_continue; \ - } \ +#define INIT_ADDRESSES() \ + { \ + &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue \ } +#define GOTO(address) \ + { \ + goto *(address); \ + } +#define CONTINUE(address) \ + { \ + goto *(address); \ + } +#else // SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() {'[', 'a', 'e', 'f', '{', 'o'}; +#define GOTO(address) \ + { \ + switch (address) \ + { \ + case '[': \ + goto array_begin; \ + case 'a': \ + goto array_continue; \ + case 'e': \ + goto error; \ + case 'f': \ + goto finish; \ + case '{': \ + goto object_begin; \ + case 'o': \ + goto object_continue; \ + } \ + } // For the more constrained end_xxx() situation -#define CONTINUE(address) \ - { \ - switch(address) { \ - case 'a': goto array_continue; \ - case 'o': goto object_continue; \ - case 'f': goto finish; \ - } \ +#define CONTINUE(address) 
\ + { \ + switch (address) \ + { \ + case 'a': \ + goto array_continue; \ + case 'o': \ + goto object_continue; \ + case 'f': \ + goto finish; \ + } \ } -#endif +#endif // SIMDJSON_USE_COMPUTED_GOTO -struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; -}; + struct unified_machine_addresses + { + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; + }; #undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } - -class structural_iterator { -public: - really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {} - really_inline char advance_char() { - idx = structural_indexes[next_structural]; - next_structural++; - c = *current(); - return c; +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + return addresses.error; \ + } \ } - really_inline char current_char() { - return c; - } - really_inline const uint8_t* current() { - return &buf[idx]; - } - really_inline size_t remaining_len() { - return len - idx; - } - template<typename F> - really_inline bool with_space_terminated_copy(const F& f) { - /** + + struct structural_parser : structural_iterator + { + /** Lets you append to the tape */ + tape_writer tape; + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; + + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} + { + } + + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) + { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; + depth++; + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) + { + log_error("Exceeded max depth!"); + } + return exceeded_max_depth; + } + + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) + { + log_start_value("document"); + return start_scope(continue_state); + } + + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) + { + log_start_value("object"); + return start_scope(continue_state); + } + + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) + { + log_start_value("array"); + return start_scope(continue_state); + } + + // this function is responsible for annotating the start of the scope + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept + { + depth--; + // write our doc->tape location to the header scope + // The root scope gets written *at* the previous location. + tape.append(parser.containing_scope[depth].tape_index, end); + // count can overflow if it exceeds 24 bits... 
so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } + + really_inline uint32_t next_tape_index() + { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + } + + really_inline void end_object() + { + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void end_array() + { + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + really_inline void end_document() + { + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + } + + // increment_count increments the count of keys in an object or values in an array. + // Note that if you are at the level of the values or elements, the count + // must be increment in the preceding depth (depth-1) where the array or + // the object resides. + really_inline void increment_count() + { + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + } + + really_inline uint8_t *on_start_string() noexcept + { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); + } + + really_inline void on_end_string(uint8_t *dst) noexcept + { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; + } + + WARN_UNUSED really_inline bool parse_string(bool key = false) + { + log_value(key ? "key" : "string"); + uint8_t *dst = on_start_string(); + dst = stringparsing::parse_string(current(), dst); + if (dst == nullptr) + { + log_error("Invalid escape in string"); + return true; + } + on_end_string(dst); + return false; + } + + WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) + { + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) + { + log_error("Invalid number"); + } + return !succeeded; + } + WARN_UNUSED really_inline bool parse_number(bool found_minus) + { + return parse_number(current(), found_minus); + } + + really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) + { + /** * We need to make a copy to make sure that the string is space terminated. * This is not about padding the input, which should already padded up * to len + SIMDJSON_PADDING. However, we have no control at this stage * on how the padding was done. What if the input string was padded with nulls? 
* It is quite common for an input string to have an extra null character (C string). @@ -9199,3441 +13290,4365 @@ * pad the input with spaces when we know that there is just one input element. * This copy is relatively expensive, but it will almost never be called in * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - return true; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast<const uint8_t*>(copy), idx); - free(copy); - return result; - } - really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; - } - really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; - } - really_inline size_t next_structural_index() { - return next_structural; - } + uint8_t *copy = static_cast<uint8_t *>(malloc(parser.len + SIMDJSON_PADDING)); + if (copy == nullptr) + { + return true; + } + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + size_t idx = *current_structural; + bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw + free(copy); + return result; + } + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) + { + switch (advance_char()) + { + case '"': + FAIL_IF(parse_string()); + return continue_state; + case 't': + log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(current())); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(current())); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(current())); + tape.append(0, internal::tape_type::NULL_VALUE); + return continue_state; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + FAIL_IF(parse_number(false)); + return continue_state; + case '-': + FAIL_IF(parse_number(true)); + return continue_state; + case '{': + FAIL_IF(start_object(continue_state)); + return addresses.object_begin; + case '[': + FAIL_IF(start_array(continue_state)); + return addresses.array_begin; + default: + log_error("Non-value found when value was expected!"); + return addresses.error; + } + } - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx; // location of the structural character in the input (buf) - uint8_t c; // used to track the (structural) character we are looking at -}; + WARN_UNUSED really_inline error_code finish() + { + end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; - uint32_t depth; + if (depth != 0) + { + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; + } - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + return SUCCESS; + } - WARN_UNUSED really_inline 
bool start_document(ret_address continue_state) { - doc_parser.on_start_document(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + WARN_UNUSED really_inline error_code error() + { + /* We do not need the next line because this is done by parser.init_stage2(), + * pessimistically. + * parser.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. */ + if (depth >= parser.max_depth()) + { + return parser.error = DEPTH_ERROR; + } + switch (current_char()) + { + case '"': + return parser.error = STRING_ERROR; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return parser.error = NUMBER_ERROR; + case 't': + return parser.error = T_ATOM_ERROR; + case 'n': + return parser.error = N_ATOM_ERROR; + case 'f': + return parser.error = F_ATOM_ERROR; + default: + return parser.error = TAPE_ERROR; + } + } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - doc_parser.on_start_object(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + really_inline void init() + { + log_start(); + parser.error = UNINITIALIZED; + } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - doc_parser.on_start_array(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) + { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) + { + return parser.error = EMPTY; + } - really_inline bool end_object() { - depth--; - doc_parser.on_end_object(depth); - return false; - } - really_inline bool end_array() { - depth--; - doc_parser.on_end_array(depth); - return false; - } - really_inline bool end_document() { - depth--; - doc_parser.on_end_document(depth); - return false; - } + init(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_state)) + { + return parser.error = DEPTH_ERROR; + } + return SUCCESS; + } - WARN_UNUSED really_inline bool parse_string() { - uint8_t *dst = doc_parser.on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); - if (dst == nullptr) { - return true; - } - return !doc_parser.on_end_string(dst); - } + really_inline void log_value(const char *type) + { + logger::log_line(*this, "", type, ""); + } - WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - return !numberparsing::parse_number(src, found_minus, doc_parser); - } - WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } + static really_inline void log_start() + { + logger::log_start(); + } - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if 
(!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + really_inline void log_start_value(const char *type) + { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) + { + logger::log_depth++; + } + } - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + really_inline void log_end_value(const char *type) + { + if (logger::LOG_ENABLED) + { + logger::log_depth--; + } + logger::log_line(*this, "-", type, ""); + } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { - case '"': - FAIL_IF( parse_string() ); - return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); - return continue_state; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( parse_number(false) ); - return continue_state; - case '-': - FAIL_IF( parse_number(true) ); - return continue_state; - case '{': - FAIL_IF( start_object(continue_state) ); - return addresses.object_begin; - case '[': - FAIL_IF( start_array(continue_state) ); - return addresses.array_begin; - default: - return addresses.error; - } + really_inline void log_error(const char *error) + { + logger::log_line(*this, "", "ERROR", error); + } + }; // struct structural_parser + +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + goto error; \ + } \ } - WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } + template <bool STREAMING> + WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept + { + dom_parser.doc = &doc; + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); + if (result) + { + return result; + } - return doc_parser.on_success(SUCCESS); - } + // + // Read first value + // + switch (parser.current_char()) + { + case '{': + FAIL_IF(parser.start_object(addresses.finish)); + goto object_begin; + case '[': + FAIL_IF(parser.start_array(addresses.finish)); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) + { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') + { + goto error; + } + } + goto array_begin; + case '"': + FAIL_IF(parser.parse_string()); + goto finish; + case 't': + parser.log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::NULL_VALUE); + goto finish; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(false)) + { + goto error; + } + } + goto finish; + case '-': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(true)) + { + goto error; + } + } + goto finish; + default: + parser.log_error("Document starts with a non-value character"); + goto error; + } - WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), - * pessimistically. - * doc_parser.is_valid = false; - * At this point in the code, we have all the time in the world. - * Note that we know exactly where we are in the document so we could, - * without any overhead on the processing code, report a specific - * location. - * We could even trigger special code paths to assess what happened - * carefully, - * all without any added cost. 
*/ - if (depth >= doc_parser.max_depth()) { - return doc_parser.on_error(DEPTH_ERROR); - } - switch (structurals.current_char()) { - case '"': - return doc_parser.on_error(STRING_ERROR); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - return doc_parser.on_error(NUMBER_ERROR); - case 't': - return doc_parser.on_error(T_ATOM_ERROR); - case 'n': - return doc_parser.on_error(N_ATOM_ERROR); - case 'f': - return doc_parser.on_error(F_ATOM_ERROR); - default: - return doc_parser.on_error(TAPE_ERROR); - } - } + // + // Object parser states + // + object_begin: + switch (parser.advance_char()) + { + case '"': + { + parser.increment_count(); + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + } + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("Object does not start with a key"); + goto error; + } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - doc_parser.init_stage2(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; - } - // Advance to the first character as soon as possible - structurals.advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_state)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + object_key_state: + if (parser.advance_char() != ':') + { + parser.log_error("Missing colon after key in object"); + goto error; + } + GOTO(parser.parse_value(addresses, addresses.object_continue)); - really_inline char advance_char() { - return structurals.advance_char(); - } -}; + object_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + if (parser.advance_char() != '"') + { + parser.log_error("Key string missing at beginning of field in object"); + goto error; + } + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("No comma between object fields"); + goto error; + } -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + scope_end: + CONTINUE(parser.parser.ret_address[parser.depth]); -} // namespace stage2 + // + // Array parser states + // + array_begin: + if (parser.peek_next_char() == ']') + { + parser.advance_char(); + parser.end_array(); + goto scope_end; + } + parser.increment_count(); -/************ + main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO(parser.parse_value(addresses, addresses.array_continue)); + + array_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + parser.log_error("Missing comma between array values"); + goto error; + } + + finish: + return parser.finish(); + + error: + return parser.error(); + } + + } // namespace + } // namespace stage2 + + /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. 
***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } + WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept + { + error_code result = stage2::parse_structurals<false>(*this, _doc); + if (result) + { + return result; + } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + // If we didn't make it to the end, it's an error + if (next_structural_index != n_structural_indexes) + { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; + } -// -// Object parser states -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_state; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + return SUCCESS; + } -object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. 
+ ***********/ + WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept + { + return stage2::parse_structurals<true>(*this, _doc); + } + /* end file src/generic/stage2/tape_writer.h */ -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept + { + error_code err = stage1(_buf, _len, false); + if (err) + { + return err; + } + return stage2(_doc); + } -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + } // namespace haswell +} // namespace simdjson +UNTARGET_REGION +/* end file src/generic/stage2/tape_writer.h */ +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE +/* begin file src/westmere/implementation.cpp */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ +/* begin file src/westmere/dom_parser_implementation.h */ +#ifndef SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H -// -// Array parser states -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } +/* isadetection.h already included: #include "isadetection.h" */ -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); +namespace simdjson +{ + namespace westmere + { -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + /* begin file src/generic/dom_parser_implementation.h */ + // expectation: sizeof(scope_descriptor) = 64/8. 
+ struct scope_descriptor + { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope + }; // struct scope_descriptor -finish: - return parser.finish(); +#ifdef SIMDJSON_USE_COMPUTED_GOTO + typedef void *ret_address_t; +#else + typedef char ret_address_t; +#endif -error: - return parser.error(); -} + class dom_parser_implementation final : public internal::dom_parser_implementation + { + public: + /** Tape location of each open { or [ */ + std::unique_ptr<scope_descriptor[]> containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr<ret_address_t[]> ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2_build_tape.h */ -/* begin file src/generic/stage2_streaming_build_tape.h */ -namespace stage2 { + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {} + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; + }; - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - doc_parser.init_stage2(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. 
- // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return doc_parser.on_error(DEPTH_ERROR); + /* begin file src/generic/stage1/allocate.h */ + namespace stage1 + { + namespace allocate + { + + // + // Allocates stage 1 internal state and outputs in the parser + // + really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) + { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset(new (std::nothrow) uint32_t[max_structures]); + if (!parser.structural_indexes) + { + return MEMALLOC; + } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; + } + + } // namespace allocate + } // namespace stage1 + /* end file src/generic/stage1/allocate.h */ + /* begin file src/generic/stage2/allocate.h */ + namespace stage2 + { + namespace allocate + { + + // + // Allocates stage 2 internal state and outputs in the parser + // + really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) + { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) + { + return MEMALLOC; + } + return SUCCESS; + } + + } // namespace allocate + } // namespace stage2 + /* end file src/generic/stage2/allocate.h */ + + really_inline dom_parser_implementation::dom_parser_implementation() {} + + // Leaving these here so they can be inlined if so desired + WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept + { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) + { + _capacity = 0; + return err; + } + _capacity = capacity; + return SUCCESS; } - return SUCCESS; - } - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); + WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept + { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) + { + _max_depth = 0; + return err; + } + _max_depth = max_depth; + return SUCCESS; } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; + /* end file src/generic/stage2/allocate.h */ -} // namespace stage2 + } // namespace westmere +} // namespace simdjson -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. 
- ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } +#endif // SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } +TARGET_HASWELL -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); +namespace simdjson +{ + namespace westmere + { -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr<internal::dom_parser_implementation> &dst) const noexcept + { + dst.reset(new (std::nothrow) dom_parser_implementation()); + if (!dst) + { + return MEMALLOC; + } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; + } -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + } // namespace westmere +} // namespace simdjson +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/westmere/dom_parser_implementation.cpp */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ +/* westmere/dom_parser_implementation.h already included: #include "westmere/dom_parser_implementation.h" */ + // -// Array parser parsers +// Stage 1 // -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } +/* begin file src/westmere/bitmask.h */ +#ifndef SIMDJSON_WESTMERE_BITMASK_H +#define SIMDJSON_WESTMERE_BITMASK_H -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); +/* begin file src/westmere/intrinsics.h */ +#ifndef SIMDJSON_WESTMERE_INTRINSICS_H +#define SIMDJSON_WESTMERE_INTRINSICS_H -array_continue: - switch 
(parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } +#ifdef SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include <x86intrin.h> +#include <intrin.h> // visual studio or clang +#else +#include <x86intrin.h> // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); +#ifdef SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include <smmintrin.h> // for _mm_alignr_epi8 +#include <wmmintrin.h> // for _mm_clmulepi64_si128 +#endif -error: - return parser.error(); -} -/* end file src/generic/stage2_streaming_build_tape.h */ +#endif // SIMDJSON_WESTMERE_INTRINSICS_H +/* end file src/westmere/intrinsics.h */ +TARGET_WESTMERE +namespace simdjson +{ + namespace westmere + { + + // + // Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. + // + // For example, prefix_xor(00100100) == 00011100 + // + really_inline uint64_t prefix_xor(const uint64_t bitmask) + { + // There should be no such thing with a processing supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); + } + + } // namespace westmere + } // namespace simdjson +UNTARGET_REGION -#endif // SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H -/* end file src/generic/stage2_streaming_build_tape.h */ +#endif // SIMDJSON_WESTMERE_BITMASK_H +/* end file src/westmere/intrinsics.h */ +/* begin file src/westmere/simd.h */ +#ifndef SIMDJSON_WESTMERE_SIMD_H +#define SIMDJSON_WESTMERE_SIMD_H + +/* simdprune_tables.h already included: #include "simdprune_tables.h" */ +/* begin file src/westmere/bitmanipulation.h */ +#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H +#define SIMDJSON_WESTMERE_BITMANIPULATION_H + +/* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ + +TARGET_WESTMERE +namespace simdjson +{ + namespace westmere + { + + // We sometimes call trailing_zero on inputs that are zero, + // but the algorithms do not end up using the returned value. + // Sadly, sanitizers are not smart enough to figure it out. + NO_SANITIZE_UNDEFINED + really_inline int trailing_zeroes(uint64_t input_num) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + } + + /* result might be undefined when input_num is zero */ + really_inline uint64_t clear_lowest_bit(uint64_t input_num) + { + return input_num & (input_num - 1); + } + + /* result might be undefined when input_num is zero */ + really_inline int leading_zeroes(uint64_t input_num) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
+ if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + } + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + really_inline unsigned __int64 count_ones(uint64_t input_num) + { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores + } +#else + really_inline long long int count_ones(uint64_t input_num) + { + return _popcnt64(input_num); + } #endif -#if SIMDJSON_IMPLEMENTATION_HASWELL -/* begin file src/haswell/stage2_build_tape.h */ -#ifndef SIMDJSON_HASWELL_STAGE2_BUILD_TAPE_H -#define SIMDJSON_HASWELL_STAGE2_BUILD_TAPE_H -/* haswell/implementation.h already included: #include "haswell/implementation.h" */ -/* begin file src/haswell/stringparsing.h */ -#ifndef SIMDJSON_HASWELL_STRINGPARSING_H -#define SIMDJSON_HASWELL_STRINGPARSING_H + really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast<unsigned __int64 *>(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + (unsigned long long *)result); +#endif + } -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -/* haswell/simd.h already included: #include "haswell/simd.h" */ -/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ -/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#pragma intrinsic(_umul128) +#endif + really_inline bool mul_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) + { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint64_t high; + *result = _umul128(value1, value2, &high); + return high; +#else + return __builtin_umulll_overflow(value1, value2, + (unsigned long long *)result); +#endif + } -TARGET_HASWELL -namespace simdjson::haswell { + } // namespace westmere -using namespace simd; +} // namespace simdjson +UNTARGET_REGION -// Holds backslashes and quotes locations. 
-struct backslash_and_quote { -public: - static constexpr uint32_t BYTES_PROCESSED = 32; - really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); +#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H +/* end file src/westmere/bitmanipulation.h */ +/* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ - really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } - really_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } - really_inline int quote_index() { return trailing_zeroes(quote_bits); } - really_inline int backslash_index() { return trailing_zeroes(bs_bits); } +TARGET_WESTMERE +namespace simdjson +{ + namespace westmere + { + namespace simd + { - uint32_t bs_bits; - uint32_t quote_bits; -}; // struct backslash_and_quote + template <typename Child> + struct base + { + __m128i value; -really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { - // this can read up to 15 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); - simd8<uint8_t> v(src); - // store to dest unconditionally - we can overwrite the bits we don't like later - v.store(dst); - return { - (uint32_t)(v == '\\').to_bitmask(), // bs_bits - (uint32_t)(v == '"').to_bitmask(), // quote_bits - }; -} + // Zero constructor + really_inline base() : value{__m128i()} {} -/* begin file src/generic/stringparsing.h */ -// This file contains the common code every implementation uses -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "stringparsing.h" (this simplifies amalgation) + // Conversion from SIMD register + really_inline base(const __m128i _value) : value(_value) {} -namespace stringparsing { + // Conversion to SIMD register + really_inline operator const __m128i &() const { return this->value; } + really_inline operator __m128i &() { return this->value; } -// begin copypasta -// These chars yield themselves: " \ / -// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab -// u not handled in this table as it's complex -static const uint8_t escape_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Bit operations + really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } + really_inline Child &operator|=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast | other; + return *this_cast; + } + really_inline Child &operator&=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast & other; + return *this_cast; + } + really_inline Child &operator^=(const Child other) + { + auto this_cast = (Child *)this; + *this_cast = *this_cast ^ other; + return *this_cast; + } + }; - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. 
- 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. - 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + // Forward-declared so they can be used by splat and friends. + template <typename T> + struct simd8; - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + template <typename T, typename Mask = simd8<bool>> + struct base8 : base<simd8<T>> + { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; + really_inline base8() : base<simd8<T>>() {} + really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {} -// handle a unicode codepoint -// write appropriate values into dest -// src will advance 6 bytes or 12 bytes -// dest will advance a variable amount (return via pointer) -// return true if the unicode codepoint was valid -// We work in little-endian then swap at write time -WARN_UNUSED -really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, - uint8_t **dst_ptr) { - // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the - // conversion isn't valid; we defer the check for this to inside the - // multilingual plane check - uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); - *src_ptr += 6; - // check for low surrogate for characters outside the Basic - // Multilingual Plane. - if (code_point >= 0xd800 && code_point < 0xdc00) { - if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { - return false; - } - uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); + really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); } - // if the first code point is invalid we will get here, as we will go past - // the check for being outside the Basic Multilingual plane. If we don't - // find a \u immediately afterwards we fail out anyhow, but if we do, - // this check catches both the case of the first code point being invalid - // or the second code point being invalid. - if ((code_point | code_point_2) >> 16) { - return false; - } + static const int SIZE = sizeof(base<simd8<T>>::value); - code_point = - (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; - *src_ptr += 6; - } - size_t offset = codepoint_to_utf8(code_point, *dst_ptr); - *dst_ptr += offset; - return offset > 0; -} + template <int N = 1> + really_inline simd8<T> prev(const simd8<T> prev_chunk) const + { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; -WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { - src++; - while (1) { - // Copy the next n bytes, and find the backslash and quote in them. - auto bs_quote = backslash_and_quote::copy_and_find(src, dst); - // If the next thing is the end quote, copy and return - if (bs_quote.has_quote_first()) { - // we encountered quotes first. Move dst to point to quotes and exit - return dst + bs_quote.quote_index(); - } - if (bs_quote.has_backslash()) { - /* find out where the backspace is */ - auto bs_dist = bs_quote.backslash_index(); - uint8_t escape_char = src[bs_dist + 1]; - /* we encountered backslash first. 
Handle backslash */ - if (escape_char == 'u') { - /* move src/dst up to the start; they will be further adjusted - within the unicode codepoint handling code. */ - src += bs_dist; - dst += bs_dist; - if (!handle_unicode_codepoint(&src, &dst)) { - return nullptr; + // SIMD byte mask type (returned by things like eq and gt) + template <> + struct simd8<bool> : base8<bool> + { + static really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + really_inline simd8<bool>() : base8() {} + really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {} + // Splat constructor + really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} + + really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + really_inline simd8<bool> operator~() const { return *this ^ true; } + }; + + template <typename T> + struct base8_numeric : base8<T> + { + static really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); } + static really_inline simd8<T> zero() { return _mm_setzero_si128(); } + static really_inline simd8<T> load(const T values[16]) + { + return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values)); } - } else { - /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and - * write bs_dist+1 characters to output - * note this may reach beyond the part of the buffer we've actually - * seen. I think this is ok */ - uint8_t escape_result = escape_map[escape_char]; - if (escape_result == 0u) { - return nullptr; /* bogus escape value is an error */ + // Repeat 16 values as many times as necessary (usually for lookup tables) + static really_inline simd8<T> repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) + { + return simd8<T>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); } - dst[bs_dist] = escape_result; - src += bs_dist + 2; - dst += bs_dist + 1; - } - } else { - /* they are the same. Since they can't co-occur, it means we - * encountered neither. 
*/ - src += backslash_and_quote::BYTES_PROCESSED; - dst += backslash_and_quote::BYTES_PROCESSED; - } - } - /* can't be reached */ - return nullptr; -} -} // namespace stringparsing -/* end file src/generic/stringparsing.h */ + really_inline base8_numeric() : base8<T>() {} + really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {} -} // namespace simdjson::haswell -UNTARGET_REGION + // Store to array + really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } -#endif // SIMDJSON_HASWELL_STRINGPARSING_H -/* end file src/generic/stringparsing.h */ -/* begin file src/haswell/numberparsing.h */ -#ifndef SIMDJSON_HASWELL_NUMBERPARSING_H -#define SIMDJSON_HASWELL_NUMBERPARSING_H + // Override to distinguish from bool version + really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } + // Addition/subtraction are the same for signed and unsigned + really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); } + really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); } + really_inline simd8<T> &operator+=(const simd8<T> other) + { + *this = *this + other; + return *(simd8<T> *)this; + } + really_inline simd8<T> &operator-=(const simd8<T> other) + { + *this = *this - other; + return *(simd8<T> *)this; + } -/* jsoncharutils.h already included: #include "jsoncharutils.h" */ -/* haswell/intrinsics.h already included: #include "haswell/intrinsics.h" */ -/* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ -#include <cmath> -#include <limits> + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template <typename L> + really_inline simd8<L> lookup_16(simd8<L> lookup_table) const + { + return _mm_shuffle_epi8(lookup_table, *this); + } -#ifdef JSON_TEST_NUMBERS // for unit testing -void found_invalid_number(const uint8_t *buf); -void found_integer(int64_t result, const uint8_t *buf); -void found_unsigned_integer(uint64_t result, const uint8_t *buf); -void found_float(double result, const uint8_t *buf); -#endif + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8<L> compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template <typename L> + really_inline void compress(uint16_t mask, L *output) const + { + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(*this, shufmask); + // we still need to put the two halves together. 
+ // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + _mm_storeu_si128((__m128i *)(output), answer); + } -TARGET_HASWELL -namespace simdjson::haswell { -static inline uint32_t parse_eight_digits_unrolled(const char *chars) { - // this actually computes *16* values so we are being wasteful. - const __m128i ascii0 = _mm_set1_epi8('0'); - const __m128i mul_1_10 = - _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); - const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); - const __m128i mul_1_10000 = - _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); - const __m128i input = _mm_sub_epi8( - _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0); - const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); - const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); - const __m128i t3 = _mm_packus_epi32(t2, t2); - const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); - return _mm_cvtsi128_si32( - t4); // only captures the sum of the first 8 digits, drop the rest -} + template <typename L> + really_inline simd8<L> lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const + { + return lookup_16(simd8<L>::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15)); + } + }; -#define SWAR_NUMBER_PARSING + // Signed bytes + template <> + struct simd8<int8_t> : base8_numeric<int8_t> + { + really_inline simd8() : base8_numeric<int8_t>() {} + really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<int8_t> repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15) + { + return simd8<int8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } -/* begin file src/generic/numberparsing.h */ -namespace numberparsing { + // Order-sensitive comparisons + really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); } + really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); } + really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); } + really_inline 
simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); } + }; + // Unsigned bytes + template <> + struct simd8<uint8_t> : base8_numeric<uint8_t> + { + really_inline simd8() : base8_numeric<uint8_t>() {} + really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + really_inline static simd8<uint8_t> repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + { + return simd8<uint8_t>( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } -// Attempts to compute i * 10^(power) exactly; and if "negative" is -// true, negate the result. -// This function will only work in some cases, when it does not work, success is -// set to false. This should work *most of the time* (like 99% of the time). -// We assume that power is in the [FASTFLOAT_SMALLEST_POWER, -// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. -really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, - bool *success) { - // we start with a fast path - // It was described in - // Clinger WD. How to read floating point numbers accurately. - // ACM SIGPLAN Notices. 1990 - if (-22 <= power && power <= 22 && i <= 9007199254740991) { - // convert the integer into a double. This is lossless since - // 0 <= i <= 2^53 - 1. - double d = i; - // - // The general idea is as follows. - // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then - // 1) Both s and p can be represented exactly as 64-bit floating-point - // values - // (binary64). - // 2) Because s and p can be represented exactly as floating-point values, - // then s * p - // and s / p will produce correctly rounded values. - // - if (power < 0) { - d = d / power_of_ten[-power]; - } else { - d = d * power_of_ten[power]; - } - if (negative) { - d = -d; - } - *success = true; - return d; - } - // When 22 < power && power < 22 + 16, we could - // hope for another, secondary fast path. It wa - // described by David M. Gay in "Correctly rounded - // binary-decimal and decimal-binary conversions." (1990) - // If you need to compute i * 10^(22 + x) for x < 16, - // first compute i * 10^x, if you know that result is exact - // (e.g., when i * 10^x < 2^53), - // then you can still proceed and do (i * 10^x) * 10^22. - // Is this worth your time? - // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) - // for this second fast path to work. - // If you you have 22 < power *and* power < 22 + 16, and then you - // optimistically compute "i * 10^(x-22)", there is still a chance that you - // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of - // this optimization maybe less common than we would like. 
Source: - // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ - // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + // Saturated math + really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); } + really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); } - // The fast path has now failed, so we are failing back on the slower path. + // Order-specific operations + really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); } + really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } + really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } + really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } + really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } + really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } - // In the slow path, we need to adjust i so that it is > 1<<63 which is always - // possible, except if i == 0, so we handle i == 0 separately. - if(i == 0) { - return 0.0; - } + // Bit-specific operations + really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } + really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } + really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } + really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } + really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); } + really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } + template <int N> + really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template <int N> + really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template <int N> + really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); } + }; - // We are going to need to do some 64-bit arithmetic to get a more precise product. - // We use a table lookup approach. - components c = - power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; - // safe because - // power >= FASTFLOAT_SMALLEST_POWER - // and power <= FASTFLOAT_LARGEST_POWER - // we recover the mantissa of the power, it has a leading 1. It is always - // rounded down. 
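  // Illustrative sketch (editorial annotation, not part of the upstream
  // simdjson sources): how the Clinger fast path above plays out on one
  // concrete input, assuming the power_of_ten[] table of exact doubles
  // 1e0..1e22 referenced earlier.
  //
  //   // "3.14159" arrives here as i = 314159, power = -5.
  //   // 314159 <= 2^53 - 1 and -22 <= power <= 22, so:
  //   //   double d = 314159.;         // exact: the value fits in 53 bits
  //   //   d = d / power_of_ten[5];    // 1e5 is exact, and a single IEEE
  //   //                               // divide is correctly rounded, so d
  //   //                               // is the double nearest to 3.14159
  //
  // Only when those preconditions fail does control reach the 128-bit
  // multiplication path that resumes below.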
- uint64_t factor_mantissa = c.mantissa; + template <typename T> + struct simd8x64 + { + static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); + const simd8<T> chunks[NUM_CHUNKS]; - // We want the most significant bit of i to be 1. Shift if needed. - int lz = leading_zeroes(i); - i <<= lz; - // We want the most significant 64 bits of the product. We know - // this will be non-zero because the most significant bit of i is - // 1. - value128 product = full_multiplication(i, factor_mantissa); - uint64_t lower = product.low; - uint64_t upper = product.high; + really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {} + really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16), simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {} - // We know that upper has at most one leading zero because - // both i and factor_mantissa have a leading one. This means - // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). + really_inline void store(T ptr[64]) const + { + this->chunks[0].store(ptr + sizeof(simd8<T>) * 0); + this->chunks[1].store(ptr + sizeof(simd8<T>) * 1); + this->chunks[2].store(ptr + sizeof(simd8<T>) * 2); + this->chunks[3].store(ptr + sizeof(simd8<T>) * 3); + } - // As long as the first 9 bits of "upper" are not "1", then we - // know that we have an exact computed value for the leading - // 55 bits because any imprecision would play out as a +1, in - // the worst case. - if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) { - uint64_t factor_mantissa_low = - mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; - // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit - // result (three 64-bit values) - product = full_multiplication(i, factor_mantissa_low); - uint64_t product_low = product.low; - uint64_t product_middle2 = product.high; - uint64_t product_middle1 = lower; - uint64_t product_high = upper; - uint64_t product_middle = product_middle1 + product_middle2; - if (product_middle < product_middle1) { - product_high++; // overflow carry - } - // We want to check whether mantissa *i + i would affect our result. - // This does happen, e.g. with 7.3177701707893310e+15. - if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && - (product_low + i < product_low))) { // let us be prudent and bail out. - *success = false; - return 0; - } - upper = product_high; - lower = product_middle; - } - // The final mantissa should be 53 bits with a leading 1. - // We shift it so that it occupies 54 bits with a leading 1. - /////// - uint64_t upperbit = upper >> 63; - uint64_t mantissa = upper >> (upperbit + 9); - lz += 1 ^ upperbit; + really_inline void compress(uint64_t mask, T *output) const + { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + } - // Here we have mantissa < (1<<54). + template <typename F> + static really_inline void each_index(F const &each) + { + each(0); + each(1); + each(2); + each(3); + } - // We have to round to even. 
The "to even" part - // is only a problem when we are right in between two floats - // which we guard against. - // If we have lots of trailing zeros, we may fall right between two - // floating-point values. - if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && - ((mantissa & 3) == 1))) { - // if mantissa & 1 == 1 we might need to round up. - // - // Scenarios: - // 1. We are not in the middle. Then we should round up. - // - // 2. We are right in the middle. Whether we round up depends - // on the last significant bit: if it is "one" then we round - // up (round to even) otherwise, we do not. - // - // So if the last significant bit is 1, we can safely round up. - // Hence we only need to bail out if (mantissa & 3) == 1. - // Otherwise we may need more accuracy or analysis to determine whether - // we are exactly between two floating-point numbers. - // It can be triggered with 1e23. - // Note: because the factor_mantissa and factor_mantissa_low are - // almost always rounded down (except for small positive powers), - // almost always should round up. - *success = false; - return 0; - } + really_inline uint64_t to_bitmask() const + { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } - mantissa += mantissa & 1; - mantissa >>= 1; + really_inline simd8x64<T> bit_or(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<T>( + this->chunks[0] | mask, + this->chunks[1] | mask, + this->chunks[2] | mask, + this->chunks[3] | mask); + } - // Here we have mantissa < (1<<53), unless there was an overflow - if (mantissa >= (1ULL << 53)) { - ////////// - // This will happen when parsing values such as 7.2057594037927933e+16 - //////// - mantissa = (1ULL << 52); - lz--; // undo previous addition - } - mantissa &= ~(1ULL << 52); - uint64_t real_exponent = c.exp - lz; - // we have to check that real_exponent is in range, otherwise we bail out - if (unlikely((real_exponent < 1) || (real_exponent > 2046))) { - *success = false; - return 0; - } - mantissa |= real_exponent << 52; - mantissa |= (((uint64_t)negative) << 63); - double d; - memcpy(&d, &mantissa, sizeof(d)); - *success = true; - return d; -} + really_inline uint64_t eq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask) + .to_bitmask(); + } -static bool parse_float_strtod(const char *ptr, double *outDouble) { - char *endptr; - *outDouble = strtod(ptr, &endptr); - // Some libraries will set errno = ERANGE when the value is subnormal, - // yet we may want to be able to parse subnormal values. - // However, we do not want to tolerate NAN or infinite values. - // - // Values like infinity or NaN are not allowed in the JSON specification. - // If you consume a large value and you map it to "infinity", you will no - // longer be able to serialize back a standard-compliant JSON. And there is - // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × - // 10^308 It is an unimaginable large number. There will never be any piece of - // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  
The estimate for the total number - // of electrons is similar. Using a double-precision floating-point value, we - // can represent easily the number of atoms in the universe. We could also - // represent the number of ways you can pick any three individual atoms at - // random in the universe. If you ever encounter a number much larger than - // 10^308, you know that you have a bug. RapidJSON will reject a document with - // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) - // will flat out throw an exception. - // - if ((endptr == ptr) || (!std::isfinite(*outDouble))) { - return false; - } - return true; -} + really_inline uint64_t lteq(const T m) const + { + const simd8<T> mask = simd8<T>::splat(m); + return simd8x64<bool>( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask) + .to_bitmask(); + } + }; // struct simd8x64<T> -really_inline bool is_integer(char c) { - return (c >= '0' && c <= '9'); - // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers -} + } // namespace simd -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + } // namespace westmere +} // namespace simdjson +UNTARGET_REGION -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} +#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H +/* end file src/westmere/bitmanipulation.h */ +/* westmere/bitmanipulation.h already included: #include "westmere/bitmanipulation.h" */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ -// check quickly whether the next 8 chars are made of digits -// at a glance, it looks better than Mula's -// http://0x80.pl/articles/swar-digits-validate.html -really_inline bool is_made_of_eight_digits_fast(const char *chars) { - uint64_t val; - // this can read up to 7 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(7 <= SIMDJSON_PADDING); - memcpy(&val, chars, 8); - // a branchy method might be faster: - // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) - // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == - // 0x3030303030303030); - return (((val & 0xF0F0F0F0F0F0F0F0) | - (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == - 0x3333333333333333); -} +TARGET_WESTMERE +namespace simdjson +{ + namespace westmere + { -// called by parse_number when we know that the 
output is an integer, -// but where there might be some integer overflow. -// we want to catch overflows! -// Do not call this function directly as it skips some of the checks from -// parse_number -// -// This function will almost never be called!!! -// -never_inline bool parse_large_integer(const uint8_t *const src, - parser &parser, - bool found_minus) { - const char *p = reinterpret_cast<const char *>(src); + using namespace simd; - bool negative = false; - if (found_minus) { - ++p; - negative = true; - } - uint64_t i; - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - i = 0; - } else { - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - if (mul_overflow(i, 10, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } - if (add_overflow(i, digit, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } - ++p; - } - } - if (negative) { - if (i > 0x8000000000000000) { - // overflows! -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } else if (i == 0x8000000000000000) { - // In two's complement, we cannot represent 0x8000000000000000 - // as a positive signed integer, but the negative version is - // possible. - constexpr int64_t signed_answer = INT64_MIN; - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } else { - // we can negate safely - int64_t signed_answer = -static_cast<int64_t>(i); - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } - } else { - // we have a positive integer, the contract is that - // we try to represent it as a signed integer and only - // fallback on unsigned integers if absolutely necessary. - if (i < 0x8000000000000000) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - parser.on_number_s64(i); - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_unsigned_integer(i, src); -#endif - parser.on_number_u64(i); - } - } - return is_structural_or_whitespace(*p); -} + struct json_character_block + { + static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in); -bool slow_float_parsing(UNUSED const char * src, parser &parser) { - double d; - if (parse_float_strtod(src, &d)) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, (const uint8_t *)src); -#endif - return true; - } -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number((const uint8_t *)src); -#endif - return false; -} + really_inline uint64_t whitespace() const { return _whitespace; } + really_inline uint64_t op() const { return _op; } + really_inline uint64_t scalar() { return ~(op() | whitespace()); } -// parse the number at src -// define JSON_TEST_NUMBERS for unit testing -// -// It is assumed that the number is followed by a structural ({,},],[) character -// or a white space character. If that is not the case (e.g., when the JSON -// document is made of a single number), then it is necessary to copy the -// content and append a space before calling this function. -// -// Our objective is accurate parsing (ULP of 0) at high speed. 
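// Illustrative sketch (editorial annotation, not part of the upstream
// simdjson sources): roughly what the routine below produces for one
// concrete, well-formed input before handing off to compute_float_64:
//
//   // input: "-12.75e2" followed by a space
//   //   negative  = true          (found_minus)
//   //   i         = 1275          (digits accumulated, '.' skipped)
//   //   exponent  = -2 + 2 = 0    (two fraction digits, then the explicit e2)
//   //   compute_float_64(0, 1275, true, &success)  ->  -1275.0
//
// For floats, only unusually long digit strings (digit_count >= 19) or
// out-of-range exponents fall back to the slower strtod-based path.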
-really_inline bool parse_number(UNUSED const uint8_t *const src, - UNUSED bool found_minus, - parser &parser) { -#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes - // useful to skip parsing - parser.on_number_s64(0); // always write zero - return true; // always succeeds -#else - const char *p = reinterpret_cast<const char *>(src); - bool negative = false; - if (found_minus) { - ++p; - negative = true; - if (!is_integer(*p)) { // a negative sign must be followed by an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + uint64_t _whitespace; + uint64_t _op; + }; + + really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) + { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); + + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). + + uint64_t whitespace = simd8x64<bool>( + in.chunks[0] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[0])), + in.chunks[1] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[1])), + in.chunks[2] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[2])), + in.chunks[3] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[3]))) + .to_bitmask(); + + // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart + uint64_t op = simd8x64<bool>( + (in.chunks[0] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[0] - ',')), + (in.chunks[1] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[1] - ',')), + (in.chunks[2] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[2] - ',')), + (in.chunks[3] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[3] - ','))) + .to_bitmask(); + return {whitespace, op}; } - } - const char *const start_digits = p; - uint64_t i; // an unsigned int avoids signed overflows (which are bad) - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + really_inline bool is_ascii(simd8x64<uint8_t> input) + { + simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]); + return !bits.any_bits_set_anywhere(0b10000000u); } - i = 0; - } else { - if (!(is_integer(*p))) { // must start with an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + + really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 + simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). 
All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); } - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + digit; // might overflow, we will handle the overflow later - ++p; + + really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) + { + simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0); } - } - int64_t exponent = 0; - bool is_float = false; - if ('.' == *p) { - is_float = true; // At this point we know that we have a float - // we continue with the fiction that we have an integer. If the - // floating point number is representable as x * 10^z for some integer - // z that fits in 53 bits, then we will be able to convert back the - // the integer into a float in a lossless manner. - ++p; - const char *const first_after_period = p; - if (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // might overflow + multiplication by 10 is likely - // cheaper than arbitrary mult. - // we will handle the overflow later - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + + /* begin file src/generic/stage1/buf_block_reader.h */ + // Walks through a buffer in block-sized increments, loading the last part with spaces + template <size_t STEP_SIZE> + struct buf_block_reader + { + public: + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); + + private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; + }; + + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text_64(const uint8_t *text) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; } -#ifdef SWAR_NUMBER_PARSING - // this helps if we have lots of decimals! - // this turns out to be frequent enough. 
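    // Illustrative sketch (editorial annotation, not part of the upstream
    // simdjson sources): what the is_made_of_eight_digits_fast() test used
    // just below computes on the 8 characters starting at p. The 64-bit
    // shift acts per byte here because the masked low nibbles are zero:
    //
    //   // every digit byte lies in 0x30..0x39, so per byte:
    //   //   byte & 0xF0                   -> 0x30
    //   //   ((byte + 0x06) & 0xF0) >> 4   -> 0x03  (0x36..0x3F keeps high nibble 3)
    //   //   OR of the two                 -> 0x33, so the word equals 0x3333333333333333
    //   //
    //   // a stray ':' (0x3A) breaks the pattern:
    //   //   ((0x3A + 0x06) & 0xF0) >> 4   -> 0x04, so that byte reads 0x34 != 0x33
    //
    // When the test passes, parse_eight_digits_unrolled() folds all eight
    // digits into i at once instead of looping one digit at a time.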
- if (is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); - p += 8; + + // Routines to print masks and text for debugging bitmask operations + UNUSED static char *format_input_text(const simd8x64<uint8_t> in) + { + static char *buf = (char *)malloc(sizeof(simd8x64<uint8_t>) + 1); + in.store((uint8_t *)buf); + for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) + { + if (buf[i] < ' ') + { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64<uint8_t>)] = '\0'; + return buf; } -#endif - while (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok - // because we have parse_highprecision_float later. + + UNUSED static char *format_mask(uint64_t mask) + { + static char *buf = (char *)malloc(64 + 1); + for (size_t i = 0; i < 64; i++) + { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; } - exponent = first_after_period - p; - } - int digit_count = - p - start_digits - 1; // used later to guard against overflows - int64_t exp_number = 0; // exponential part - if (('e' == *p) || ('E' == *p)) { - is_float = true; - ++p; - bool neg_exp = false; - if ('-' == *p) { - neg_exp = true; - ++p; - } else if ('+' == *p) { - ++p; + + template <size_t STEP_SIZE> + really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; } + + template <size_t STEP_SIZE> + really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const + { + return idx < lenminusstep; } - if (!is_integer(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + + template <size_t STEP_SIZE> + really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const + { + return &buf[idx]; } - unsigned char digit = *p - '0'; - exp_number = digit; - p++; - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; + + template <size_t STEP_SIZE> + really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const + { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + memcpy(dst, buf + idx, len - idx); + return len - idx; } - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; + + template <size_t STEP_SIZE> + really_inline void buf_block_reader<STEP_SIZE>::advance() + { + idx += STEP_SIZE; } - while (is_integer(*p)) { - if (exp_number > 0x100000000) { // we need to check for overflows - // we refuse to parse this -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; + /* end file src/generic/stage1/buf_block_reader.h */ + /* begin file src/generic/stage1/json_string_scanner.h */ + namespace stage1 + { + + struct json_string_block + { + // Escaped characters (characters following an escape() character) + really_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \) + really_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + really_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + really_inline uint64_t string_end() const { return _quote & _in_string; } + // End quotes of strings + really_inline uint64_t string_start() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + really_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + really_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; + }; + + // Scans blocks for string characters, storing the state necessary to do so + class json_string_scanner + { + public: + really_inline json_string_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); + + private: + // Intended to be defined by the implementation + really_inline uint64_t find_escaped(uint64_t escape); + really_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; + }; + + // + // Finds escaped characters (characters following \). + // + // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). + // + // Does this by: + // - Shift the escape mask to get potentially escaped characters (characters after backslashes). + // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) + // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) + // + // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all + // escape sequences, filters out the ones that start on even bits, and adds that to the mask of + // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since + // the start bit causes a carry), and leaves even-bit sequences alone. 
+ // + // Example: + // + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape + // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape + // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later + // invert_mask | | cxxx c xx c| even_seq << 1 + // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit + // escaped | x | x x x x x x x x | + // desired | x | x x x x x x x x | + // text | \\\ | \\\"\\\" \\\" \\"\\" | + // + really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) + { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; } - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - exponent += (neg_exp ? -exp_number : exp_number); - } - if (is_float) { - // If we frequently had to deal with long strings of digits, - // we could extend our code by using a 128-bit integer instead - // of a 64-bit integer. However, this is uncommon in practice. - if (unlikely((digit_count >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. - // We have to handle the case where we have 0.0000somenumber. - const char *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } - // we over-decrement by one when there is a '.' - digit_count -= (start - start_digits); - if (digit_count >= 19) { - // Ok, chances are good that we had an overflow! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - // This will happen in the following examples: - // 10000000000000000000000000000000000000000000e+308 - // 3.1415926535897932384626433832795028841971693993751 + + // + // Return a mask of all string characters plus end quotes. + // + // prev_escaped is overflow saying whether the next character is escaped. + // prev_in_string is overflow saying whether we're still in a string. + // + // Backslash sequences outside of quotes will be detected in stage 2. + // + really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) + { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + // - return slow_float_parsing((const char *) src, parser); + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). 
+ // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + return { + backslash, + escaped, + quote, + in_string}; } - } - if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || - (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, parser); - } - bool success = true; - double d = compute_float_64(exponent, i, negative, &success); - if (!success) { - // we are almost never going to get here. - success = parse_float_strtod((const char *)src, &d); - } - if (success) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, src); -#endif - return true; - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } else { - if (unlikely(digit_count >= 18)) { // this is uncommon!!! - // there is a good chance that we had an overflow, so we need - // need to recover: we parse the whole thing again. - return parse_large_integer(src, parser, found_minus); - } - i = negative ? 0 - i : i; - parser.on_number_s64(i); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - } - return is_structural_or_whitespace(*p); -#endif // SIMDJSON_SKIPNUMBERPARSING -} -} // namespace numberparsing -/* end file src/generic/numberparsing.h */ + really_inline error_code json_string_scanner::finish(bool streaming) + { + if (prev_in_string and (not streaming)) + { + return UNCLOSED_STRING; + } + return SUCCESS; + } -} // namespace simdjson::haswell -UNTARGET_REGION + } // namespace stage1 + /* end file src/generic/stage1/json_string_scanner.h */ + /* begin file src/generic/stage1/json_scanner.h */ + namespace stage1 + { -#endif // SIMDJSON_HASWELL_NUMBERPARSING_H -/* end file src/generic/numberparsing.h */ + /** + * A block of scanned json, with information on operators and scalars. + */ + struct json_block + { + public: + /** The start of structurals */ + really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. 
not in a string) */ + really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } -TARGET_HASWELL -namespace simdjson::haswell { + // Helpers -/* begin file src/generic/atomparsing.h */ -namespace atomparsing { + /** Whether the given characters are inside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } -really_inline uint32_t string_to_uint32(const char* str) { return *reinterpret_cast<const uint32_t *>(str); } + // string and escape characters + json_string_block _string; + // whitespace, operators, scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_scalar; -WARN_UNUSED -really_inline bool str4ncmp(const uint8_t *src, const char* atom) { - uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING); - std::memcpy(&srcval, src, sizeof(uint32_t)); - return srcval ^ string_to_uint32(atom); -} + private: + // Potential structurals (i.e. disregarding strings) -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src) { - return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; -} + /** operators plus scalar starts like 123, true and "abc" */ + really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } + /** the start of non-operator runs, like 123, true and "abc" */ + really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } + /** whether the given character is immediately after a non-operator like 123, true or " */ + really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } + }; -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_true_atom(src); } - else if (len == 4) { return !str4ncmp(src, "true"); } - else { return false; } -} + /** + * Scans JSON for important bits: operators, strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ + class json_scanner + { + public: + json_scanner() {} + really_inline json_block next(const simd::simd8x64<uint8_t> in); + really_inline error_code finish(bool streaming); -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src) { - return (str4ncmp(src+1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; -} + private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or an operator). 
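      // Illustrative sketch (editorial annotation, not part of the upstream
      // simdjson sources): the mask fusion described above, shown on the
      // first 12 bytes of a block that starts with {"a:b":[1,2]}, assuming
      // the scanner is not currently inside a string and the previous byte
      // was whitespace (one mask bit per input byte):
      //
      //   input                    {  "  a  :  b  "  :  [  1  ,  2  ]
      //   _characters.op()         x        x        x  x     x     x
      //   _string.string_tail()          x  x  x  x
      //   structural_start()       x  x              x  x  x  x  x  x
      //
      // The ':' inside the string is a false hit from the character
      // classifier; json_block::structural_start() removes it (and keeps the
      // opening quote plus the scalar starts) by masking with the string bits.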
+ uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; + }; -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { - if (len > 5) { return is_valid_false_atom(src); } - else if (len == 5) { return !str4ncmp(src+1, "alse"); } - else { return false; } -} + // + // Check if the current character immediately follows a matching character. + // + // For example, this checks for quotes with backslashes in front of them: + // + // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); + // + really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) + { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; + } -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src) { - return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; -} + // + // Check if the current character follows a matching character, with possible "filler" between. + // For example, this checks for empty curly braces, e.g. + // + // in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* } + // + really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) + { + uint64_t follows_match = follows(match, overflow); + uint64_t result; + overflow |= uint64_t(add_overflow(follows_match, filler, &result)); + return result; + } -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_null_atom(src); } - else if (len == 4) { return !str4ncmp(src, "null"); } - else { return false; } -} + really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) + { + json_string_block strings = string_scanner.next(in); + json_character_block characters = json_character_block::classify(in); + uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); + return { + strings, + characters, + follows_scalar}; + } -} // namespace atomparsing -/* end file src/generic/atomparsing.h */ -/* begin file src/generic/stage2_build_tape.h */ -// This file contains the common code every implementation uses for stage2 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + really_inline error_code json_scanner::finish(bool streaming) + { + return string_scanner.finish(streaming); + } -namespace stage2 { + } // namespace stage1 + /* end file src/generic/stage1/json_scanner.h */ -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address; -#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } -#define GOTO(address) { goto *(address); } -#define CONTINUE(address) { goto *(address); } -#else -typedef char ret_address; -#define INIT_ADDRESSES() { '[', 'a', 'e', 'f', '{', 'o' }; -#define GOTO(address) \ - { \ - switch(address) { \ - case '[': goto array_begin; \ - case 'a': goto array_continue; \ - case 'e': goto error; \ - case 'f': goto finish; \ - case '{': goto object_begin; \ - case 'o': goto object_continue; \ - } \ - } -// For the more constrained end_xxx() situation -#define CONTINUE(address) \ - { \ - switch(address) { \ - case 'a': goto array_continue; \ - case 'o': goto object_continue; \ - case 'f': goto finish; \ - } \ - } -#endif + namespace stage1 + { + really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) + { + 
if (!backslash) + { + uint64_t escaped = prev_escaped; + prev_escaped = 0; + return escaped; + } + return find_escaped_branchless(backslash); + } + } // namespace stage1 -struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; -}; + /* begin file src/generic/stage1/json_minifier.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } + namespace stage1 + { -class structural_iterator { -public: - really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {} - really_inline char advance_char() { - idx = structural_indexes[next_structural]; - next_structural++; - c = *current(); - return c; - } - really_inline char current_char() { - return c; - } - really_inline const uint8_t* current() { - return &buf[idx]; - } - really_inline size_t remaining_len() { - return len - idx; - } - template<typename F> - really_inline bool with_space_terminated_copy(const F& f) { - /** - * We need to make a copy to make sure that the string is space terminated. - * This is not about padding the input, which should already padded up - * to len + SIMDJSON_PADDING. However, we have no control at this stage - * on how the padding was done. What if the input string was padded with nulls? - * It is quite common for an input string to have an extra null character (C string). - * We do not want to allow 9\0 (where \0 is the null character) inside a JSON - * document, but the string "9\0" by itself is fine. So we make a copy and - * pad the input with spaces when we know that there is just one input element. - * This copy is relatively expensive, but it will almost never be called in - * practice unless you are in the strange scenario where you have many JSON - * documents made of single atoms. 
- */ - char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - return true; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast<const uint8_t*>(copy), idx); - free(copy); - return result; - } - really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; - } - really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; - } - really_inline size_t next_structural_index() { - return next_structural; - } + class json_minifier + { + public: + template <size_t STEP_SIZE> + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx; // location of the structural character in the input (buf) - uint8_t c; // used to track the (structural) character we are looking at -}; + private: + really_inline json_minifier(uint8_t *_dst) + : dst{_dst} + { + } + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block); + really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; + }; -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; - uint32_t depth; + really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) + { + uint64_t mask = block.whitespace(); + in.compress(mask, dst); + dst += 64 - count_ones(mask); + } - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) + { + *dst = '\0'; + error_code error = scanner.finish(false); + if (error) + { + dst_len = 0; + return error; + } + dst_len = dst - dst_start; + return SUCCESS; + } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - doc_parser.on_start_document(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + template <> + really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + simd::simd8x64<uint8_t> in_2(block_buf + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); + } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - doc_parser.on_start_object(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + template <> + really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); + } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - doc_parser.on_start_array(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= 
doc_parser.max_depth(); - } + template <size_t STEP_SIZE> + error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept + { + buf_block_reader<STEP_SIZE> reader(buf, len); + json_minifier minifier(dst); - really_inline bool end_object() { - depth--; - doc_parser.on_end_object(depth); - return false; - } - really_inline bool end_array() { - depth--; - doc_parser.on_end_array(depth); - return false; - } - really_inline bool end_document() { - depth--; - doc_parser.on_end_document(depth); - return false; - } + // Index the first n-1 blocks + while (reader.has_full_block()) + { + minifier.step<STEP_SIZE>(reader.full_block(), reader); + } - WARN_UNUSED really_inline bool parse_string() { - uint8_t *dst = doc_parser.on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); - if (dst == nullptr) { - return true; - } - return !doc_parser.on_end_string(dst); - } + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) + { + minifier.step<STEP_SIZE>(block, reader); + } - WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - return !numberparsing::parse_number(src, found_minus, doc_parser); - } - WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } + return minifier.finish(dst, dst_len); + } - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; + } // namespace stage1 + /* end file src/generic/stage1/json_minifier.h */ + WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept + { + return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } - return false; - } - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; + /* begin file src/generic/stage1/find_next_document_index.h */ + /** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. 
We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ + really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) + { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) + { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) + { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) + { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) + { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; } - return false; - } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { - case '"': - FAIL_IF( parse_string() ); - return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); - return continue_state; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( parse_number(false) ); - return continue_state; - case '-': - FAIL_IF( parse_number(true) ); - return continue_state; - case '{': - FAIL_IF( start_object(continue_state) ); - return addresses.object_begin; - case '[': - FAIL_IF( start_array(continue_state) ); - return addresses.array_begin; - default: - return addresses.error; + // Skip the last character if it is partial + really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) + { + if (unlikely(len < 3)) + { + switch (len) + { + case 2: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len - 1] >= 0b11000000) + { + return len - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len - 2] >= 0b11100000) + { + return len - 2; + } // 3- and 4-byte characters with only 1 byte left + if (buf[len - 3] >= 0b11110000) + { + return len - 3; + } // 4-byte characters with 
only 3 bytes left + return len; } - } + /* end file src/generic/stage1/find_next_document_index.h */ + /* begin file src/generic/stage1/utf8_lookup3_algorithm.h */ + // + // Detect Unicode errors. + // + // UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic + // encoding that uses the first few bits on each byte to denote a "byte type", and all other bits + // are straight up concatenated into the final value. The first byte of a multibyte character is a + // "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte + // lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just + // start with 0, because that's what ASCII looks like. Here's what each size looks like: + // + // - ASCII (7 bits): 0_______ + // - 2 byte character (11 bits): 110_____ 10______ + // - 3 byte character (17 bits): 1110____ 10______ 10______ + // - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ + // - 5+ byte character (illegal): 11111___ <illegal> + // + // There are 5 classes of error that can happen in Unicode: + // + // - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). + // We detect this by looking for new characters (lead bytes) inside the range of a multibyte + // character. + // + // e.g. 11000000 01100001 (2-byte character where second byte is ASCII) + // + // - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). + // We detect this by requiring that the next byte after your multibyte character be a new + // character--so a continuation after your character is wrong. + // + // e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) + // + // - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. + // + // e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). + // + // - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have + // used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is + // technically possible, but UTF-8 disallows it so that there is only one way to write an "a". + // + // e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) + // + // - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and + // WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. + // + // e.g. 11101101 10100000 10000000 (U+D800) + // + // - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not + // support values with more than 23 bits (which a 4-byte character supports). + // + // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) + // + // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: + // + // Code Points 1st 2s 3s 4s + // U+0000..U+007F 00..7F + // U+0080..U+07FF C2..DF 80..BF + // U+0800..U+0FFF E0 A0..BF 80..BF + // U+1000..U+CFFF E1..EC 80..BF 80..BF + // U+D000..U+D7FF ED 80..9F 80..BF + // U+E000..U+FFFF EE..EF 80..BF 80..BF + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + // + using namespace simd; - WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. 
- if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } + namespace utf8_validation + { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". - return doc_parser.on_success(SUCCESS); - } + // + // Find special case UTF-8 errors where the character is technically readable (has the right length) + // but the *value* is disallowed. + // + // This includes overlong encodings, surrogates and values too large for Unicode. + // + // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the + // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a + // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together. + // If all 3 lookups detect the same error, it's an error. + // + really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) + { + // + // These are the errors we're going to match for bytes 1-2, by looking at the first three + // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> + // + static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) + static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ + static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ + static const int SURROGATE = 0x08; // 11101101 [101_]____ + static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ + static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ - WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), - * pessimistically. - * doc_parser.is_valid = false; - * At this point in the code, we have all the time in the world. - * Note that we know exactly where we are in the document so we could, - * without any overhead on the processing code, report a specific - * location. - * We could even trigger special code paths to assess what happened - * carefully, - * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return doc_parser.on_error(DEPTH_ERROR); - } - switch (structurals.current_char()) { - case '"': - return doc_parser.on_error(STRING_ERROR); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - return doc_parser.on_error(NUMBER_ERROR); - case 't': - return doc_parser.on_error(T_ATOM_ERROR); - case 'n': - return doc_parser.on_error(N_ATOM_ERROR); - case 'f': - return doc_parser.on_error(F_ATOM_ERROR); - default: - return doc_parser.on_error(TAPE_ERROR); - } - } + // New with lookup3. 
We want to catch the case where an non-continuation + // follows a leading byte + static const int TOO_SHORT_2_3_4 = 0x40; // (110_|1110|1111) ____ (0___|110_|1111) ____ + // We also want to catch a continuation that is preceded by an ASCII byte + static const int LONELY_CONTINUATION = 0x80; // 0___ ____ 01__ ____ - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - doc_parser.init_stage2(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; - } - // Advance to the first character as soon as possible - structurals.advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_state)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + // After processing the rest of byte 1 (the low bits), we're still not done--we have to check + // byte 2 to be sure which things are errors and which aren't. + // Since high_bits is byte 5, byte 2 is high_bits.prev<3> + static const int CARRY = OVERLONG_2 | TOO_LARGE_2; + const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // ASCII: ________ [0___]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, + // Continuations: ________ [10__]____ + CARRY | OVERLONG_3 | OVERLONG_4 | LONELY_CONTINUATION, // ________ [1000]____ + CARRY | OVERLONG_3 | TOO_LARGE | LONELY_CONTINUATION, // ________ [1001]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1010]____ + CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1011]____ + // Multibyte Leads: ________ [11__]____ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, // 110_ + CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4); + const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( + // [0___]____ (ASCII) + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, + // [10__]____ (continuation) + 0, 0, 0, 0, + // [11__]____ (2+-byte leads) + OVERLONG_2 | TOO_SHORT_2_3_4, TOO_SHORT_2_3_4, // [110_]____ (2-byte lead) + OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4, // [1110]____ (3-byte lead) + OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 | TOO_SHORT_2_3_4 // [1111]____ (4+-byte lead) + ); + const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( + // ____[00__] ________ + OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0000] ________ + OVERLONG_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0001] ________ + TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[01__] ________ + TOO_LARGE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0100] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[10__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + // ____[11__] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, + TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[1101] ________ + TOO_LARGE_2 | TOO_SHORT_2_3_4 | 
LONELY_CONTINUATION, TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION); + return byte_1_high & byte_1_low & byte_2_high; + } - really_inline char advance_char() { - return structurals.advance_char(); - } -}; + really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, + simd8<uint8_t> prev1) + { + simd8<uint8_t> prev2 = input.prev<2>(prev_input); + simd8<uint8_t> prev3 = input.prev<3>(prev_input); + // is_2_3_continuation uses one more instruction than lookup2 + simd8<bool> is_2_3_continuation = (simd8<int8_t>(input).max(simd8<int8_t>(prev1))) < int8_t(-64); + // must_be_2_3_continuation has two fewer instructions than lookup 2 + return simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3) ^ is_2_3_continuation); + } -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) + { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1}; + const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); + return input.gt_bits(max_value); + } -} // namespace stage2 + struct utf8_checker + { + // If this is nonzero, there has been a UTF-8 error. + simd8<uint8_t> error; + // The last input we received + simd8<uint8_t> prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8<uint8_t> prev_incomplete; -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } + // + // Check whether the current bytes are valid UTF-8. 
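
check_special_cases above exploits the fact that these value errors are visible in the first byte plus the high nibble of the second byte, so it does three 4-bit table lookups and ANDs the resulting error sets together. A scalar sketch of that AND trick for just the SURROGATE class (the tables below are illustrative and carry only that single bit, unlike the full tables in the diff):

#include <cassert>
#include <cstdint>

static const uint8_t SURROGATE = 0x08;

// One bit per nibble value that is *compatible* with a surrogate:
// byte 1 must be 0xED, byte 2 must start with 0b101_.
static const uint8_t byte1_hi[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,SURROGATE,0}; // nibble 0xE
static const uint8_t byte1_lo[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,SURROGATE,0,0}; // nibble 0xD
static const uint8_t byte2_hi[16] = {0,0,0,0, 0,0,0,0, 0,0,SURROGATE,SURROGATE, 0,0,0,0}; // 0xA, 0xB

static uint8_t special_cases(uint8_t b1, uint8_t b2) {
  // An error bit survives the AND only if all three nibbles agree with that error.
  return byte1_hi[b1 >> 4] & byte1_lo[b1 & 0xF] & byte2_hi[b2 >> 4];
}

int main() {
  assert(special_cases(0xED, 0xA0) == SURROGATE); // U+D800: rejected
  assert(special_cases(0xED, 0x9F) == 0);         // U+D7FF: fine
  assert(special_cases(0xEE, 0xA0) == 0);         // 3-byte lead other than 0xED: fine
}
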
+ // + really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) + { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8<uint8_t> prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, prev1); + } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + // The only problem that can happen at EOF is that a multibyte character is too short. + really_inline void check_eof() + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } -// -// Object parser states -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_state; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + really_inline void check_next_input(simd8x64<uint8_t> input) + { + if (likely(is_ascii(input))) + { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + else + { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + for (int i = 1; i < simd8x64<uint8_t>::NUM_CHUNKS; i++) + { + this->check_utf8_bytes(input.chunks[i], input.chunks[i - 1]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]; + } + } -object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + really_inline error_code errors() + { + return this->error.any_bits_set_anywhere() ? 
simdjson::UTF8_ERROR : simdjson::SUCCESS; + } -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + }; // struct utf8_checker + } // namespace utf8_validation -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + using utf8_validation::utf8_checker; + /* end file src/generic/stage1/utf8_lookup3_algorithm.h */ + /* begin file src/generic/stage1/json_structural_indexer.h */ + // This file contains the common code every implementation uses in stage1 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is included already includes + // "simdjson/stage1.h" (this simplifies amalgation) -// -// Array parser states -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } + namespace stage1 + { -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); + class bit_indexer + { + public: + uint32_t *tail; -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} -finish: - return parser.finish(); + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + really_inline void write(uint32_t idx, uint64_t bits) + { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + int cnt = static_cast<int>(count_ones(bits)); -error: - return parser.error(); -} + // Do the first 8 all together + for (int i = 0; i < 8; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2_build_tape.h */ -/* begin file src/generic/stage2_streaming_build_tape.h */ -namespace stage2 { + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) + { + for (int i = 8; i < 16; i++) + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {} + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. 
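
bit_indexer::write above flattens a 64-bit mask of structural positions into absolute byte indexes by repeatedly extracting the lowest set bit. The underlying bit trick in scalar form, with the GCC/Clang builtin standing in for trailing_zeroes and without the unrolled 8-at-a-time stores:

#include <cassert>
#include <cstdint>
#include <vector>

// Append idx + (position of each set bit in `bits`) to `out`, lowest bit first:
// the same flattening bit_indexer::write performs into the structural index buffer.
static void write_indexes(uint32_t idx, uint64_t bits, std::vector<uint32_t> &out) {
  while (bits != 0) {
    out.push_back(idx + __builtin_ctzll(bits)); // trailing_zeroes
    bits &= bits - 1;                           // clear_lowest_bit
  }
}

int main() {
  std::vector<uint32_t> out;
  // Bits 0, 5 and 63 set: structural characters at offsets 128, 133 and 191.
  write_indexes(128, (1ULL << 0) | (1ULL << 5) | (1ULL << 63), out);
  assert(out == (std::vector<uint32_t>{128, 133, 191}));
}
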
+ if (unlikely(cnt > 16)) + { + int i = 16; + do + { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - doc_parser.init_stage2(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + this->tail += cnt; + } + }; - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; + class json_structural_indexer + { + public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template <size_t STEP_SIZE> + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; -} // namespace stage2 + private: + really_inline json_structural_indexer(uint32_t *structural_indexes); + template <size_t STEP_SIZE> + really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept; + really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. 
- ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; + }; -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + // + // PERF NOTES: + // We pipe 2 inputs through these stages: + // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load + // 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. + // 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. + // The output of step 1 depends entirely on this information. These functions don't quite use + // up enough CPU: the second half of the functions is highly serial, only using 1 execution core + // at a time. The second input's scans has some dependency on the first ones finishing it, but + // they can make a lot of progress before they need that information. + // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that + // to finish: utf-8 checks and generating the output from the last iteration. + // + // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all + // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough + // workout. 
+ // + template <size_t STEP_SIZE> + error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept + { + if (unlikely(len > parser.capacity())) + { + return CAPACITY; + } + if (partial) + { + len = trim_partial_utf8(buf, len); + } -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + buf_block_reader<STEP_SIZE> reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + // Read all but the last block + while (reader.has_full_block()) + { + indexer.step<STEP_SIZE>(reader.full_block(), reader); + } -// -// Array parser parsers -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) + { + return EMPTY; + } + indexer.step<STEP_SIZE>(block, reader); -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); + return indexer.finish(parser, reader.block_index(), len, partial); + } -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + template <> + really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + simd::simd8x64<uint8_t> in_2(block + 64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index() + 64); + reader.advance(); + } -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); + template <> + really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept + { + simd::simd8x64<uint8_t> in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); + } -error: - return parser.error(); -} -/* end file src/generic/stage2_streaming_build_tape.h */ + really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) + { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx - 64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); + } + really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) + { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx - 64), prev_structurals); + + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) + { + return error; + } + + if (unescaped_chars_error) + { + return UNESCAPED_CHARS; + } + + parser.n_structural_indexes = uint32_t(indexer.tail - 
parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) + { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) + { + return UNEXPECTED_ERROR; + } + if (partial) + { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) + { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); + } + + } // namespace stage1 + /* end file src/generic/stage1/json_structural_indexer.h */ + WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept + { + this->buf = _buf; + this->len = _len; + return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); + } + /* begin file src/generic/stage1/utf8_validator.h */ + namespace stage1 + { + /** + * Validates that the string is actual UTF-8. 
+ */ + template <class checker> + bool generic_validate_utf8(const uint8_t *input, size_t length) + { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) + { + simd::simd8x64<uint8_t> in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64<uint8_t> in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; + } + + bool generic_validate_utf8(const char *input, size_t length) + { + return generic_validate_utf8<utf8_checker>((const uint8_t *)input, length); + } + + } // namespace stage1 + /* end file src/generic/stage1/utf8_validator.h */ + WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept + { + return simdjson::westmere::stage1::generic_validate_utf8(buf, len); + } + } // namespace westmere } // namespace simdjson UNTARGET_REGION -#endif // SIMDJSON_HASWELL_STAGE2_BUILD_TAPE_H -/* end file src/generic/stage2_streaming_build_tape.h */ -#endif -#if SIMDJSON_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/stage2_build_tape.h */ -#ifndef SIMDJSON_WESTMERE_STAGE2_BUILD_TAPE_H -#define SIMDJSON_WESTMERE_STAGE2_BUILD_TAPE_H - -/* westmere/implementation.h already included: #include "westmere/implementation.h" */ +// +// Stage 2 +// /* begin file src/westmere/stringparsing.h */ #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H #define SIMDJSON_WESTMERE_STRINGPARSING_H /* jsoncharutils.h already included: #include "jsoncharutils.h" */ /* westmere/simd.h already included: #include "westmere/simd.h" */ /* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ /* westmere/bitmanipulation.h already included: #include "westmere/bitmanipulation.h" */ TARGET_WESTMERE -namespace simdjson::westmere { +namespace simdjson +{ + namespace westmere + { -using namespace simd; + using namespace simd; -// Holds backslashes and quotes locations. -struct backslash_and_quote { -public: - static constexpr uint32_t BYTES_PROCESSED = 32; - really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + // Holds backslashes and quotes locations. 
+ struct backslash_and_quote + { + public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); - really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } - really_inline bool has_backslash() { return bs_bits != 0; } - really_inline int quote_index() { return trailing_zeroes(quote_bits); } - really_inline int backslash_index() { return trailing_zeroes(bs_bits); } + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return bs_bits != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } - uint32_t bs_bits; - uint32_t quote_bits; -}; // struct backslash_and_quote + uint32_t bs_bits; + uint32_t quote_bits; + }; // struct backslash_and_quote -really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); - simd8<uint8_t> v0(src); - simd8<uint8_t> v1(src + 16); - v0.store(dst); - v1.store(dst + 16); - uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); - return { - static_cast<uint32_t>(bs_and_quote), // bs_bits - static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits - }; -} + really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) + { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8<uint8_t> v0(src); + simd8<uint8_t> v1(src + 16); + v0.store(dst); + v1.store(dst + 16); + uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; + } -/* begin file src/generic/stringparsing.h */ -// This file contains the common code every implementation uses -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "stringparsing.h" (this simplifies amalgation) + /* begin file src/generic/stage2/stringparsing.h */ + // This file contains the common code every implementation uses + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "stringparsing.h" (this simplifies amalgation) -namespace stringparsing { + namespace stage2 + { + namespace stringparsing + { -// begin copypasta -// These chars yield themselves: " \ / -// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab -// u not handled in this table as it's complex -static const uint8_t escape_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. 
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // begin copypasta + // These chars yield themselves: " \ / + // b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab + // u not handled in this table as it's complex + static const uint8_t escape_map[256] = { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x0. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x22, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x2f, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. - 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. - 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x4. + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x5c, + 0, + 0, + 0, // 0x5. + 0, + 0, + 0x08, + 0, + 0, + 0, + 0x0c, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x0a, + 0, // 0x6. + 0, + 0, + 0x0d, + 0, + 0x09, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // 0x7. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; -// handle a unicode codepoint -// write appropriate values into dest -// src will advance 6 bytes or 12 bytes -// dest will advance a variable amount (return via pointer) -// return true if the unicode codepoint was valid -// We work in little-endian then swap at write time -WARN_UNUSED -really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, - uint8_t **dst_ptr) { - // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the - // conversion isn't valid; we defer the check for this to inside the - // multilingual plane check - uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); - *src_ptr += 6; - // check for low surrogate for characters outside the Basic - // Multilingual Plane. 
- if (code_point >= 0xd800 && code_point < 0xdc00) { - if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { - return false; - } - uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); + // handle a unicode codepoint + // write appropriate values into dest + // src will advance 6 bytes or 12 bytes + // dest will advance a variable amount (return via pointer) + // return true if the unicode codepoint was valid + // We work in little-endian then swap at write time + WARN_UNUSED + really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) + { + // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + // check for low surrogate for characters outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) + { + if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') + { + return false; + } + uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); - // if the first code point is invalid we will get here, as we will go past - // the check for being outside the Basic Multilingual plane. If we don't - // find a \u immediately afterwards we fail out anyhow, but if we do, - // this check catches both the case of the first code point being invalid - // or the second code point being invalid. - if ((code_point | code_point_2) >> 16) { - return false; - } + // if the first code point is invalid we will get here, as we will go past + // the check for being outside the Basic Multilingual plane. If we don't + // find a \u immediately afterwards we fail out anyhow, but if we do, + // this check catches both the case of the first code point being invalid + // or the second code point being invalid. + if ((code_point | code_point_2) >> 16) + { + return false; + } - code_point = - (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; - *src_ptr += 6; - } - size_t offset = codepoint_to_utf8(code_point, *dst_ptr); - *dst_ptr += offset; - return offset > 0; -} + code_point = + (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; + *src_ptr += 6; + } + size_t offset = codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; + } -WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { - src++; - while (1) { - // Copy the next n bytes, and find the backslash and quote in them. - auto bs_quote = backslash_and_quote::copy_and_find(src, dst); - // If the next thing is the end quote, copy and return - if (bs_quote.has_quote_first()) { - // we encountered quotes first. Move dst to point to quotes and exit - return dst + bs_quote.quote_index(); - } - if (bs_quote.has_backslash()) { - /* find out where the backspace is */ - auto bs_dist = bs_quote.backslash_index(); - uint8_t escape_char = src[bs_dist + 1]; - /* we encountered backslash first. Handle backslash */ - if (escape_char == 'u') { - /* move src/dst up to the start; they will be further adjusted + WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) + { + src++; + while (1) + { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) + { + // we encountered quotes first. 
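
handle_unicode_codepoint above folds a high/low surrogate pair coming from two \u escapes into one code point with (((hi - 0xd800) << 10) | (lo - 0xdc00)) + 0x10000. A quick worked check of that formula on a few illustrative values:

#include <cassert>
#include <cstdint>

static uint32_t combine_surrogates(uint32_t hi, uint32_t lo) {
  return (((hi - 0xD800u) << 10) | (lo - 0xDC00u)) + 0x10000u;
}

int main() {
  // "\uD83D\uDE00" in a JSON string is U+1F600.
  assert(combine_surrogates(0xD83D, 0xDE00) == 0x1F600);
  // Lowest and highest code points reachable through surrogate pairs.
  assert(combine_surrogates(0xD800, 0xDC00) == 0x10000);
  assert(combine_surrogates(0xDBFF, 0xDFFF) == 0x10FFFF);
}
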
Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) + { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') + { + /* move src/dst up to the start; they will be further adjusted within the unicode codepoint handling code. */ - src += bs_dist; - dst += bs_dist; - if (!handle_unicode_codepoint(&src, &dst)) { - return nullptr; - } - } else { - /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) + { + return nullptr; + } + } + else + { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and * write bs_dist+1 characters to output * note this may reach beyond the part of the buffer we've actually * seen. I think this is ok */ - uint8_t escape_result = escape_map[escape_char]; - if (escape_result == 0u) { - return nullptr; /* bogus escape value is an error */ - } - dst[bs_dist] = escape_result; - src += bs_dist + 2; - dst += bs_dist + 1; - } - } else { - /* they are the same. Since they can't co-occur, it means we + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) + { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } + else + { + /* they are the same. Since they can't co-occur, it means we * encountered neither. */ - src += backslash_and_quote::BYTES_PROCESSED; - dst += backslash_and_quote::BYTES_PROCESSED; - } - } - /* can't be reached */ - return nullptr; -} + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; + } -} // namespace stringparsing -/* end file src/generic/stringparsing.h */ + } // namespace stringparsing + } // namespace stage2 + /* end file src/generic/stage2/stringparsing.h */ -} // namespace simdjson::westmere + } // namespace westmere +} // namespace simdjson UNTARGET_REGION #endif // SIMDJSON_WESTMERE_STRINGPARSING_H -/* end file src/generic/stringparsing.h */ +/* end file src/generic/stage2/stringparsing.h */ /* begin file src/westmere/numberparsing.h */ #ifndef SIMDJSON_WESTMERE_NUMBERPARSING_H #define SIMDJSON_WESTMERE_NUMBERPARSING_H /* jsoncharutils.h already included: #include "jsoncharutils.h" */ /* westmere/intrinsics.h already included: #include "westmere/intrinsics.h" */ /* westmere/bitmanipulation.h already included: #include "westmere/bitmanipulation.h" */ #include <cmath> #include <limits> - #ifdef JSON_TEST_NUMBERS // for unit testing void found_invalid_number(const uint8_t *buf); void found_integer(int64_t result, const uint8_t *buf); void found_unsigned_integer(uint64_t result, const uint8_t *buf); void found_float(double result, const uint8_t *buf); #endif - TARGET_WESTMERE -namespace simdjson::westmere { -static inline uint32_t parse_eight_digits_unrolled(const char *chars) { - // this actually computes *16* values so we are being wasteful. 
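
parse_eight_digits_unrolled above reduces eight ASCII digits with a multiply-add cascade: adjacent digits become two-digit numbers, adjacent pairs become four-digit numbers, and one final step yields the eight-digit value. The same cascade written out in scalar form (the SSE version performs these steps across a whole register with _mm_maddubs_epi16 and _mm_madd_epi16):

#include <cassert>
#include <cstdint>

static uint32_t parse_eight_digits_scalar(const char *chars) {
  uint32_t d[8];
  for (int i = 0; i < 8; i++) d[i] = uint32_t(chars[i] - '0');
  // pairs: 10*d0+d1, 10*d2+d3, ...  (what the maddubs step computes)
  uint32_t p0 = 10 * d[0] + d[1], p1 = 10 * d[2] + d[3];
  uint32_t p2 = 10 * d[4] + d[5], p3 = 10 * d[6] + d[7];
  // quads: 100*p0+p1, 100*p2+p3    (the first madd step)
  uint32_t q0 = 100 * p0 + p1, q1 = 100 * p2 + p3;
  // final: 10000*q0+q1             (the last madd step)
  return 10000 * q0 + q1;
}

int main() {
  assert(parse_eight_digits_scalar("12345678") == 12345678u);
  assert(parse_eight_digits_scalar("00000042") == 42u);
}
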
- const __m128i ascii0 = _mm_set1_epi8('0'); - const __m128i mul_1_10 = - _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); - const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); - const __m128i mul_1_10000 = - _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); - const __m128i input = _mm_sub_epi8( - _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0); - const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); - const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); - const __m128i t3 = _mm_packus_epi32(t2, t2); - const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); - return _mm_cvtsi128_si32( - t4); // only captures the sum of the first 8 digits, drop the rest -} +namespace simdjson +{ + namespace westmere + { + static inline uint32_t parse_eight_digits_unrolled(const char *chars) + { + // this actually computes *16* values so we are being wasteful. + const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest + } #define SWAR_NUMBER_PARSING -/* begin file src/generic/numberparsing.h */ -namespace numberparsing { + /* begin file src/generic/stage2/numberparsing.h */ + namespace stage2 + { + namespace numberparsing + { +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (false) +#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE)) +#endif -// Attempts to compute i * 10^(power) exactly; and if "negative" is -// true, negate the result. -// This function will only work in some cases, when it does not work, success is -// set to false. This should work *most of the time* (like 99% of the time). -// We assume that power is in the [FASTFLOAT_SMALLEST_POWER, -// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. -really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, - bool *success) { - // we start with a fast path - // It was described in - // Clinger WD. How to read floating point numbers accurately. - // ACM SIGPLAN Notices. 1990 - if (-22 <= power && power <= 22 && i <= 9007199254740991) { - // convert the integer into a double. This is lossless since - // 0 <= i <= 2^53 - 1. - double d = i; - // - // The general idea is as follows. - // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then - // 1) Both s and p can be represented exactly as 64-bit floating-point - // values - // (binary64). 
- // 2) Because s and p can be represented exactly as floating-point values, - // then s * p - // and s / p will produce correctly rounded values. - // - if (power < 0) { - d = d / power_of_ten[-power]; - } else { - d = d * power_of_ten[power]; - } - if (negative) { - d = -d; - } - *success = true; - return d; - } - // When 22 < power && power < 22 + 16, we could - // hope for another, secondary fast path. It wa - // described by David M. Gay in "Correctly rounded - // binary-decimal and decimal-binary conversions." (1990) - // If you need to compute i * 10^(22 + x) for x < 16, - // first compute i * 10^x, if you know that result is exact - // (e.g., when i * 10^x < 2^53), - // then you can still proceed and do (i * 10^x) * 10^22. - // Is this worth your time? - // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) - // for this second fast path to work. - // If you you have 22 < power *and* power < 22 + 16, and then you - // optimistically compute "i * 10^(x-22)", there is still a chance that you - // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of - // this optimization maybe less common than we would like. Source: - // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ - // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + // Attempts to compute i * 10^(power) exactly; and if "negative" is + // true, negate the result. + // This function will only work in some cases, when it does not work, success is + // set to false. This should work *most of the time* (like 99% of the time). + // We assume that power is in the [FASTFLOAT_SMALLEST_POWER, + // FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check. + really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) + { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) + { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) + { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + double d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) + { + d = d / power_of_ten[-power]; + } + else + { + d = d * power_of_ten[power]; + } + if (negative) + { + d = -d; + } + *success = true; + return d; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It wa + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. 
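
The fast path described above works because every integer below 2^53 and every power of ten up to 10^22 is exactly representable in binary64, so a single multiply or divide gives the correctly rounded result. A trimmed-down sketch of that case (power_of_ten here is shortened for illustration and fast_path is a hypothetical stand-in, not the function in the diff):

#include <cassert>
#include <cstdint>

static const double power_of_ten[] = {1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6};

// Clinger-style fast path: exact when |power| is small and i < 2^53.
static bool fast_path(int64_t power, uint64_t i, bool negative, double *out) {
  if (power < -6 || power > 6 || i > 9007199254740991ULL) return false; // out of (trimmed) range
  double d = double(i);                                  // lossless: i < 2^53
  d = (power < 0) ? d / power_of_ten[-power] : d * power_of_ten[power];
  *out = negative ? -d : d;
  return true;
}

int main() {
  double d;
  // "3.14159" is scanned as i = 314159 with a decimal exponent of -5.
  assert(fast_path(-5, 314159, false, &d) && d == 3.14159);
  // "-25e3" is i = 25, power = 3, negative.
  assert(fast_path(3, 25, true, &d) && d == -25000.0);
}
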
+ // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html - // The fast path has now failed, so we are failing back on the slower path. + // The fast path has now failed, so we are failing back on the slower path. - // In the slow path, we need to adjust i so that it is > 1<<63 which is always - // possible, except if i == 0, so we handle i == 0 separately. - if(i == 0) { - return 0.0; - } + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if (i == 0) + { + return 0.0; + } - // We are going to need to do some 64-bit arithmetic to get a more precise product. - // We use a table lookup approach. - components c = - power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; - // safe because - // power >= FASTFLOAT_SMALLEST_POWER - // and power <= FASTFLOAT_LARGEST_POWER - // we recover the mantissa of the power, it has a leading 1. It is always - // rounded down. - uint64_t factor_mantissa = c.mantissa; + // We are going to need to do some 64-bit arithmetic to get a more precise product. + // We use a table lookup approach. + components c = + power_of_ten_components[power - FASTFLOAT_SMALLEST_POWER]; + // safe because + // power >= FASTFLOAT_SMALLEST_POWER + // and power <= FASTFLOAT_LARGEST_POWER + // we recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + uint64_t factor_mantissa = c.mantissa; - // We want the most significant bit of i to be 1. Shift if needed. - int lz = leading_zeroes(i); - i <<= lz; - // We want the most significant 64 bits of the product. We know - // this will be non-zero because the most significant bit of i is - // 1. - value128 product = full_multiplication(i, factor_mantissa); - uint64_t lower = product.low; - uint64_t upper = product.high; + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + value128 product = full_multiplication(i, factor_mantissa); + uint64_t lower = product.low; + uint64_t upper = product.high; - // We know that upper has at most one leading zero because - // both i and factor_mantissa have a leading one. This means - // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). + // We know that upper has at most one leading zero because + // both i and factor_mantissa have a leading one. This means + // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64). - // As long as the first 9 bits of "upper" are not "1", then we - // know that we have an exact computed value for the leading - // 55 bits because any imprecision would play out as a +1, in - // the worst case. 
- if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) { - uint64_t factor_mantissa_low = - mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; - // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit - // result (three 64-bit values) - product = full_multiplication(i, factor_mantissa_low); - uint64_t product_low = product.low; - uint64_t product_middle2 = product.high; - uint64_t product_middle1 = lower; - uint64_t product_high = upper; - uint64_t product_middle = product_middle1 + product_middle2; - if (product_middle < product_middle1) { - product_high++; // overflow carry - } - // We want to check whether mantissa *i + i would affect our result. - // This does happen, e.g. with 7.3177701707893310e+15. - if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && - (product_low + i < product_low))) { // let us be prudent and bail out. - *success = false; - return 0; - } - upper = product_high; - lower = product_middle; - } - // The final mantissa should be 53 bits with a leading 1. - // We shift it so that it occupies 54 bits with a leading 1. - /////// - uint64_t upperbit = upper >> 63; - uint64_t mantissa = upper >> (upperbit + 9); - lz += 1 ^ upperbit; + // As long as the first 9 bits of "upper" are not "1", then we + // know that we have an exact computed value for the leading + // 55 bits because any imprecision would play out as a +1, in + // the worst case. + if (unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) + { + uint64_t factor_mantissa_low = + mantissa_128[power - FASTFLOAT_SMALLEST_POWER]; + // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit + // result (three 64-bit values) + product = full_multiplication(i, factor_mantissa_low); + uint64_t product_low = product.low; + uint64_t product_middle2 = product.high; + uint64_t product_middle1 = lower; + uint64_t product_high = upper; + uint64_t product_middle = product_middle1 + product_middle2; + if (product_middle < product_middle1) + { + product_high++; // overflow carry + } + // We want to check whether mantissa *i + i would affect our result. + // This does happen, e.g. with 7.3177701707893310e+15. + if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) && + (product_low + i < product_low))) + { // let us be prudent and bail out. + *success = false; + return 0; + } + upper = product_high; + lower = product_middle; + } + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); - // Here we have mantissa < (1<<54). + // Here we have mantissa < (1<<54). - // We have to round to even. The "to even" part - // is only a problem when we are right in between two floats - // which we guard against. - // If we have lots of trailing zeros, we may fall right between two - // floating-point values. - if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && - ((mantissa & 3) == 1))) { - // if mantissa & 1 == 1 we might need to round up. - // - // Scenarios: - // 1. We are not in the middle. Then we should round up. - // - // 2. We are right in the middle. Whether we round up depends - // on the last significant bit: if it is "one" then we round - // up (round to even) otherwise, we do not. - // - // So if the last significant bit is 1, we can safely round up. - // Hence we only need to bail out if (mantissa & 3) == 1. 
- // Otherwise we may need more accuracy or analysis to determine whether - // we are exactly between two floating-point numbers. - // It can be triggered with 1e23. - // Note: because the factor_mantissa and factor_mantissa_low are - // almost always rounded down (except for small positive powers), - // almost always should round up. - *success = false; - return 0; - } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + if (unlikely((lower == 0) && ((upper & 0x1FF) == 0) && + ((mantissa & 3) == 1))) + { + // if mantissa & 1 == 1 we might need to round up. + // + // Scenarios: + // 1. We are not in the middle. Then we should round up. + // + // 2. We are right in the middle. Whether we round up depends + // on the last significant bit: if it is "one" then we round + // up (round to even) otherwise, we do not. + // + // So if the last significant bit is 1, we can safely round up. + // Hence we only need to bail out if (mantissa & 3) == 1. + // Otherwise we may need more accuracy or analysis to determine whether + // we are exactly between two floating-point numbers. + // It can be triggered with 1e23. + // Note: because the factor_mantissa and factor_mantissa_low are + // almost always rounded down (except for small positive powers), + // almost always should round up. + *success = false; + return 0; + } - mantissa += mantissa & 1; - mantissa >>= 1; + mantissa += mantissa & 1; + mantissa >>= 1; - // Here we have mantissa < (1<<53), unless there was an overflow - if (mantissa >= (1ULL << 53)) { - ////////// - // This will happen when parsing values such as 7.2057594037927933e+16 - //////// - mantissa = (1ULL << 52); - lz--; // undo previous addition - } - mantissa &= ~(1ULL << 52); - uint64_t real_exponent = c.exp - lz; - // we have to check that real_exponent is in range, otherwise we bail out - if (unlikely((real_exponent < 1) || (real_exponent > 2046))) { - *success = false; - return 0; - } - mantissa |= real_exponent << 52; - mantissa |= (((uint64_t)negative) << 63); - double d; - memcpy(&d, &mantissa, sizeof(d)); - *success = true; - return d; -} + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) + { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + lz--; // undo previous addition + } + mantissa &= ~(1ULL << 52); + uint64_t real_exponent = c.exp - lz; + // we have to check that real_exponent is in range, otherwise we bail out + if (unlikely((real_exponent < 1) || (real_exponent > 2046))) + { + *success = false; + return 0; + } + mantissa |= real_exponent << 52; + mantissa |= (((uint64_t)negative) << 63); + double d; + memcpy(&d, &mantissa, sizeof(d)); + *success = true; + return d; + } // namespace numberparsing -static bool parse_float_strtod(const char *ptr, double *outDouble) { - char *endptr; - *outDouble = strtod(ptr, &endptr); - // Some libraries will set errno = ERANGE when the value is subnormal, - // yet we may want to be able to parse subnormal values. - // However, we do not want to tolerate NAN or infinite values. - // - // Values like infinity or NaN are not allowed in the JSON specification. - // If you consume a large value and you map it to "infinity", you will no - // longer be able to serialize back a standard-compliant JSON. 
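
parse_float_strtod accepts whatever strtod produces but rejects the result when nothing was consumed or the value is not finite, because JSON has no Infinity or NaN and a literal too large for binary64 is treated as an error. The two rejection conditions, illustrated with a hypothetical wrapper:

#include <cassert>
#include <cmath>
#include <cstdlib>

static bool parse_finite_double(const char *ptr, double *out) {
  char *endptr;
  *out = std::strtod(ptr, &endptr);
  return (endptr != ptr) && std::isfinite(*out);
}

int main() {
  double d;
  assert(parse_finite_double("1e308", &d));   // large but still representable
  assert(!parse_finite_double("1e400", &d));  // overflows to +infinity, rejected
  assert(!parse_finite_double("abc", &d));    // nothing consumed, rejected
}
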
And there is - // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × - // 10^308 It is an unimaginable large number. There will never be any piece of - // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number - // of electrons is similar. Using a double-precision floating-point value, we - // can represent easily the number of atoms in the universe. We could also - // represent the number of ways you can pick any three individual atoms at - // random in the universe. If you ever encounter a number much larger than - // 10^308, you know that you have a bug. RapidJSON will reject a document with - // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) - // will flat out throw an exception. - // - if ((endptr == ptr) || (!std::isfinite(*outDouble))) { - return false; - } - return true; -} + static bool parse_float_strtod(const char *ptr, double *outDouble) + { + char *endptr; + *outDouble = strtod(ptr, &endptr); + // Some libraries will set errno = ERANGE when the value is subnormal, + // yet we may want to be able to parse subnormal values. + // However, we do not want to tolerate NAN or infinite values. + // + // Values like infinity or NaN are not allowed in the JSON specification. + // If you consume a large value and you map it to "infinity", you will no + // longer be able to serialize back a standard-compliant JSON. And there is + // no realistic application where you might need values so large than they + // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // 10^308 It is an unimaginable large number. There will never be any piece of + // engineering involving as many as 10^308 parts. It is estimated that there + // are about 10^80 atoms in the universe. The estimate for the total number + // of electrons is similar. Using a double-precision floating-point value, we + // can represent easily the number of atoms in the universe. We could also + // represent the number of ways you can pick any three individual atoms at + // random in the universe. If you ever encounter a number much larger than + // 10^308, you know that you have a bug. RapidJSON will reject a document with + // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) + // will flat out throw an exception. + // + if ((endptr == ptr) || (!std::isfinite(*outDouble))) + { + return false; + } + return true; + } -really_inline bool is_integer(char c) { - return (c >= '0' && c <= '9'); - // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers -} + really_inline bool is_integer(char c) + { + return (c >= '0' && c <= '9'); + // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers + } -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... 
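    // Illustrative sketch (hypothetical helper, not simdjson's own code): compute_float_64()
    // above finishes by packing the 52 stored mantissa bits, the biased exponent and the
    // sign into a single 64-bit word and memcpy-ing it into a double. A standalone version
    // of that final packing step, assuming a biased exponent already in the normal range [1, 2046]:
    #include <cstdint>
    #include <cstring>

    static double assemble_binary64(uint64_t mantissa52, uint64_t biased_exponent, bool negative)
    {
      uint64_t bits = mantissa52 & ((uint64_t(1) << 52) - 1); // keep only the 52 stored mantissa bits
      bits |= biased_exponent << 52;                          // exponent field (normal values: 1..2046)
      bits |= uint64_t(negative) << 63;                       // sign bit
      double d;
      std::memcpy(&d, &bits, sizeof(d));                      // bit-cast without undefined behavior
      return d;
    }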
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // check quickly whether the next 8 chars are made of digits + // at a glance, it looks better than Mula's + // http://0x80.pl/articles/swar-digits-validate.html + really_inline bool is_made_of_eight_digits_fast(const char *chars) + { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); + } -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} + template <typename W> + bool slow_float_parsing(UNUSED const char *src, W writer) + { + double d; + if (parse_float_strtod(src, &d)) + { + WRITE_DOUBLE(d, (const uint8_t *)src, writer); + return true; + } + return INVALID_NUMBER((const uint8_t *)src); + } -// check quickly whether the next 8 chars are made of digits -// at a glance, it looks better than Mula's -// http://0x80.pl/articles/swar-digits-validate.html -really_inline bool is_made_of_eight_digits_fast(const char *chars) { - uint64_t val; - // this can read up to 7 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(7 <= SIMDJSON_PADDING); - memcpy(&val, chars, 8); - // a branchy method might be faster: - // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) - // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == - // 0x3030303030303030); - return (((val & 0xF0F0F0F0F0F0F0F0) | - (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == - 0x3333333333333333); -} + really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) + { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const char *const first_after_period = p; + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // There must be at least one digit after the . -// called by parse_number when we know that the output is an integer, -// but where there might be some integer overflow. -// we want to catch overflows! 
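    // Illustrative sketch (hypothetical helper, not simdjson's): the SWAR trick used by
    // is_made_of_eight_digits_fast() above checks eight bytes at once. For ASCII '0'..'9'
    // the high nibble of every byte is 0x3 and adding 6 to the low nibble does not carry
    // into it; any other byte fails one of the two conditions. This standalone version
    // reads from a fixed-size array, so it needs no padding guarantee:
    #include <array>
    #include <cstdint>
    #include <cstring>

    static bool eight_ascii_digits(const std::array<char, 8> &chars)
    {
      uint64_t val;
      std::memcpy(&val, chars.data(), sizeof(val));
      return (((val & 0xF0F0F0F0F0F0F0F0) |
               (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
              0x3333333333333333);
    }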
-// Do not call this function directly as it skips some of the checks from -// parse_number -// -// This function will almost never be called!!! -// -never_inline bool parse_large_integer(const uint8_t *const src, - parser &parser, - bool found_minus) { - const char *p = reinterpret_cast<const char *>(src); - - bool negative = false; - if (found_minus) { - ++p; - negative = true; - } - uint64_t i; - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - i = 0; - } else { - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - if (mul_overflow(i, 10, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); + unsigned char digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // might overflow + multiplication by 10 is likely + // cheaper than arbitrary mult. + // we will handle the overflow later +#ifdef SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } #endif - return false; // overflow - } - if (add_overflow(i, digit, &i)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } - ++p; - } - } - if (negative) { - if (i > 0x8000000000000000) { - // overflows! -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; // overflow - } else if (i == 0x8000000000000000) { - // In two's complement, we cannot represent 0x8000000000000000 - // as a positive signed integer, but the negative version is - // possible. - constexpr int64_t signed_answer = INT64_MIN; - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } else { - // we can negate safely - int64_t signed_answer = -static_cast<int64_t>(i); - parser.on_number_s64(signed_answer); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(signed_answer, src); -#endif - } - } else { - // we have a positive integer, the contract is that - // we try to represent it as a signed integer and only - // fallback on unsigned integers if absolutely necessary. - if (i < 0x8000000000000000) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - parser.on_number_s64(i); - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_unsigned_integer(i, src); -#endif - parser.on_number_u64(i); - } - } - return is_structural_or_whitespace(*p); -} + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + // because we have parse_highprecision_float later. 
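    // Illustrative sketch (hypothetical helper, not simdjson's portability wrappers): the
    // removed parse_large_integer() above relied on mul_overflow()/add_overflow() to detect
    // when "i = 10 * i + digit" no longer fits in 64 bits, while the new parse_decimal()
    // lets the value wrap and relies on the digit count instead. On GCC/Clang the checked
    // step can be written with the overflow builtins:
    #include <cstdint>

    static bool accumulate_digit_checked(uint64_t &i, unsigned char digit)
    {
      uint64_t tmp;
      if (__builtin_mul_overflow(i, uint64_t(10), &tmp)) { return false; }    // 10 * i overflowed
      if (__builtin_add_overflow(tmp, uint64_t(digit), &tmp)) { return false; } // + digit overflowed
      i = tmp;
      return true; // i now holds 10 * i + digit exactly
    }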
+ } + exponent = first_after_period - p; + return true; + } -bool slow_float_parsing(UNUSED const char * src, parser &parser) { - double d; - if (parse_float_strtod(src, &d)) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, (const uint8_t *)src); -#endif - return true; - } -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number((const uint8_t *)src); -#endif - return false; -} + really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) + { + bool neg_exp = false; + if ('-' == *p) + { + neg_exp = true; + ++p; + } + else if ('+' == *p) + { + ++p; + } -// parse the number at src -// define JSON_TEST_NUMBERS for unit testing -// -// It is assumed that the number is followed by a structural ({,},],[) character -// or a white space character. If that is not the case (e.g., when the JSON -// document is made of a single number), then it is necessary to copy the -// content and append a space before calling this function. -// -// Our objective is accurate parsing (ULP of 0) at high speed. -really_inline bool parse_number(UNUSED const uint8_t *const src, - UNUSED bool found_minus, - parser &parser) { -#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes + // e[+-] must be followed by a number + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + unsigned char digit = static_cast<unsigned char>(*p - '0'); + int64_t exp_number = digit; + p++; + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + if (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + while (is_integer(*p)) + { + // we need to check for overflows; we refuse to parse this + if (exp_number > 0x100000000) + { + return INVALID_NUMBER(src); + } + digit = static_cast<unsigned char>(*p - '0'); + exp_number = 10 * exp_number + digit; + ++p; + } + exponent += (neg_exp ? -exp_number : exp_number); + return true; + } + + template <typename W> + really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char *start_digits, int digit_count, int64_t exponent, W &writer) + { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // digit count is off by 1 because of the decimal (assuming there was one). + if (unlikely((digit_count - 1 >= 19))) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const char *start = start_digits; + while ((*start == '0') || (*start == '.')) + { + start++; + } + // we over-decrement by one when there is a '.' + digit_count -= int(start - start_digits); + if (digit_count >= 19) + { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
+ // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; + } + } + // NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) + { + // this is almost never going to get called!!! + // we start anew, going slowly!!! + bool success = slow_float_parsing((const char *)src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; + } + bool success = true; + double d = compute_float_64(exponent, i, negative, &success); + if (!success) + { + // we are almost never going to get here. + if (!parse_float_strtod((const char *)src, &d)) + { + return INVALID_NUMBER(src); + } + } + WRITE_DOUBLE(d, src, writer); + return true; + } + + // parse the number at src + // define JSON_TEST_NUMBERS for unit testing + // + // It is assumed that the number is followed by a structural ({,},],[) character + // or a white space character. If that is not the case (e.g., when the JSON + // document is made of a single number), then it is necessary to copy the + // content and append a space before calling this function. + // + // Our objective is accurate parsing (ULP of 0) at high speed. 
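    // Worked example of the digit-count guard in write_float() above, using the hypothetical
    // input "0.00000000000000000001" (a zero, a dot, nineteen zeros, then a one: 22 characters).
    // The raw count p - start_digits is 22, which looks like 64-bit overflow territory, but
    // after skipping the leading zeros and the '.' only one significant digit remains, so the
    // fast path is still exact. A simplified standalone re-count (it ignores a '.' that appears
    // after the first significant digit, which the real code accounts for separately):
    static int significant_digit_count(const char *start_digits, const char *one_past_end)
    {
      const char *start = start_digits;
      while (start < one_past_end && (*start == '0' || *start == '.')) { start++; } // skip the non-significant prefix
      return int(one_past_end - start); // 1 for the example above
    }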
+ template <typename W> + really_inline bool parse_number(UNUSED const uint8_t *const src, + UNUSED bool found_minus, + W &writer) + { +#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes \ // useful to skip parsing - parser.on_number_s64(0); // always write zero - return true; // always succeeds + writer.append_s64(0); // always write zero + return true; // always succeeds #else - const char *p = reinterpret_cast<const char *>(src); - bool negative = false; - if (found_minus) { - ++p; - negative = true; - if (!is_integer(*p)) { // a negative sign must be followed by an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } - const char *const start_digits = p; + const char *p = reinterpret_cast<const char *>(src); + bool negative = false; + if (found_minus) + { + ++p; + negative = true; + // a negative sign must be followed by an integer + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } + } + const char *const start_digits = p; - uint64_t i; // an unsigned int avoids signed overflows (which are bad) - if (*p == '0') { // 0 cannot be followed by an integer - ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - i = 0; - } else { - if (!(is_integer(*p))) { // must start with an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - i = digit; - p++; - // the is_made_of_eight_digits_fast routine is unlikely to help here because - // we rarely see large integer parts like 123456789 - while (is_integer(*p)) { - digit = *p - '0'; - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + digit; // might overflow, we will handle the overflow later - ++p; - } - } - int64_t exponent = 0; - bool is_float = false; - if ('.' == *p) { - is_float = true; // At this point we know that we have a float - // we continue with the fiction that we have an integer. If the - // floating point number is representable as x * 10^z for some integer - // z that fits in 53 bits, then we will be able to convert back the - // the integer into a float in a lossless manner. - ++p; - const char *const first_after_period = p; - if (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // might overflow + multiplication by 10 is likely - // cheaper than arbitrary mult. - // we will handle the overflow later - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } -#ifdef SWAR_NUMBER_PARSING - // this helps if we have lots of decimals! - // this turns out to be frequent enough. - if (is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); - p += 8; - } -#endif - while (is_integer(*p)) { - unsigned char digit = *p - '0'; - ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok - // because we have parse_highprecision_float later. 
- } - exponent = first_after_period - p; - } - int digit_count = - p - start_digits - 1; // used later to guard against overflows - int64_t exp_number = 0; // exponential part - if (('e' == *p) || ('E' == *p)) { - is_float = true; - ++p; - bool neg_exp = false; - if ('-' == *p) { - neg_exp = true; - ++p; - } else if ('+' == *p) { - ++p; - } - if (!is_integer(*p)) { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - unsigned char digit = *p - '0'; - exp_number = digit; - p++; - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - if (is_integer(*p)) { - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - while (is_integer(*p)) { - if (exp_number > 0x100000000) { // we need to check for overflows - // we refuse to parse this -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - digit = *p - '0'; - exp_number = 10 * exp_number + digit; - ++p; - } - exponent += (neg_exp ? -exp_number : exp_number); - } - if (is_float) { - // If we frequently had to deal with long strings of digits, - // we could extend our code by using a 128-bit integer instead - // of a 64-bit integer. However, this is uncommon in practice. - if (unlikely((digit_count >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. - // We have to handle the case where we have 0.0000somenumber. - const char *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } - // we over-decrement by one when there is a '.' - digit_count -= (start - start_digits); - if (digit_count >= 19) { - // Ok, chances are good that we had an overflow! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - // This will happen in the following examples: - // 10000000000000000000000000000000000000000000e+308 - // 3.1415926535897932384626433832795028841971693993751 - // - return slow_float_parsing((const char *) src, parser); - } - } - if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || - (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! - // this is almost never going to get called!!! - // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, parser); - } - bool success = true; - double d = compute_float_64(exponent, i, negative, &success); - if (!success) { - // we are almost never going to get here. - success = parse_float_strtod((const char *)src, &d); - } - if (success) { - parser.on_number_double(d); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_float(d, src); -#endif - return true; - } else { -#ifdef JSON_TEST_NUMBERS // for unit testing - found_invalid_number(src); -#endif - return false; - } - } else { - if (unlikely(digit_count >= 18)) { // this is uncommon!!! - // there is a good chance that we had an overflow, so we need - // need to recover: we parse the whole thing again. - return parse_large_integer(src, parser, found_minus); - } - i = negative ? 
0 - i : i; - parser.on_number_s64(i); -#ifdef JSON_TEST_NUMBERS // for unit testing - found_integer(i, src); -#endif - } - return is_structural_or_whitespace(*p); + uint64_t i; // an unsigned int avoids signed overflows (which are bad) + if (*p == '0') + { + ++p; + if (is_integer(*p)) + { + return INVALID_NUMBER(src); + } // 0 cannot be followed by an integer + i = 0; + } + else + { + // NOTE: This is a redundant check--either we're negative, in which case we checked whether this + // is a digit above, or the caller already determined we start with a digit. But removing this + // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448 + // Please do try yourself, or think of ways to explain it--we'd love to understand :) + if (!is_integer(*p)) + { + return INVALID_NUMBER(src); + } // must start with an integer + unsigned char digit = static_cast<unsigned char>(*p - '0'); + i = digit; + p++; + // the is_made_of_eight_digits_fast routine is unlikely to help here because + // we rarely see large integer parts like 123456789 + while (is_integer(*p)) + { + digit = static_cast<unsigned char>(*p - '0'); + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) + { + is_float = true; + ++p; + if (!parse_decimal(src, p, i, exponent)) + { + return false; + } + } + int digit_count = int(p - start_digits); // used later to guard against overflows + if (('e' == *p) || ('E' == *p)) + { + is_float = true; + ++p; + if (!parse_exponent(src, p, exponent)) + { + return false; + } + } + if (is_float) + { + return write_float(src, negative, i, start_digits, digit_count, exponent, writer); + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + int longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) + { + return INVALID_NUMBER(src); + } + if (digit_count == longest_digit_count) + { + // Anything negative above INT64_MAX is either invalid or INT64_MIN. + if (negative && i > uint64_t(INT64_MAX)) + { + // If the number is negative and can't fit in a signed integer, it's invalid. + if (i > uint64_t(INT64_MAX) + 1) + { + return INVALID_NUMBER(src); + } + + // If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN). + // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it. + WRITE_INTEGER(INT64_MIN, src, writer); + return is_structural_or_whitespace(*p); + } + + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // + if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) + { + return INVALID_NUMBER(src); + } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) + { + WRITE_UNSIGNED(i, src, writer); + } + else + { + WRITE_INTEGER(negative ? 0 - i : i, src, writer); + } + return is_structural_or_whitespace(*p); + #endif // SIMDJSON_SKIPNUMBERPARSING -} + } -} // namespace numberparsing -/* end file src/generic/numberparsing.h */ + } // namespace numberparsing + } // namespace stage2 + /* end file src/generic/stage2/numberparsing.h */ -} // namespace simdjson::westmere + } // namespace westmere + +} // namespace simdjson UNTARGET_REGION #endif // SIMDJSON_WESTMERE_NUMBERPARSING_H -/* end file src/generic/numberparsing.h */ +/* end file src/generic/stage2/numberparsing.h */ TARGET_WESTMERE -namespace simdjson::westmere { +namespace simdjson +{ + namespace westmere + { -/* begin file src/generic/atomparsing.h */ -namespace atomparsing { + /* begin file src/generic/stage2/logger.h */ + // This is for an internal-only stage 2 specific logger. + // Set LOG_ENABLED = true to log what stage 2 is doing! + namespace logger + { + static constexpr const char *DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; -really_inline uint32_t string_to_uint32(const char* str) { return *reinterpret_cast<const uint32_t *>(str); } + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; -WARN_UNUSED -really_inline bool str4ncmp(const uint8_t *src, const char* atom) { - uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING); - std::memcpy(&srcval, src, sizeof(uint32_t)); - return srcval ^ string_to_uint32(atom); -} + static int log_depth; // Not threadsafe. Log only. 
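    // Illustrative sketch (hypothetical helper): the integer path above accumulates the
    // magnitude in a uint64_t and, for negative input, writes the bit pattern "0 - i" to
    // the tape. The one magnitude that cannot be negated through int64_t is
    // 9,223,372,036,854,775,808 (INT64_MAX + 1), which is special-cased as INT64_MIN;
    // anything larger with a minus sign is rejected. The same reasoning, written with
    // only well-defined conversions:
    #include <cstdint>

    static bool finalize_signed(uint64_t magnitude, bool negative, int64_t *out)
    {
      if (!negative)
      {
        if (magnitude > uint64_t(INT64_MAX)) { return false; } // caller would emit a uint64 instead
        *out = int64_t(magnitude);
        return true;
      }
      if (magnitude == uint64_t(INT64_MAX) + 1) { *out = INT64_MIN; return true; } // -(2^63)
      if (magnitude > uint64_t(INT64_MAX) + 1) { return false; }                   // out of range
      *out = -int64_t(magnitude);                                                  // safe: magnitude <= INT64_MAX
      return true;
    }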
-WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src) { - return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; -} + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) + { + if (c >= 0x20) + { + return c; + } + else + { + return ' '; + } + } -WARN_UNUSED -really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_true_atom(src); } - else if (len == 4) { return !str4ncmp(src, "true"); } - else { return false; } -} + // Print the header and set up log_start + static really_inline void log_start() + { + if (LOG_ENABLED) + { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN + 2, DASHES, LOG_BUFFER_LEN + 2, DASHES, 4 + 2, DASHES, 4 + 2, DASHES, 5 + 2, DASHES, 5 + 2, DASHES, LOG_DETAIL_LEN + 2, DASHES, LOG_INDEX_LEN + 2, DASHES); + } + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src) { - return (str4ncmp(src+1, "alse") | is_not_structural_or_whitespace(src[5])) == 0; -} + static really_inline void log_string(const char *message) + { + if (LOG_ENABLED) + { + printf("%s\n", message); + } + } -WARN_UNUSED -really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { - if (len > 5) { return is_valid_false_atom(src); } - else if (len == 5) { return !str4ncmp(src+1, "alse"); } - else { return false; } -} + // Logs a single line of + template <typename S> + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) + { + if (LOG_ENABLED) + { + printf("| %*s%s%-*s ", log_depth * 2, "", title_prefix, LOG_EVENT_LEN - log_depth * 2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. 
+ for (int i = 0; i < LOG_BUFFER_LEN; i++) + { + printf("%c", printable_char(structurals.current()[i])); + } + printf(" "); + } + printf("| %c ", printable_char(structurals.current_char())); + printf("| %c ", printable_char(structurals.peek_next_char())); + printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural + 1)]); + printf("| %5u ", structurals.next_tape_index()); + printf("| %-*s ", LOG_DETAIL_LEN, detail); + printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural); + printf("|\n"); + } + } + } // namespace logger -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src) { - return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; -} + /* end file src/generic/stage2/logger.h */ + /* begin file src/generic/stage2/atomparsing.h */ + namespace stage2 + { + namespace atomparsing + { -WARN_UNUSED -really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { - if (len > 4) { return is_valid_null_atom(src); } - else if (len == 4) { return !str4ncmp(src, "null"); } - else { return false; } -} + really_inline uint32_t string_to_uint32(const char *str) { return *reinterpret_cast<const uint32_t *>(str); } -} // namespace atomparsing -/* end file src/generic/atomparsing.h */ -/* begin file src/generic/stage2_build_tape.h */ -// This file contains the common code every implementation uses for stage2 -// It is intended to be included multiple times and compiled multiple times -// We assume the file in which it is include already includes -// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + WARN_UNUSED + really_inline uint32_t str4ncmp(const uint8_t *src, const char *atom) + { + uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); + } -namespace stage2 { + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src) + { + return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0; + } -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address; -#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } -#define GOTO(address) { goto *(address); } -#define CONTINUE(address) { goto *(address); } -#else -typedef char ret_address; -#define INIT_ADDRESSES() { '[', 'a', 'e', 'f', '{', 'o' }; -#define GOTO(address) \ - { \ - switch(address) { \ - case '[': goto array_begin; \ - case 'a': goto array_continue; \ - case 'e': goto error; \ - case 'f': goto finish; \ - case '{': goto object_begin; \ - case 'o': goto object_continue; \ - } \ - } -// For the more constrained end_xxx() situation -#define CONTINUE(address) \ - { \ - switch(address) { \ - case 'a': goto array_continue; \ - case 'o': goto object_continue; \ - case 'f': goto finish; \ - } \ - } -#endif + WARN_UNUSED + really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_true_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "true"); + } + else + { + return false; + } + } -struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; -}; + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src) + { + return (str4ncmp(src + 1, "alse") 
| is_not_structural_or_whitespace(src[5])) == 0; + } -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } + WARN_UNUSED + really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) + { + if (len > 5) + { + return is_valid_false_atom(src); + } + else if (len == 5) + { + return !str4ncmp(src + 1, "alse"); + } + else + { + return false; + } + } -class structural_iterator { -public: - really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {} - really_inline char advance_char() { - idx = structural_indexes[next_structural]; - next_structural++; - c = *current(); - return c; + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src) + { + return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0; + } + + WARN_UNUSED + really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) + { + if (len > 4) + { + return is_valid_null_atom(src); + } + else if (len == 4) + { + return !str4ncmp(src, "null"); + } + else + { + return false; + } + } + + } // namespace atomparsing + } // namespace stage2 + /* end file src/generic/stage2/atomparsing.h */ + /* begin file src/generic/stage2/structural_iterator.h */ + namespace stage2 + { + + class structural_iterator + { + public: + const uint8_t *const buf; + uint32_t *current_structural; + dom_parser_implementation &parser; + + // Start a structural + really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index) + : buf{_parser.buf}, + current_structural{&_parser.structural_indexes[start_structural_index]}, + parser{_parser} + { + } + // Get the buffer position of the current structural character + really_inline const uint8_t *current() + { + return &buf[*current_structural]; + } + // Get the current structural character + really_inline char current_char() + { + return buf[*current_structural]; + } + // Get the next structural character without advancing + really_inline char peek_next_char() + { + return buf[*(current_structural + 1)]; + } + really_inline char advance_char() + { + current_structural++; + return buf[*current_structural]; + } + really_inline size_t remaining_len() + { + return parser.len - *current_structural; + } + + really_inline bool past_end(uint32_t n_structural_indexes) + { + return current_structural >= &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_end(uint32_t n_structural_indexes) + { + return current_structural == &parser.structural_indexes[n_structural_indexes]; + } + really_inline bool at_beginning() + { + return current_structural == parser.structural_indexes.get(); + } + }; + + } // namespace stage2 + /* end file src/generic/stage2/structural_iterator.h */ + /* begin file src/generic/stage2/structural_parser.h */ + // This file contains the common code every implementation uses for stage2 + // It is intended to be included multiple times and compiled multiple times + // We assume the file in which it is include already includes + // "simdjson/stage2.h" (this simplifies amalgation) + + namespace stage2 + { + namespace + { // Make everything here private + + /* begin file src/generic/stage2/tape_writer.h */ + struct tape_writer + { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. 
*/ + really_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + + private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template <typename T> + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; + }; // struct number_writer + + really_inline void tape_writer::append_s64(int64_t value) noexcept + { + append2(0, value, internal::tape_type::INT64); + } + + really_inline void tape_writer::append_u64(uint64_t value) noexcept + { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; + } + + /** Write a double value to tape. 
*/ + really_inline void tape_writer::append_double(double value) noexcept + { + append2(0, value, internal::tape_type::DOUBLE); + } + + really_inline void tape_writer::skip() noexcept + { + next_tape_loc++; + } + + really_inline void tape_writer::skip_large_integer() noexcept + { + next_tape_loc += 2; + } + + really_inline void tape_writer::skip_double() noexcept + { + next_tape_loc += 2; + } + + really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept + { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; + } + + template <typename T> + really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept + { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; + } + + really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept + { + tape_loc = val | ((uint64_t(char(t))) << 56); + } + /* end file src/generic/stage2/tape_writer.h */ + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() \ + { \ + &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue \ } - really_inline char current_char() { - return c; +#define GOTO(address) \ + { \ + goto *(address); \ } - really_inline const uint8_t* current() { - return &buf[idx]; +#define CONTINUE(address) \ + { \ + goto *(address); \ } - really_inline size_t remaining_len() { - return len - idx; +#else // SIMDJSON_USE_COMPUTED_GOTO +#define INIT_ADDRESSES() {'[', 'a', 'e', 'f', '{', 'o'}; +#define GOTO(address) \ + { \ + switch (address) \ + { \ + case '[': \ + goto array_begin; \ + case 'a': \ + goto array_continue; \ + case 'e': \ + goto error; \ + case 'f': \ + goto finish; \ + case '{': \ + goto object_begin; \ + case 'o': \ + goto object_continue; \ + } \ } - template<typename F> - really_inline bool with_space_terminated_copy(const F& f) { - /** - * We need to make a copy to make sure that the string is space terminated. - * This is not about padding the input, which should already padded up - * to len + SIMDJSON_PADDING. However, we have no control at this stage - * on how the padding was done. What if the input string was padded with nulls? - * It is quite common for an input string to have an extra null character (C string). - * We do not want to allow 9\0 (where \0 is the null character) inside a JSON - * document, but the string "9\0" by itself is fine. So we make a copy and - * pad the input with spaces when we know that there is just one input element. - * This copy is relatively expensive, but it will almost never be called in - * practice unless you are in the strange scenario where you have many JSON - * documents made of single atoms. 
- */ - char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - return true; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast<const uint8_t*>(copy), idx); - free(copy); - return result; +// For the more constrained end_xxx() situation +#define CONTINUE(address) \ + { \ + switch (address) \ + { \ + case 'a': \ + goto array_continue; \ + case 'o': \ + goto object_continue; \ + case 'f': \ + goto finish; \ + } \ } - really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; +#endif // SIMDJSON_USE_COMPUTED_GOTO + + struct unified_machine_addresses + { + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; + }; + +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + return addresses.error; \ + } \ } - really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; - } - really_inline size_t next_structural_index() { - return next_structural; - } - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx; // location of the structural character in the input (buf) - uint8_t c; // used to track the (structural) character we are looking at -}; + struct structural_parser : structural_iterator + { + /** Lets you append to the tape */ + tape_writer tape; + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; - uint32_t depth; + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} + { + } - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) + { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. 
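    // Minimal standalone illustration (hypothetical demo, GCC/Clang only) of the
    // computed-goto dispatch the macros above generate when SIMDJSON_USE_COMPUTED_GOTO
    // is defined: a "return address" is simply the address of a label, taken with
    // &&label and resumed with goto*. The portable branch of the macros emulates the
    // same thing with a switch over one-character codes.
    static int computed_goto_demo(bool take_array_path)
    {
      void *continue_state = take_array_path ? &&array_continue : &&object_continue;
      goto *continue_state; // what GOTO(continue_state) expands to above
    array_continue:
      return 1;
    object_continue:
      return 2;
    }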
+ parser.ret_address[depth] = continue_state; + depth++; + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) + { + log_error("Exceeded max depth!"); + } + return exceeded_max_depth; + } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - doc_parser.on_start_document(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) + { + log_start_value("document"); + return start_scope(continue_state); + } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - doc_parser.on_start_object(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) + { + log_start_value("object"); + return start_scope(continue_state); + } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - doc_parser.on_start_array(depth); - doc_parser.ret_address[depth] = continue_state; - depth++; - return depth >= doc_parser.max_depth(); - } + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) + { + log_start_value("array"); + return start_scope(continue_state); + } - really_inline bool end_object() { - depth--; - doc_parser.on_end_object(depth); - return false; - } - really_inline bool end_array() { - depth--; - doc_parser.on_end_array(depth); - return false; - } - really_inline bool end_document() { - depth--; - doc_parser.on_end_document(depth); - return false; - } + // this function is responsible for annotating the start of the scope + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept + { + depth--; + // write our doc->tape location to the header scope + // The root scope gets written *at* the previous location. + tape.append(parser.containing_scope[depth].tape_index, end); + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + // This is a load and an OR. 
It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } - WARN_UNUSED really_inline bool parse_string() { - uint8_t *dst = doc_parser.on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); - if (dst == nullptr) { - return true; - } - return !doc_parser.on_end_string(dst); - } + really_inline uint32_t next_tape_index() + { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + } - WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - return !numberparsing::parse_number(src, found_minus, doc_parser); - } - WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } + really_inline void end_object() + { + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void end_array() + { + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + really_inline void end_document() + { + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + } - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + // increment_count increments the count of keys in an object or values in an array. + // Note that if you are at the level of the values or elements, the count + // must be increment in the preceding depth (depth-1) where the array or + // the object resides. 
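    // Illustrative sketch (hypothetical helpers) of the tape word layout used by
    // tape_writer::append() and end_scope() above: every entry is one 64-bit word with an
    // 8-bit type tag in the top byte and a 56-bit payload below it. For a container start
    // word, the payload combines the tape index just past the matching end word with the
    // element count, saturated at 0xFFFFFF (24 bits).
    #include <cstdint>

    static uint64_t tape_word(uint8_t type_tag, uint64_t payload56)
    {
      return (uint64_t(type_tag) << 56) | (payload56 & 0x00FFFFFFFFFFFFFFULL);
    }

    static uint64_t container_start_payload(uint32_t index_past_end, uint32_t count)
    {
      uint32_t saturated = count > 0xFFFFFF ? 0xFFFFFF : count; // counts of 0xFFFFFF or more are "undetermined"
      return uint64_t(index_past_end) | (uint64_t(saturated) << 32);
    }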
+ really_inline void increment_count() + { + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + } - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_true_atom(); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_false_atom(); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - doc_parser.on_null_atom(); - break; - default: - return true; - } - return false; - } + really_inline uint8_t *on_start_string() noexcept + { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); + } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { - case '"': - FAIL_IF( parse_string() ); - return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); - return continue_state; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( parse_number(false) ); - return continue_state; - case '-': - FAIL_IF( parse_number(true) ); - return continue_state; - case '{': - FAIL_IF( start_object(continue_state) ); - return addresses.object_begin; - case '[': - FAIL_IF( start_array(continue_state) ); - return addresses.array_begin; - default: - return addresses.error; - } - } + really_inline void on_end_string(uint8_t *dst) noexcept + { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; + } - WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } + WARN_UNUSED really_inline bool parse_string(bool key = false) + { + log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(); + dst = stringparsing::parse_string(current(), dst); + if (dst == nullptr) + { + log_error("Invalid escape in string"); + return true; + } + on_end_string(dst); + return false; + } - return doc_parser.on_success(SUCCESS); - } + WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) + { + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) + { + log_error("Invalid number"); + } + return !succeeded; + } + WARN_UNUSED really_inline bool parse_number(bool found_minus) + { + return parse_number(current(), found_minus); + } - WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), + really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) + { + /** + * We need to make a copy to make sure that the string is space terminated. + * This is not about padding the input, which should already padded up + * to len + SIMDJSON_PADDING. However, we have no control at this stage + * on how the padding was done. What if the input string was padded with nulls? + * It is quite common for an input string to have an extra null character (C string). + * We do not want to allow 9\0 (where \0 is the null character) inside a JSON + * document, but the string "9\0" by itself is fine. So we make a copy and + * pad the input with spaces when we know that there is just one input element. + * This copy is relatively expensive, but it will almost never be called in + * practice unless you are in the strange scenario where you have many JSON + * documents made of single atoms. + */ + uint8_t *copy = static_cast<uint8_t *>(malloc(parser.len + SIMDJSON_PADDING)); + if (copy == nullptr) + { + return true; + } + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + size_t idx = *current_structural; + bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw + free(copy); + return result; + } + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) + { + switch (advance_char()) + { + case '"': + FAIL_IF(parse_string()); + return continue_state; + case 't': + log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(current())); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(current())); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(current())); + tape.append(0, internal::tape_type::NULL_VALUE); + return continue_state; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + FAIL_IF(parse_number(false)); + return continue_state; + case '-': + FAIL_IF(parse_number(true)); + return continue_state; + case '{': + FAIL_IF(start_object(continue_state)); + return addresses.object_begin; + case '[': + FAIL_IF(start_array(continue_state)); + return addresses.array_begin; + default: + log_error("Non-value found when value was expected!"); + return addresses.error; + } + } + + WARN_UNUSED really_inline error_code finish() + { + end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + + if (depth != 0) + { + log_error("Unclosed objects or 
arrays!"); + return parser.error = TAPE_ERROR; + } + + return SUCCESS; + } + + WARN_UNUSED really_inline error_code error() + { + /* We do not need the next line because this is done by parser.init_stage2(), * pessimistically. - * doc_parser.is_valid = false; + * parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific * location. * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return doc_parser.on_error(DEPTH_ERROR); - } - switch (structurals.current_char()) { - case '"': - return doc_parser.on_error(STRING_ERROR); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - return doc_parser.on_error(NUMBER_ERROR); - case 't': - return doc_parser.on_error(T_ATOM_ERROR); - case 'n': - return doc_parser.on_error(N_ATOM_ERROR); - case 'f': - return doc_parser.on_error(F_ATOM_ERROR); - default: - return doc_parser.on_error(TAPE_ERROR); - } - } + if (depth >= parser.max_depth()) + { + return parser.error = DEPTH_ERROR; + } + switch (current_char()) + { + case '"': + return parser.error = STRING_ERROR; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return parser.error = NUMBER_ERROR; + case 't': + return parser.error = T_ATOM_ERROR; + case 'n': + return parser.error = N_ATOM_ERROR; + case 'f': + return parser.error = F_ATOM_ERROR; + default: + return parser.error = TAPE_ERROR; + } + } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - doc_parser.init_stage2(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; - } - // Advance to the first character as soon as possible - structurals.advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_state)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + really_inline void init() + { + log_start(); + parser.error = UNINITIALIZED; + } - really_inline char advance_char() { - return structurals.advance_char(); - } -}; + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) + { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) + { + return parser.error = EMPTY; + } -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + init(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_state)) + { + return parser.error = DEPTH_ERROR; + } + return SUCCESS; + } -} // namespace stage2 + really_inline void log_value(const char *type) + { + logger::log_line(*this, "", type, ""); + } -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. 
- ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } + static really_inline void log_start() + { + logger::log_start(); + } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + really_inline void log_start_value(const char *type) + { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) + { + logger::log_depth++; + } + } -// -// Object parser states -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_state; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + really_inline void log_end_value(const char *type) + { + if (logger::LOG_ENABLED) + { + logger::log_depth--; + } + logger::log_line(*this, "-", type, ""); + } -object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + really_inline void log_error(const char *error) + { + logger::log_line(*this, "", "ERROR", error); + } + }; // struct structural_parser -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) \ + { \ + if (EXPR) \ + { \ + goto error; \ + } \ } -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + template <bool STREAMING> + WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept + { + dom_parser.doc = &doc; + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::structural_parser parser(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); + if (result) + { + return result; + } -// -// Array parser states -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } + // + // Read first value + // + switch (parser.current_char()) + { + case '{': + FAIL_IF(parser.start_object(addresses.finish)); + goto object_begin; + case '[': + FAIL_IF(parser.start_array(addresses.finish)); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) + { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') + { + goto error; + } + } + goto array_begin; + case '"': + FAIL_IF(parser.parse_string()); + goto finish; + case 't': + parser.log_value("true"); + FAIL_IF(!atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF(!atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF(!atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len())); + parser.tape.append(0, internal::tape_type::NULL_VALUE); + goto finish; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). + { + if (parser.parse_number_with_space_terminated_copy(false)) + { + goto error; + } + } + goto finish; + case '-': + // Next line used to be an interesting functional programming exercise with + // a lambda that gets passed to another function via a closure. This would confuse the + // clangcl compiler under Visual Studio 2019 (recent release). 
+ { + if (parser.parse_number_with_space_terminated_copy(true)) + { + goto error; + } + } + goto finish; + default: + parser.log_error("Document starts with a non-value character"); + goto error; + } -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); + // + // Object parser states + // + object_begin: + switch (parser.advance_char()) + { + case '"': + { + parser.increment_count(); + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + } + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("Object does not start with a key"); + goto error; + } -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + object_key_state: + if (parser.advance_char() != ':') + { + parser.log_error("Missing colon after key in object"); + goto error; + } + GOTO(parser.parse_value(addresses, addresses.object_continue)); -finish: - return parser.finish(); + object_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + if (parser.advance_char() != '"') + { + parser.log_error("Key string missing at beginning of field in object"); + goto error; + } + FAIL_IF(parser.parse_string(true)); + goto object_key_state; + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("No comma between object fields"); + goto error; + } -error: - return parser.error(); -} + scope_end: + CONTINUE(parser.parser.ret_address[parser.depth]); -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2_build_tape.h */ -/* begin file src/generic/stage2_streaming_build_tape.h */ -namespace stage2 { + // + // Array parser states + // + array_begin: + if (parser.peek_next_char() == ']') + { + parser.advance_char(); + parser.end_array(); + goto scope_end; + } + parser.increment_count(); -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {} + main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO(parser.parse_value(addresses, addresses.array_continue)); - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - doc_parser.init_stage2(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. 
- // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return doc_parser.on_error(DEPTH_ERROR); - } - return SUCCESS; - } + array_continue: + switch (parser.advance_char()) + { + case ',': + parser.increment_count(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + parser.log_error("Missing comma between array values"); + goto error; + } - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return doc_parser.on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope_offset[depth] != 0) { - return doc_parser.on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; + finish: + return parser.finish(); -} // namespace stage2 + error: + return parser.error(); + } -/************ + } // namespace + } // namespace stage2 + + /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) { - return parser.parse_number(&copy[idx], true); - }) - ); - goto finish; - default: - goto error; - } + WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept + { + error_code result = stage2::parse_structurals<false>(*this, _doc); + if (result) + { + return result; + } -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + // If we didn't make it to the end, it's an error + if (next_structural_index != n_structural_indexes) + { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; + } -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); + return SUCCESS; + } -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' 
); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } + /************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ + WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept + { + return stage2::parse_structurals<true>(*this, _doc); + } + /* end file src/generic/stage2/tape_writer.h */ -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept + { + error_code err = stage1(_buf, _len, false); + if (err) + { + return err; + } + return stage2(_doc); + } -// -// Array parser parsers -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } - -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); - -array_continue: - switch (parser.advance_char()) { - case ',': - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } - -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); - -error: - return parser.error(); -} -/* end file src/generic/stage2_streaming_build_tape.h */ - -} // namespace simdjson::westmere + } // namespace westmere +} // namespace simdjson UNTARGET_REGION -#endif // SIMDJSON_WESTMERE_STAGE2_BUILD_TAPE_H -/* end file src/generic/stage2_streaming_build_tape.h */ +/* end file src/generic/stage2/tape_writer.h */ #endif -/* end file src/generic/stage2_streaming_build_tape.h */ -/* end file src/generic/stage2_streaming_build_tape.h */ + +SIMDJSON_POP_DISABLE_WARNINGS +/* end file src/generic/stage2/tape_writer.h */ \ No newline at end of file
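
The comment above parse_number_with_space_terminated_copy() in this diff explains why stage 2 copies the input and pads the copy with spaces before parsing a number that ends the document: the caller's padding bytes may be '\0', and "9\0" must not be accepted as a number inside a document while "9" on its own is fine. The standalone sketch below only illustrates that copy-and-pad idea under stated assumptions; the names PADDING and parse_trailing_number are illustrative stand-ins and are not simdjson's API.

/* begin editor sketch (illustrative only, not part of simdjson) */
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cstdio>

static const size_t PADDING = 32; // stand-in for SIMDJSON_PADDING

// Parse digits starting at p; stops at the first non-digit byte.
// Because the copy is padded with spaces, the scan is guaranteed to stop
// on a known terminator even when the number is the last token in the input.
static bool parse_trailing_number(const uint8_t *p, int64_t *out) {
  int64_t value = 0;
  bool saw_digit = false;
  while (*p >= '0' && *p <= '9') {
    value = value * 10 + (*p - '0');
    saw_digit = true;
    p++;
  }
  *out = value;
  return saw_digit && *p == ' '; // a space terminator, never a stray '\0'
}

int main() {
  const char *json = "9"; // a document made of a single numeric atom
  size_t len = strlen(json);

  // Copy and pad with spaces rather than trusting whatever bytes
  // (possibly null characters) happen to follow the caller's buffer.
  uint8_t *copy = (uint8_t *)malloc(len + PADDING);
  if (copy == NULL) return 1;
  memcpy(copy, json, len);
  memset(copy + len, ' ', PADDING);

  int64_t value = 0;
  bool ok = parse_trailing_number(copy, &value);
  printf("ok=%d value=%lld\n", ok ? 1 : 0, (long long)value);
  free(copy);
  return ok ? 0 : 1;
}
/* end editor sketch */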