math.h in nmatrix-0.0.3

- old
+ new

@@ -68,11 +68,14 @@
  * Standard Includes
  */
 
 extern "C" { // These need to be in an extern "C" block or you'll get all kinds of undefined symbol errors.
   #include <cblas.h>
-  //#include <clapack.h>
+
+  #ifdef HAVE_CLAPACK_H
+    #include <clapack.h>
+  #endif
 }
 
 #include <algorithm> // std::min, std::max
 #include <limits> // std::numeric_limits
 
@@ -83,21 +86,22 @@
 #include "lapack.h"
 
 /*
  * Macros
  */
+#define REAL_RECURSE_LIMIT 4
 
 /*
  * Data
  */
 
 
 extern "C" {
   /*
    * C accessors.
    */
-  void nm_math_det_exact(const int M, const void* elements, const int lda, dtype_t dtype, void* result);
+  void nm_math_det_exact(const int M, const void* elements, const int lda, nm::dtype_t dtype, void* result);
   void nm_math_transpose_generic(const size_t M, const size_t N, const void* A, const int lda, void* B, const int ldb, size_t element_size);
   void nm_math_init_blas(void);
 }
 
 
@@ -141,17 +145,23 @@
 /*
  * This version of trsm doesn't do any error checks and only works on column-major matrices.
  *
  * For row major, call trsm<DType> instead. That will handle necessary changes-of-variables
  * and parameter checks.
+ *
+ * Note that some of the boundary conditions here may be incorrect. Very little has been tested!
+ * This was converted directly from dtrsm.f using f2c, and then rewritten more cleanly.
  */
 template <typename DType>
 inline void trsm_nothrow(const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                          const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                          const int m, const int n, const DType alpha, const DType* a,
                          const int lda, DType* b, const int ldb)
 {
+
+  // (row-major) trsm: left upper trans nonunit m=3 n=1 1/1 a 3 b 3
+
   if (m == 0 || n == 0) return; /* Quick return if possible. */
 
   if (alpha == 0) { // Handle alpha == 0
     for (int j = 0; j < n; ++j) {
       for (int i = 0; i < m; ++i) {
@@ -208,11 +218,11 @@
       /*           Form  B := alpha*inv( A**T )*B. */
 	    if (uplo == CblasUpper) {
     		for (int j = 0; j < n; ++j) {
 		      for (int i = 0; i < m; ++i) {
 			      DType temp = alpha * b[i + j * ldb];
-            for (int k = 0; k < i-1; ++k) {
+            for (int k = 0; k < i; ++k) { // limit was i-1. Lots of similar bugs in this code, probably.
               temp -= a[k + i * lda] * b[k + j * ldb];
       			}
 			      if (diag == CblasNonUnit) {
 			        temp /= a[i + i * lda];
 			      }
@@ -337,20 +347,109 @@
 	  }
   }
 }
 
 
+template <typename DType>
+inline void syrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const DType* alpha, const DType* A, const int lda, const DType* beta, DType* C, const int ldc) {
+  rb_raise(rb_eNotImpError, "syrk not yet implemented for non-BLAS dtypes");
+}
+
+template <typename DType>
+inline void herk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const DType* alpha, const DType* A, const int lda, const DType* beta, DType* C, const int ldc) {
+  rb_raise(rb_eNotImpError, "herk not yet implemented for non-BLAS dtypes");
+}
+
+template <>
+inline void syrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const float* alpha, const float* A, const int lda, const float* beta, float* C, const int ldc) {
+  cblas_ssyrk(Order, Uplo, Trans, N, K, *alpha, A, lda, *beta, C, ldc);
+}
+
+template <>
+inline void syrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const double* alpha, const double* A, const int lda, const double* beta, double* C, const int ldc) {
+  cblas_dsyrk(Order, Uplo, Trans, N, K, *alpha, A, lda, *beta, C, ldc);
+}
+
+template <>
+inline void syrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const Complex64* alpha, const Complex64* A, const int lda, const Complex64* beta, Complex64* C, const int ldc) {
+  cblas_csyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
+}
+
+template <>
+inline void syrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const Complex128* alpha, const Complex128* A, const int lda, const Complex128* beta, Complex128* C, const int ldc) {
+  cblas_zsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
+}
+
+
+template <>
+inline void herk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const Complex64* alpha, const Complex64* A, const int lda, const Complex64* beta, Complex64* C, const int ldc) {
+  cblas_cherk(Order, Uplo, Trans, N, K, alpha->r, A, lda, beta->r, C, ldc);
+}
+
+template <>
+inline void herk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N,
+                 const int K, const Complex128* alpha, const Complex128* A, const int lda, const Complex128* beta, Complex128* C, const int ldc) {
+  cblas_zherk(Order, Uplo, Trans, N, K, alpha->r, A, lda, beta->r, C, ldc);
+}
+
+
+template <typename DType>
+inline void trmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
+                 const enum CBLAS_TRANSPOSE ta, const enum CBLAS_DIAG diag, const int m, const int n, const DType* alpha,
+                 const DType* A, const int lda, DType* B, const int ldb) {
+  rb_raise(rb_eNotImpError, "trmm not yet implemented for non-BLAS dtypes");
+}
+
+template <>
+inline void trmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
+                 const enum CBLAS_TRANSPOSE ta, const enum CBLAS_DIAG diag, const int m, const int n, const float* alpha,
+                 const float* A, const int lda, float* B, const int ldb) {
+  cblas_strmm(order, side, uplo, ta, diag, m, n, *alpha, A, lda, B, ldb);
+}
+
+template <>
+inline void trmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
+                 const enum CBLAS_TRANSPOSE ta, const enum CBLAS_DIAG diag, const int m, const int n, const double* alpha,
+                 const double* A, const int lda, double* B, const int ldb) {
+  cblas_dtrmm(order, side, uplo, ta, diag, m, n, *alpha, A, lda, B, ldb);
+}
+
+template <>
+inline void trmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
+                 const enum CBLAS_TRANSPOSE ta, const enum CBLAS_DIAG diag, const int m, const int n, const Complex64* alpha,
+                 const Complex64* A, const int lda, Complex64* B, const int ldb) {
+  cblas_ctrmm(order, side, uplo, ta, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+template <>
+inline void trmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
+                 const enum CBLAS_TRANSPOSE ta, const enum CBLAS_DIAG diag, const int m, const int n, const Complex128* alpha,
+                 const Complex128* A, const int lda, Complex128* B, const int ldb) {
+  cblas_ztrmm(order, side, uplo, ta, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+
 /*
  * BLAS' DTRSM function, generalized.
  */
 template <typename DType, typename = typename std::enable_if<!std::is_integral<DType>::value>::type>
 inline void trsm(const enum CBLAS_ORDER order,
                  const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                  const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                  const int m, const int n, const DType alpha, const DType* a,
                  const int lda, DType* b, const int ldb)
 {
+  /*using std::cerr;
+  using std::endl;*/
+
   int                     num_rows_a = n;
   if (side == CblasLeft)  num_rows_a = m;
 
   if (lda < std::max(1,num_rows_a)) {
     fprintf(stderr, "TRSM: num_rows_a = %d; got lda=%d\n", num_rows_a, lda);
@@ -366,19 +465,32 @@
 
     // For row major, need to switch side and uplo
     enum CBLAS_SIDE side_ = side == CblasLeft  ? CblasRight : CblasLeft;
     enum CBLAS_UPLO uplo_ = uplo == CblasUpper ? CblasLower : CblasUpper;
 
+/*
+    cerr << "(row-major) trsm: " << (side_ == CblasLeft ? "left " : "right ")
+         << (uplo_ == CblasUpper ? "upper " : "lower ")
+         << (trans_a == CblasTrans ? "trans " : "notrans ")
+         << (diag == CblasNonUnit ? "nonunit " : "unit ")
+         << n << " " << m << " " << alpha << " a " << lda << " b " << ldb << endl;
+*/
     trsm_nothrow<DType>(side_, uplo_, trans_a, diag, n, m, alpha, a, lda, b, ldb);
 
   } else { // CblasColMajor
 
     if (ldb < std::max(1,m)) {
       fprintf(stderr, "TRSM: M=%d; got ldb=%d\n", m, ldb);
       rb_raise(rb_eArgError, "TRSM: Expected ldb >= max(1,M)");
     }
-
+/*
+    cerr << "(col-major) trsm: " << (side == CblasLeft ? "left " : "right ")
+         << (uplo == CblasUpper ? "upper " : "lower ")
+         << (trans_a == CblasTrans ? "trans " : "notrans ")
+         << (diag == CblasNonUnit ? "nonunit " : "unit ")
+         << m << " " << n << " " << alpha << " a " << lda << " b " << ldb << endl;
+*/
     trsm_nothrow<DType>(side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
 
   }
 
 }
@@ -388,50 +500,58 @@
 inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                  const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                  const int m, const int n, const float alpha, const float* a,
                  const int lda, float* b, const int ldb)
 {
-  cblas_strsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
+  cblas_strsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 template <>
 inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                  const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                  const int m, const int n, const double alpha, const double* a,
                  const int lda, double* b, const int ldb)
 {
-  cblas_dtrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
+/*  using std::cerr;
+  using std::endl;
+  cerr << "(row-major) dtrsm: " << (side == CblasLeft ? "left " : "right ")
+       << (uplo == CblasUpper ? "upper " : "lower ")
+       << (trans_a == CblasTrans ? "trans " : "notrans ")
+       << (diag == CblasNonUnit ? "nonunit " : "unit ")
+       << m << " " << n << " " << alpha << " a " << lda << " b " << ldb << endl;
+*/
+  cblas_dtrsm(order, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 
 template <>
 inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                  const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                  const int m, const int n, const Complex64 alpha, const Complex64* a,
                  const int lda, Complex64* b, const int ldb)
 {
-  cblas_ctrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
+  cblas_ctrsm(order, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
 }
 
 template <>
 inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
                  const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
                  const int m, const int n, const Complex128 alpha, const Complex128* a,
                  const int lda, Complex128* b, const int ldb)
 {
-  cblas_ztrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
+  cblas_ztrsm(order, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
 }
 
 
 /*
  * ATLAS function which performs row interchanges on a general rectangular matrix. Modeled after the LAPACK LASWP function.
  *
  * This version is templated for use by template <> getrf().
  */
 template <typename DType>
 inline void laswp(const int N, DType* A, const int lda, const int K1, const int K2, const int *piv, const int inci) {
-  const int n = K2 - K1;
+  //const int n = K2 - K1; // not sure why this is declared. commented it out because it's unused.
 
   int nb = N >> 5;
 
   const int mr = N - (nb<<5);
   const int incA = lda << 5;
@@ -1259,12 +1379,93 @@
 
   }
   return(ierr);
 }
 
+/*
+ * Solves a system of linear equations A*X = B with a general NxN matrix A using the LU factorization computed by GETRF.
+ *
+ * From ATLAS 3.8.0.
+ */
+template <typename DType>
+int getrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, const int N, const int NRHS, const DType* A,
+           const int lda, const int* ipiv, DType* B, const int ldb)
+{
+  // enum CBLAS_DIAG Lunit, Uunit; // These aren't used. Not sure why they're declared in ATLAS' src.
 
+  if (!N || !NRHS) return 0;
+
+  const DType ONE = 1;
+
+  if (Order == CblasColMajor) {
+    if (Trans == CblasNoTrans) {
+      nm::math::laswp<DType>(NRHS, B, ldb, 0, N, ipiv, 1);
+      nm::math::trsm<DType>(Order, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, N, NRHS, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+    } else {
+      nm::math::trsm<DType>(Order, CblasLeft, CblasUpper, Trans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasLeft, CblasLower, Trans, CblasUnit, N, NRHS, ONE, A, lda, B, ldb);
+      nm::math::laswp<DType>(NRHS, B, ldb, 0, N, ipiv, -1);
+    }
+  } else {
+    if (Trans == CblasNoTrans) {
+      nm::math::trsm<DType>(Order, CblasRight, CblasLower, CblasTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasRight, CblasUpper, CblasTrans, CblasUnit, NRHS, N, ONE, A, lda, B, ldb);
+      nm::math::laswp<DType>(NRHS, B, ldb, 0, N, ipiv, -1);
+    } else {
+      nm::math::laswp<DType>(NRHS, B, ldb, 0, N, ipiv, 1);
+      nm::math::trsm<DType>(Order, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, NRHS, N, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+    }
+  }
+  return 0;
+}
+
+
 /*
+ * Solves a system of linear equations A*X = B with a symmetric positive definite matrix A using the Cholesky factorization computed by POTRF.
+ *
+ * From ATLAS 3.8.0.
+ */
+template <typename DType, bool is_complex>
+int potrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const int NRHS, const DType* A,
+           const int lda, DType* B, const int ldb)
+{
+  // enum CBLAS_DIAG Lunit, Uunit; // These aren't used. Not sure why they're declared in ATLAS' src.
+
+  CBLAS_TRANSPOSE MyTrans = is_complex ? CblasConjTrans : CblasTrans;
+
+  if (!N || !NRHS) return 0;
+
+  const DType ONE = 1;
+
+  if (Order == CblasColMajor) {
+    if (Uplo == CblasUpper) {
+      nm::math::trsm<DType>(Order, CblasLeft, CblasUpper, MyTrans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+    } else {
+      nm::math::trsm<DType>(Order, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasLeft, CblasLower, MyTrans, CblasNonUnit, N, NRHS, ONE, A, lda, B, ldb);
+    }
+  } else {
+    // There's some kind of scaling operation that normally happens here in ATLAS. Not sure what it does, so we'll only
+    // worry if something breaks. It probably has to do with their non-templated code and doesn't apply to us.
+
+    if (Uplo == CblasUpper) {
+      nm::math::trsm<DType>(Order, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasRight, CblasUpper, MyTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+    } else {
+      nm::math::trsm<DType>(Order, CblasRight, CblasLower, MyTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+      nm::math::trsm<DType>(Order, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, NRHS, N, ONE, A, lda, B, ldb);
+    }
+  }
+  return 0;
+}
+
+
+
+/*
  * From ATLAS 3.8.0:
  *
  * Computes one of two LU factorizations based on the setting of the Order
  * parameter, as follows:
  * ----------------------------------------------------------------------------
@@ -1315,13 +1516,624 @@
     //rb_raise(rb_eNotImpError, "column major getrf not implemented");
   }
 }
 
 
+/*
+ * From ATLAS 3.8.0:
+ *
+ * Computes one of two LU factorizations based on the setting of the Order
+ * parameter, as follows:
+ * ----------------------------------------------------------------------------
+ *                       Order == CblasColMajor
+ * Column-major factorization of form
+ *   A = P * L * U
+ * where P is a row-permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapazoidal if M > N), and U is upper triangular
+ * (upper trapazoidal if M < N).
+ *
+ * ----------------------------------------------------------------------------
+ *                       Order == CblasRowMajor
+ * Row-major factorization of form
+ *   A = P * L * U
+ * where P is a column-permutation matrix, L is lower triangular (lower
+ * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper
+ * trapazoidal if M < N).
+ *
+ * ============================================================================
+ * Let IERR be the return value of the function:
+ *    If IERR == 0, successful exit.
+ *    If (IERR < 0) the -IERR argument had an illegal value
+ *    If (IERR > 0 && Order == CblasColMajor)
+ *       U(i-1,i-1) is exactly zero.  The factorization has been completed,
+ *       but the factor U is exactly singular, and division by zero will
+ *       occur if it is used to solve a system of equations.
+ *    If (IERR > 0 && Order == CblasRowMajor)
+ *       L(i-1,i-1) is exactly zero.  The factorization has been completed,
+ *       but the factor L is exactly singular, and division by zero will
+ *       occur if it is used to solve a system of equations.
+ */
+template <typename DType>
+inline int potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, DType* A, const int lda) {
+#ifdef HAVE_CLAPACK_H
+  rb_raise(rb_eNotImpError, "not yet implemented for non-BLAS dtypes");
+#else
+  rb_raise(rb_eNotImpError, "only LAPACK version implemented thus far");
+#endif
+  return 0;
+}
 
+#ifdef HAVE_CLAPACK_H
+template <>
+inline int potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, float* A, const int lda) {
+  return clapack_spotrf(order, uplo, N, A, lda);
+}
 
+template <>
+inline int potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, double* A, const int lda) {
+  return clapack_dpotrf(order, uplo, N, A, lda);
+}
+
+template <>
+inline int potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, Complex64* A, const int lda) {
+  return clapack_cpotrf(order, uplo, N, reinterpret_cast<void*>(A), lda);
+}
+
+template <>
+inline int potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, Complex128* A, const int lda) {
+  return clapack_zpotrf(order, uplo, N, reinterpret_cast<void*>(A), lda);
+}
+#endif
+
+
+// This is the old BLAS version of this function. ATLAS has an optimized version, but
+// it's going to be tough to translate.
+template <typename DType>
+static void swap(const int N, DType* X, const int incX, DType* Y, const int incY) {
+  if (N > 0) {
+    int ix = 0, iy = 0;
+    for (int i = 0; i < N; ++i) {
+      DType temp = X[i];
+      X[i]       = Y[i];
+      Y[i]       = temp;
+
+      ix += incX;
+      iy += incY;
+    }
+  }
+}
+
+
+// Copies an upper row-major array from U, zeroing U; U is unit, so diagonal is not copied.
+//
+// From ATLAS 3.8.0.
+template <typename DType>
+static inline void trcpzeroU(const int M, const int N, DType* U, const int ldu, DType* C, const int ldc) {
+
+  for (int i = 0; i != M; ++i) {
+    for (int j = i+1; j < N; ++j) {
+      C[j] = U[j];
+      U[j] = 0;
+    }
+
+    C += ldc;
+    U += ldu;
+  }
+}
+
+
 /*
+ * Un-comment the following lines when we figure out how to calculate NB for each of the ATLAS-derived
+ * functions. This is probably really complicated.
+ *
+ * Also needed: ATL_MulByNB, ATL_DivByNB (both defined in the build process for ATLAS), and ATL_mmMU.
+ *
+ */
+
+/*
+
+template <bool RowMajor, bool Upper, typename DType>
+static int trtri_4(const enum CBLAS_DIAG Diag, DType* A, const int lda) {
+
+  if (RowMajor) {
+    DType *pA0 = A, *pA1 = A+lda, *pA2 = A+2*lda, *pA3 = A+3*lda;
+    DType tmp;
+    if (Upper) {
+      DType A01 = pA0[1], A02 = pA0[2], A03 = pA0[3],
+                          A12 = pA1[2], A13 = pA1[3],
+                                        A23 = pA2[3];
+
+      if (Diag == CblasNonUnit) {
+        pA0->inverse();
+        (pA1+1)->inverse();
+        (pA2+2)->inverse();
+        (pA3+3)->inverse();
+
+        pA0[1] = -A01 * pA1[1] * pA0[0];
+        pA1[2] = -A12 * pA2[2] * pA1[1];
+        pA2[3] = -A23 * pA3[3] * pA2[2];
+
+        pA0[2] = -(A01 * pA1[2] + A02 * pA2[2]) * pA0[0];
+        pA1[3] = -(A12 * pA2[3] + A13 * pA3[3]) * pA1[1];
+
+        pA0[3] = -(A01 * pA1[3] + A02 * pA2[3] + A03 * pA3[3]) * pA0[0];
+
+      } else {
+
+        pA0[1] = -A01;
+        pA1[2] = -A12;
+        pA2[3] = -A23;
+
+        pA0[2] = -(A01 * pA1[2] + A02);
+        pA1[3] = -(A12 * pA2[3] + A13);
+
+        pA0[3] = -(A01 * pA1[3] + A02 * pA2[3] + A03);
+      }
+
+    } else { // Lower
+      DType A10 = pA1[0],
+            A20 = pA2[0], A21 = pA2[1],
+            A30 = PA3[0], A31 = pA3[1], A32 = pA3[2];
+      DType *B10 = pA1,
+            *B20 = pA2,
+            *B30 = pA3,
+            *B21 = pA2+1,
+            *B31 = pA3+1,
+            *B32 = pA3+2;
+
+
+      if (Diag == CblasNonUnit) {
+        pA0->inverse();
+        (pA1+1)->inverse();
+        (pA2+2)->inverse();
+        (pA3+3)->inverse();
+
+        *B10 = -A10 * pA0[0] * pA1[1];
+        *B21 = -A21 * pA1[1] * pA2[2];
+        *B32 = -A32 * pA2[2] * pA3[3];
+        *B20 = -(A20 * pA0[0] + A21 * (*B10)) * pA2[2];
+        *B31 = -(A31 * pA1[1] + A32 * (*B21)) * pA3[3];
+        *B30 = -(A30 * pA0[0] + A31 * (*B10) + A32 * (*B20)) * pA3;
+      } else {
+        *B10 = -A10;
+        *B21 = -A21;
+        *B32 = -A32;
+        *B20 = -(A20 + A21 * (*B10));
+        *B31 = -(A31 + A32 * (*B21));
+        *B30 = -(A30 + A31 * (*B10) + A32 * (*B20));
+      }
+    }
+
+  } else {
+    rb_raise(rb_eNotImpError, "only row-major implemented at this time");
+  }
+
+  return 0;
+
+}
+
+
+template <bool RowMajor, bool Upper, typename DType>
+static int trtri_3(const enum CBLAS_DIAG Diag, DType* A, const int lda) {
+
+  if (RowMajor) {
+
+    DType tmp;
+
+    if (Upper) {
+      DType A01 = pA0[1], A02 = pA0[2], A03 = pA0[3],
+                          A12 = pA1[2], A13 = pA1[3];
+
+      DType *B01 = pA0 + 1,
+            *B02 = pA0 + 2,
+            *B12 = pA1 + 2;
+
+      if (Diag == CblasNonUnit) {
+        pA0->inverse();
+        (pA1+1)->inverse();
+        (pA2+2)->inverse();
+
+        *B01 = -A01 * pA1[1] * pA0[0];
+        *B12 = -A12 * pA2[2] * pA1[1];
+        *B02 = -(A01 * (*B12) + A02 * pA2[2]) * pA0[0];
+      } else {
+        *B01 = -A01;
+        *B12 = -A12;
+        *B02 = -(A01 * (*B12) + A02);
+      }
+
+    } else { // Lower
+      DType *pA0=A, *pA1=A+lda, *pA2=A+2*lda;
+      DType A10=pA1[0],
+            A20=pA2[0], A21=pA2[1];
+
+      DType *B10 = pA1,
+            *B20 = pA2;
+            *B21 = pA2+1;
+
+      if (Diag == CblasNonUnit) {
+        pA0->inverse();
+        (pA1+1)->inverse();
+        (pA2+2)->inverse();
+        *B10 = -A10 * pA0[0] * pA1[1];
+        *B21 = -A21 * pA1[1] * pA2[2];
+        *B20 = -(A20 * pA0[0] + A21 * (*B10)) * pA2[2];
+      } else {
+        *B10 = -A10;
+        *B21 = -A21;
+        *B20 = -(A20 + A21 * (*B10));
+      }
+    }
+
+
+  } else {
+    rb_raise(rb_eNotImpError, "only row-major implemented at this time");
+  }
+
+  return 0;
+
+}
+
+template <bool RowMajor, bool Upper, bool Real, typename DType>
+static void trtri(const enum CBLAS_DIAG Diag, const int N, DType* A, const int lda) {
+  DType *Age, *Atr;
+  DType tmp;
+  int Nleft, Nright;
+
+  int ierr = 0;
+
+  static const DType ONE = 1;
+  static const DType MONE -1;
+  static const DType NONE = -1;
+
+  if (RowMajor) {
+
+    // FIXME: Use REAL_RECURSE_LIMIT here for float32 and float64 (instead of 1)
+    if ((Real && N > REAL_RECURSE_LIMIT) || (N > 1)) {
+      Nleft = N >> 1;
+#ifdef NB
+      if (Nleft > NB) NLeft = ATL_MulByNB(ATL_DivByNB(Nleft));
+#endif
+
+      Nright = N - Nleft;
+
+      if (Upper) {
+        Age = A + Nleft;
+        Atr = A + (Nleft * (lda+1));
+
+        nm::math::trsm<DType>(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, Diag,
+                              Nleft, Nright, ONE, Atr, lda, Age, lda);
+
+        nm::math::trsm<DType>(CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, Diag,
+                              Nleft, Nright, MONE, A, lda, Age, lda);
+
+      } else { // Lower
+        Age = A + ((Nleft*lda));
+        Atr = A + (Nleft * (lda+1));
+
+        nm::math::trsm<DType>(CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, Diag,
+                              Nright, Nleft, ONE, A, lda, Age, lda);
+        nm::math::trsm<DType>(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, Diag,
+                              Nright, Nleft, MONE, Atr, lda, Age, lda);
+      }
+
+      ierr = trtri<RowMajor,Upper,Real,DType>(Diag, Nleft, A, lda);
+      if (ierr) return ierr;
+
+      ierr = trtri<RowMajor,Upper,Real,DType>(Diag, Nright, Atr, lda);
+      if (ierr) return ierr + Nleft;
+
+    } else {
+      if (Real) {
+        if (N == 4) {
+          return trtri_4<RowMajor,Upper,Real,DType>(Diag, A, lda);
+        } else if (N == 3) {
+          return trtri_3<RowMajor,Upper,Real,DType>(Diag, A, lda);
+        } else if (N == 2) {
+          if (Diag == CblasNonUnit) {
+            A->inverse();
+            (A+(lda+1))->inverse();
+
+            if (Upper) {
+              *(A+1)     *=   *A;         // TRI_MUL
+              *(A+1)     *=   *(A+lda+1); // TRI_MUL
+            } else {
+              *(A+lda)   *=   *A;         // TRI_MUL
+              *(A+lda)   *=   *(A+lda+1); // TRI_MUL
+            }
+          }
+
+          if (Upper) *(A+1)   = -*(A+1);      // TRI_NEG
+          else       *(A+lda) = -*(A+lda);    // TRI_NEG
+        } else if (Diag == CblasNonUnit) A->inverse();
+      } else { // not real
+        if (Diag == CblasNonUnit) A->inverse();
+      }
+    }
+
+  } else {
+    rb_raise(rb_eNotImpError, "only row-major implemented at this time");
+  }
+
+  return ierr;
+}
+
+
+template <bool RowMajor, bool Real, typename DType>
+int getri(const int N, DType* A, const int lda, const int* ipiv, DType* wrk, const int lwrk) {
+
+  if (!RowMajor) rb_raise(rb_eNotImpError, "only row-major implemented at this time");
+
+  int jb, nb, I, ndown, iret;
+
+  const DType ONE = 1, NONE = -1;
+
+  int iret = trtri<RowMajor,false,Real,DType>(CblasNonUnit, N, A, lda);
+  if (!iret && N > 1) {
+    jb = lwrk / N;
+    if (jb >= NB) nb = ATL_MulByNB(ATL_DivByNB(jb));
+    else if (jb >= ATL_mmMU) nb = (jb/ATL_mmMU)*ATL_mmMU;
+    else nb = jb;
+    if (!nb) return -6; // need at least 1 row of workspace
+
+    // only first iteration will have partial block, unroll it
+
+    jb = N - (N/nb) * nb;
+    if (!jb) jb = nb;
+    I = N - jb;
+    A += lda * I;
+    trcpzeroU<DType>(jb, jb, A+I, lda, wrk, jb);
+    nm::math::trsm<DType>(CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasUnit,
+                          jb, N, ONE, wrk, jb, A, lda);
+
+    if (I) {
+      do {
+        I -= nb;
+        A -= nb * lda;
+        ndown = N-I;
+        trcpzeroU<DType>(nb, ndown, A+I, lda, wrk, ndown);
+        nm::math::gemm<DType>(CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasUnit,
+                              nb, N, ONE, wrk, ndown, A, lda);
+      } while (I);
+    }
+
+    // Apply row interchanges
+
+    for (I = N - 2; I >= 0; --I) {
+      jb = ipiv[I];
+      if (jb != I) nm::math::swap<DType>(N, A+I*lda, 1, A+jb*lda, 1);
+    }
+  }
+
+  return iret;
+}
+*/
+
+
+// TODO: Test this to see if it works properly on complex. ATLAS has a separate algorithm for complex, which looks like
+// TODO: it may actually be the same one.
+//
+// This function is called ATL_rot in ATLAS 3.8.4.
+template <typename DType>
+inline void rot_helper(const int N, DType* X, const int incX, DType* Y, const int incY, const DType c, const DType s) {
+  if (c != 1 || s != 0) {
+    if (incX == 1 && incY == 1) {
+      for (int i = 0; i != N; ++i) {
+        DType tmp = X[i] * c + Y[i] * s;
+        Y[i]      = Y[i] * c - X[i] * s;
+        X[i]      = tmp;
+      }
+    } else {
+      for (int i = N; i > 0; --i, Y += incY, X += incX) {
+        DType tmp = *X * c + *Y * s;
+        *Y  = *Y * c - *X * s;
+        *X  = tmp;
+      }
+    }
+  }
+}
+
+
+/* Givens plane rotation. From ATLAS 3.8.4. */
+// FIXME: Need a specialized algorithm for Rationals. BLAS' algorithm simply will not work for most values due to the
+// FIXME: sqrt.
+template <typename DType>
+inline void rotg(DType* a, DType* b, DType* c, DType* s) {
+  DType aa    = std::abs(*a), ab = std::abs(*b);
+  DType roe   = aa > ab ? *a : *b;
+  DType scal  = aa + ab;
+
+  if (scal == 0) {
+    *c =  1;
+    *s = *a = *b = 0;
+  } else {
+    DType t0  = aa / scal, t1 = ab / scal;
+    DType r   = scal * std::sqrt(t0 * t0 + t1 * t1);
+    if (roe < 0) r = -r;
+    *c = *a / r;
+    *s = *b / r;
+    DType z   = (*c != 0) ? (1 / *c) : DType(1);
+    *a = r;
+    *b = z;
+  }
+}
+
+template <>
+inline void rotg(float* a, float* b, float* c, float* s) {
+  cblas_srotg(a, b, c, s);
+}
+
+template <>
+inline void rotg(double* a, double* b, double* c, double* s) {
+  cblas_drotg(a, b, c, s);
+}
+
+template <>
+inline void rotg(Complex64* a, Complex64* b, Complex64* c, Complex64* s) {
+  cblas_crotg(reinterpret_cast<void*>(a), reinterpret_cast<void*>(b), reinterpret_cast<void*>(c), reinterpret_cast<void*>(s));
+}
+
+template <>
+inline void rotg(Complex128* a, Complex128* b, Complex128* c, Complex128* s) {
+  cblas_zrotg(reinterpret_cast<void*>(a), reinterpret_cast<void*>(b), reinterpret_cast<void*>(c), reinterpret_cast<void*>(s));
+}
+
+template <typename DType>
+inline void cblas_rotg(void* a, void* b, void* c, void* s) {
+  rotg<DType>(reinterpret_cast<DType*>(a), reinterpret_cast<DType*>(b), reinterpret_cast<DType*>(c), reinterpret_cast<DType*>(s));
+}
+
+
+/* Applies a plane rotation. From ATLAS 3.8.4. */
+template <typename DType, typename CSDType>
+inline void rot(const int N, DType* X, const int incX, DType* Y, const int incY, const CSDType c, const CSDType s) {
+  DType *x = X, *y = Y;
+  int incx = incX, incy = incY;
+
+  if (N > 0) {
+    if (incX < 0) {
+      if (incY < 0) { incx = -incx; incy = -incy; }
+      else x += -incX * (N-1);
+    } else if (incY < 0) {
+      incy = -incy;
+      incx = -incx;
+      x += (N-1) * incX;
+    }
+    rot_helper<DType>(N, x, incx, y, incy, c, s);
+  }
+}
+
+template <>
+inline void rot(const int N, float* X, const int incX, float* Y, const int incY, const float c, const float s) {
+  cblas_srot(N, X, incX, Y, incY, (float)c, (float)s);
+}
+
+template <>
+inline void rot(const int N, double* X, const int incX, double* Y, const int incY, const double c, const double s) {
+  cblas_drot(N, X, incX, Y, incY, c, s);
+}
+
+template <>
+inline void rot(const int N, Complex64* X, const int incX, Complex64* Y, const int incY, const float c, const float s) {
+  cblas_csrot(N, X, incX, Y, incY, c, s);
+}
+
+template <>
+inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const int incY, const double c, const double s) {
+  cblas_zdrot(N, X, incX, Y, incY, c, s);
+}
+
+
+template <typename DType, typename CSDType>
+inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
+  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+}
+
+
+template <bool is_complex, typename DType>
+inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, DType* A, const int lda) {
+
+  int Nleft, Nright;
+  const DType ONE = 1;
+  DType *G, *U0 = A, *U1;
+
+  if (N > 1) {
+    Nleft = N >> 1;
+    #ifdef NB
+      if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
+    #endif
+
+    Nright = N - Nleft;
+
+    // FIXME: There's a simpler way to write this next block, but I'm way too tired to work it out right now.
+    if (uplo == CblasUpper) {
+      if (order == CblasRowMajor) {
+        G = A + Nleft;
+        U1 = G + Nleft * lda;
+      } else {
+        G = A + Nleft * lda;
+        U1 = G + Nleft;
+      }
+    } else {
+      if (order == CblasRowMajor) {
+        G = A + Nleft * lda;
+        U1 = G + Nleft;
+      } else {
+        G = A + Nleft;
+        U1 = G + Nleft * lda;
+      }
+    }
+
+    lauum<is_complex, DType>(order, uplo, Nleft, U0, lda);
+
+    if (is_complex) {
+
+      nm::math::herk<DType>(order, uplo,
+                            uplo == CblasLower ? CblasConjTrans : CblasNoTrans,
+                            Nleft, Nright, &ONE, G, lda, &ONE, U0, lda);
+
+      nm::math::trmm<DType>(order, CblasLeft, uplo, CblasConjTrans, CblasNonUnit, Nright, Nleft, &ONE, U1, lda, G, lda);
+    } else {
+      nm::math::syrk<DType>(order, uplo,
+                            uplo == CblasLower ? CblasTrans : CblasNoTrans,
+                            Nleft, Nright, &ONE, G, lda, &ONE, U0, lda);
+
+      nm::math::trmm<DType>(order, CblasLeft, uplo, CblasTrans, CblasNonUnit, Nright, Nleft, &ONE, U1, lda, G, lda);
+    }
+    lauum<is_complex, DType>(order, uplo, Nright, U1, lda);
+
+  } else {
+    *A = *A * *A;
+  }
+}
+
+
+#ifdef HAVE_CLAPACK_H
+template <bool is_complex>
+inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, float* A, const int lda) {
+  clapack_slauum(order, uplo, N, A, lda);
+}
+
+template <bool is_complex>
+inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, double* A, const int lda) {
+  clapack_dlauum(order, uplo, N, A, lda);
+}
+
+template <bool is_complex>
+inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, Complex64* A, const int lda) {
+  clapack_clauum(order, uplo, N, A, lda);
+}
+
+template <bool is_complex>
+inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, Complex128* A, const int lda) {
+  clapack_zlauum(order, uplo, N, A, lda);
+}
+#endif
+
+
+/*
+* Function signature conversion for calling LAPACK's lauum functions as directly as possible.
+*
+* For documentation: http://www.netlib.org/lapack/double/dlauum.f
+*
+* This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+*/
+template <bool is_complex, typename DType>
+inline int clapack_lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, void* a, const int lda) {
+  if (n < 0)              rb_raise(rb_eArgError, "n cannot be less than zero, is set to %d", n);
+  if (lda < n || lda < 1) rb_raise(rb_eArgError, "lda must be >= max(n,1); lda=%d, n=%d\n", lda, n);
+
+  if (uplo == CblasUpper) lauum<is_complex, DType>(order, uplo, n, reinterpret_cast<DType*>(a), lda);
+  else                    lauum<is_complex, DType>(order, uplo, n, reinterpret_cast<DType*>(a), lda);
+
+  return 0;
+}
+
+
+
+
+/*
  * Macro for declaring LAPACK specializations of the getrf function.
  *
  * type is the DType; call is the specific function to call; cast_as is what the DType* should be
  * cast to in order to pass it to LAPACK.
  */
@@ -1353,9 +2165,146 @@
 */
 template <typename DType>
 inline int clapack_getrf(const enum CBLAS_ORDER order, const int m, const int n, void* a, const int lda, int* ipiv) {
   return getrf<DType>(order, m, n, reinterpret_cast<DType*>(a), lda, ipiv);
 }
+
+
+/*
+* Function signature conversion for calling LAPACK's potrf functions as directly as possible.
+*
+* For documentation: http://www.netlib.org/lapack/double/dpotrf.f
+*
+* This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+*/
+template <typename DType>
+inline int clapack_potrf(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, void* a, const int lda) {
+  return potrf<DType>(order, uplo, n, reinterpret_cast<DType*>(a), lda);
+}
+
+
+/*
+* Function signature conversion for calling LAPACK's getrs functions as directly as possible.
+*
+* For documentation: http://www.netlib.org/lapack/double/dgetrs.f
+*
+* This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+*/
+template <typename DType>
+inline int clapack_getrs(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const int n, const int nrhs,
+                         const void* a, const int lda, const int* ipiv, void* b, const int ldb) {
+  return getrs<DType>(order, trans, n, nrhs, reinterpret_cast<const DType*>(a), lda, ipiv, reinterpret_cast<DType*>(b), ldb);
+}
+
+/*
+* Function signature conversion for calling LAPACK's potrs functions as directly as possible.
+*
+* For documentation: http://www.netlib.org/lapack/double/dpotrs.f
+*
+* This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+*/
+template <typename DType, bool is_complex>
+inline int clapack_potrs(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, const int nrhs,
+                         const void* a, const int lda, void* b, const int ldb) {
+  return potrs<DType,is_complex>(order, uplo, n, nrhs, reinterpret_cast<const DType*>(a), lda, reinterpret_cast<DType*>(b), ldb);
+}
+
+template <typename DType>
+inline int getri(const enum CBLAS_ORDER order, const int n, DType* a, const int lda, const int* ipiv) {
+  rb_raise(rb_eNotImpError, "getri not yet implemented for non-BLAS dtypes");
+  return 0;
+}
+
+#ifdef HAVE_CLAPACK_H
+template <>
+inline int getri(const enum CBLAS_ORDER order, const int n, float* a, const int lda, const int* ipiv) {
+  return clapack_sgetri(order, n, a, lda, ipiv);
+}
+
+template <>
+inline int getri(const enum CBLAS_ORDER order, const int n, double* a, const int lda, const int* ipiv) {
+  return clapack_dgetri(order, n, a, lda, ipiv);
+}
+
+template <>
+inline int getri(const enum CBLAS_ORDER order, const int n, Complex64* a, const int lda, const int* ipiv) {
+  return clapack_cgetri(order, n, reinterpret_cast<void*>(a), lda, ipiv);
+}
+
+template <>
+inline int getri(const enum CBLAS_ORDER order, const int n, Complex128* a, const int lda, const int* ipiv) {
+  return clapack_zgetri(order, n, reinterpret_cast<void*>(a), lda, ipiv);
+}
+#endif
+
+
+template <typename DType>
+inline int potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, DType* a, const int lda) {
+  rb_raise(rb_eNotImpError, "potri not yet implemented for non-BLAS dtypes");
+  return 0;
+}
+
+
+#ifdef HAVE_CLAPACK_H
+template <>
+inline int potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, float* a, const int lda) {
+  return clapack_spotri(order, uplo, n, a, lda);
+}
+
+template <>
+inline int potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, double* a, const int lda) {
+  return clapack_dpotri(order, uplo, n, a, lda);
+}
+
+template <>
+inline int potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, Complex64* a, const int lda) {
+  return clapack_cpotri(order, uplo, n, reinterpret_cast<void*>(a), lda);
+}
+
+template <>
+inline int potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, Complex128* a, const int lda) {
+  return clapack_zpotri(order, uplo, n, reinterpret_cast<void*>(a), lda);
+}
+#endif
+
+/*
+ * Function signature conversion for calling LAPACK's getri functions as directly as possible.
+ *
+ * For documentation: http://www.netlib.org/lapack/double/dgetri.f
+ *
+ * This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+ */
+template <typename DType>
+inline int clapack_getri(const enum CBLAS_ORDER order, const int n, void* a, const int lda, const int* ipiv) {
+  return getri<DType>(order, n, reinterpret_cast<DType*>(a), lda, ipiv);
+}
+
+
+/*
+ * Function signature conversion for calling LAPACK's potri functions as directly as possible.
+ *
+ * For documentation: http://www.netlib.org/lapack/double/dpotri.f
+ *
+ * This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+ */
+template <typename DType>
+inline int clapack_potri(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int n, void* a, const int lda) {
+  return potri<DType>(order, uplo, n, reinterpret_cast<DType*>(a), lda);
+}
+
+
+/*
+* Function signature conversion for calling LAPACK's laswp functions as directly as possible.
+*
+* For documentation: http://www.netlib.org/lapack/double/dlaswp.f
+*
+* This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
+*/
+template <typename DType>
+inline void clapack_laswp(const int n, void* a, const int lda, const int k1, const int k2, const int* ipiv, const int incx) {
+  laswp<DType>(n, reinterpret_cast<DType*>(a), lda, k1, k2, ipiv, incx);
+}
+
 
 
 }} // end namespace nm::math