math.h in nmatrix-0.0.5

- old
+ new

@@ -1024,51 +1024,49 @@
 }
 
 
 // Yale: numeric matrix multiply c=a*b
 template <typename DType, typename IType>
-inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
+inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
             const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
-  IType next[m];
-  DType sums[m];
+  const unsigned int max_lmn = std::max(std::max(m, n), l);
+  IType next[max_lmn];
+  DType sums[max_lmn];
 
   DType v;
 
   IType head, length, temp, ndnz = 0;
-  IType jj_start, jj_end, kk_start, kk_end;
-  IType i, j, k, kk, jj;
   IType minmn = std::min(m,n);
+  IType minlm = std::min(l,m);
 
-  for (i = 0; i < m; ++i) { // initialize scratch arrays
-    next[i] = std::numeric_limits<IType>::max();
-    sums[i] = 0;
+  for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
+    next[idx] = std::numeric_limits<IType>::max();
+    sums[idx] = 0;
   }
 
-  for (i = 0; i < n; ++i) { // walk down the rows
+  for (IType i = 0; i < n; ++i) { // walk down the rows
     head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
     length = 0;
 
-    jj_start = ia[i];
-    jj_end   = ia[i+1];
+    for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
+      IType j;
 
-    for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row
-
-      if (jj == jj_end) { // if we're in the last entry for this row:
+      if (jj == ia[i+1]) { // if we're in the last entry for this row:
         if (!diaga || i >= minmn) continue;
         j   = i;      // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
         v   = a[i];
       } else {
         j   = ja[jj]; // if it's not the last entry for this row, get the column (j) and entry (ajj)
         v   = a[jj];
       }
 
-      kk_start = ib[j];   // Find the first entry of row j of matrix B
-      kk_end   = ib[j+1];
-      for (kk = kk_start; kk <= kk_end; ++kk) {
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
 
-        if (kk == kk_end) { // Get the column id for that entry
-          if (!diagb || j >= minmn) continue;
+        IType k;
+
+        if (kk == ib[j+1]) { // Get the column id for that entry
+          if (!diagb || j >= minlm) continue;
           k  = j;
           sums[k] += v*b[k];
         } else {
           k  = jb[kk];
           sums[k] += v*b[kk];
@@ -1077,14 +1075,14 @@
         if (next[k] == std::numeric_limits<IType>::max()) {
           next[k] = head;
           head    = k;
           ++length;
         }
-      }
-    }
+      } // end of kk loop
+    } // end of jj loop
 
-    for (jj = 0; jj < length; ++jj) {
+    for (IType jj = 0; jj < length; ++jj) {
       if (sums[head] != 0) {
         if (diagc && head == i) {
           c[head] = sums[head];
         } else {
           jc[n+1+ndnz] = head;
@@ -1103,26 +1101,68 @@
     ic[i+1] = n+1+ndnz;
   }
 } /* numbmm_ */
 
 
+/*
+template <typename DType, typename IType>
+inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
+  unsigned int n = c_storage->shape[0],
+               l = c_storage->shape[1];
 
+  // Create a working vector of dimension max(m,l,n) and initial value IType::max():
+  std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
+
+  for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
+
+    IType j, k;
+    size_t ndnz;
+
+    for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
+      j = (jj == ija[i+1]) ? i : ija[jj];   // Get the current column index (handle diagonals last)
+
+      if (j >= m) {
+        if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
+        else              break;
+      }
+
+      for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
+        if (j >= m) continue; // first of all, does B *have* a row j?
+        k = (kk == ijb[j+1]) ? j : ijb[kk];   // Get the current column index (handle diagonals last)
+
+        if (k >= l) {
+          if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
+          else              break;
+        }
+
+        if (mask[k] == )
+      }
+
+    }
+  }
+}
+*/
+
 // Yale: Symbolic matrix multiply c=a*b
 template <typename IType>
-inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
+inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
             const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
-  IType mask[m];
-  IType j, k, ndnz = n; /* Local variables */
+  unsigned int max_lmn = std::max(std::max(m,n), l);
+  IType mask[max_lmn];  // INDEX in the SMMP paper.
+  IType j, k; /* Local variables */
+  size_t ndnz = n;
 
+  for (IType idx = 0; idx < max_lmn; ++idx)
+    mask[idx] = std::numeric_limits<IType>::max();
 
-  for (j = 0; j < m; ++j)
-    mask[j] = std::numeric_limits<IType>::max();
+  if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
+    if (diagc)  ic[0] = n+1;
+    else        ic[0] = 0;
+  }
 
-  if (diagc)  ic[0] = n+1;
-  else        ic[0] = 0;
-
   IType minmn = std::min(m,n);
+  IType minlm = std::min(l,m);
 
   for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
 
     for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // merge row lists, walking through columns in each row
 
@@ -1130,82 +1170,155 @@
       if (jj == ia[i+1]) { // Don't really do it the last time -- just handle diagonals in a new yale matrix.
         if (!diaga || i >= minmn) continue;
         j = i;
       } else j = ja[jj];
 
-      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
         if (kk == ib[j+1]) {
-          if (!diagb || j >= minmn) continue;
+          if (!diagb || j >= minlm) continue;
           k = j;
         } else k = jb[kk];
 
         if (mask[k] != i) {
           mask[k] = i;
           ++ndnz;
         }
       }
     }
 
-    if (diagc && !mask[i]) --ndnz;
+    if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
 
-    ic[i+1] = ndnz;
+    if (ic) ic[i+1] = ndnz;
   }
+
+  return ndnz;
 } /* symbmm_ */
 
 
-//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
-// Remember, we're dealing with unique keys, which simplifies things.
-// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
-template <typename DType, typename IType>
-inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
-  IType jj, min, min_jj;
-  DType temp_val;
+// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
+namespace smmp_sort {
+  const size_t THRESHOLD = 4;  // switch to insertion sort for 4 elements or fewer
 
-  for (size_t i = 0; i < n; ++i) {
-    // No need to sort if there are 0 or 1 entries in the row
-    if (ia[i+1] - ia[i] < 2) continue;
+  template <typename DType, typename IType>
+  void print_array(DType* vals, IType* array, IType left, IType right) {
+    for (IType i = left; i <= right; ++i) {
+      std::cerr << array[i] << ":" << vals[i] << "  ";
+    }
+    std::cerr << std::endl;
+  }
 
-    for (IType jj_start = ia[i]; jj_start < ia[i+1]; ++jj_start) {
+  template <typename DType, typename IType>
+  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
+    IType pivotJ = array[pivot];
+    DType pivotV = vals[pivot];
 
-      // If the previous min is just current-1, this key/value pair is already in sorted order.
-      // This follows from the unique condition on our column keys.
-      if (jj_start > ia[i] && min+1 == ja[jj_start]) {
-        min    = ja[jj_start];
-        continue;
+    // Swap pivot and right
+    array[pivot] = array[right];
+    vals[pivot]  = vals[right];
+    array[right] = pivotJ;
+    vals[right]  = pivotV;
+
+    IType store = left;
+    for (IType idx = left; idx < right; ++idx) {
+      if (array[idx] <= pivotJ) {
+        // Swap i and store
+        std::swap(array[idx], array[store]);
+        std::swap(vals[idx],  vals[store]);
+        ++store;
       }
+    }
 
-      // find the minimum key (column index) between jj_start and ia[i+1]
-      min    = ja[jj_start];
-      min_jj = jj_start;
-      for (jj = jj_start+1; jj < ia[i+1]; ++jj) {
-        if (ja[jj] < min) {
-          min_jj = jj;
-          min    = ja[jj];
-        }
+    std::swap(array[store], array[right]);
+    std::swap(vals[store],  vals[right]);
+
+    return store;
+  }
+
+  // Recommended to use the median of left, right, and mid for the pivot.
+  template <typename IType>
+  IType median(IType a, IType b, IType c) {
+    if (a < b) {
+      if (b < c) return b; // a b c
+      if (a < c) return c; // a c b
+                 return a; // c a b
+
+    } else { // a > b
+      if (a < c) return a; // b a c
+      if (b < c) return c; // b c a
+                 return b; // c b a
+    }
+  }
+
+
+  // Insertion sort is more efficient than quicksort for small N
+  template <typename DType, typename IType>
+  void insertion_sort(DType* vals, IType* array, IType left, IType right) {
+    for (IType idx = left; idx <= right; ++idx) {
+      IType col_to_insert = array[idx];
+      DType val_to_insert = vals[idx];
+
+      IType hole_pos = idx;
+      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
+        array[hole_pos] = array[hole_pos - 1];  // shift the larger column index up
+        vals[hole_pos]  = vals[hole_pos - 1];   // value goes along with it
       }
 
-      // if min is already first, skip this iteration
-      if (min_jj == jj_start) continue;
+      array[hole_pos] = col_to_insert;
+      vals[hole_pos]  = val_to_insert;
+    }
+  }
 
-      for (jj = jj_start; jj < ia[i+1]; ++jj) {
-        // swap minimum key/value pair with key/value pair in the first position.
-        if (min_jj != jj) {
-          // min already = ja[min_jj], so use this as temp_key
-          temp_val = a[min_jj];
 
-          ja[min_jj] = ja[jj];
-          a[min_jj] = a[jj];
+  template <typename DType, typename IType>
+  void quicksort(DType* vals, IType* array, IType left, IType right) {
 
-          ja[jj] = min;
-          a[jj] = temp_val;
-        }
+    if (left < right) {
+      if (right - left < THRESHOLD) {
+        insertion_sort(vals, array, left, right);
+      } else {
+        // choose any pivot such that left < pivot < right
+        IType pivot = median(left, right, (IType)(((unsigned long)left + (unsigned long)right) / 2));
+        pivot = partition(vals, array, left, right, pivot);
+
+        // recursively sort elements smaller than the pivot
+        quicksort<DType,IType>(vals, array, left, pivot-1);
+
+        // recursively sort elements at least as big as the pivot
+        quicksort<DType,IType>(vals, array, pivot+1, right);
       }
     }
   }
+
+
+}; // end of namespace smmp_sort
+
+
+/*
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
+ * about stability).
+ *
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
+ *
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
+ * sort.
+ */
+template <typename DType, typename IType>
+inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
+  for (size_t i = 0; i < n; ++i) {
+    if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements.
+    else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
+      smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
+    } else {
+      smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1);      // faster for large rows (and may call insertion_sort as well)
+    }
+  }
 }
 
 
+
 /*
  * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
  *
  * Based on transp from SMMP (same as symbmm and numbmm).
  *
@@ -2023,10 +2136,197 @@
 }
 
 
 template <typename DType, typename CSDType>
 inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
-  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
+                       *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+}
+
+/*
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
+ #
+ * Based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+ReturnDType nrm2(const int N, const DType* X, const int incX) {
+  const DType ONE = 1, ZERO = 0;
+  typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
+
+
+  if ((N < 1) || (incX < 1))    return ZERO;
+  else if (N == 1)              return std::abs(X[0]);
+
+  for (int i = 0; i < N; ++i) {
+    absxi = std::abs(X[i*incX]);
+    if (scale < absxi) {
+      temp  = scale / absxi;
+      scale = absxi;
+      ssq   = ONE + ssq * (temp * temp);
+    } else {
+      temp = absxi / scale;
+      ssq += temp * temp;
+    }
+  }
+
+  return scale * std::sqrt( ssq );
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float nrm2(const int N, const float* X, const int incX) {
+  return cblas_snrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const double* X, const int incX) {
+  return cblas_dnrm2(N, X, incX);
+}
+
+template <>
+inline float nrm2(const int N, const Complex64* X, const int incX) {
+  return cblas_scnrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const Complex128* X, const int incX) {
+  return cblas_dznrm2(N, X, incX);
+}
+#else
+template <typename FloatDType>
+static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
+  double absx = std::abs(xr);
+  if (scale < absx) {
+    double temp  = scale / absx;
+    scale = absx;
+    ssq   = 1.0 + ssq * (temp * temp);
+  } else {
+    double temp = absx / scale;
+    ssq += temp * temp;
+  }
+
+  absx = std::abs(xi);
+  if (scale < absx) {
+    double temp  = scale / absx;
+    scale = absx;
+    ssq   = 1.0 + ssq * (temp * temp);
+  } else {
+    double temp = absx / scale;
+    ssq += temp * temp;
+  }
+}
+
+template <>
+float nrm2(const int N, const Complex64* X, const int incX) {
+  double scale = 0, ssq = 1, temp;
+
+  if ((N < 1) || (incX < 1))    return 0.0;
+
+  for (int i = 0; i < N; ++i) {
+    nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
+  }
+
+  return scale * std::sqrt( ssq );
+}
+
+template <>
+double nrm2(const int N, const Complex128* X, const int incX) {
+  double scale = 0, ssq = 1, temp;
+
+  if ((N < 1) || (incX < 1))    return 0.0;
+
+  for (int i = 0; i < N; ++i) {
+    nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
+  }
+
+  return scale * std::sqrt( ssq );
+}
+#endif
+
+template <typename ReturnDType, typename DType>
+inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
+  *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
+}
+
+/*
+ * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
+ * the routine sums the absolute values of the real and imaginary components as well.
+ *
+ * So, based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+inline ReturnDType asum(const int N, const DType* X, const int incX) {
+  ReturnDType sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX]);
+    }
+  }
+  return sum;
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float asum(const int N, const float* X, const int incX) {
+  return cblas_sasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const double* X, const int incX) {
+  return cblas_dasum(N, X, incX);
+}
+
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+  return cblas_scasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+  return cblas_dzasum(N, X, incX);
+}
+#else
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+  float sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+    }
+  }
+  return sum;
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+  double sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+    }
+  }
+  return sum;
+}
+#endif
+
+
+template <typename ReturnDType, typename DType>
+inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
+  *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
 }
 
 
 template <bool is_complex, typename DType>
 inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, DType* A, const int lda) {