ext/nmatrix/util/math.h in nmatrix-0.0.4 vs ext/nmatrix/util/math.h in nmatrix-0.0.5
- old
+ new
@@ -1024,51 +1024,49 @@
}
// Yale: numeric matrix multiply c=a*b
template <typename DType, typename IType>
-inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
+inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
- IType next[m];
- DType sums[m];
+ const unsigned int max_lmn = std::max(std::max(m, n), l);
+ IType next[max_lmn];
+ DType sums[max_lmn];
DType v;
IType head, length, temp, ndnz = 0;
- IType jj_start, jj_end, kk_start, kk_end;
- IType i, j, k, kk, jj;
IType minmn = std::min(m,n);
+ IType minlm = std::min(l,m);
- for (i = 0; i < m; ++i) { // initialize scratch arrays
- next[i] = std::numeric_limits<IType>::max();
- sums[i] = 0;
+ for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
+ next[idx] = std::numeric_limits<IType>::max();
+ sums[idx] = 0;
}
- for (i = 0; i < n; ++i) { // walk down the rows
+ for (IType i = 0; i < n; ++i) { // walk down the rows
head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
length = 0;
- jj_start = ia[i];
- jj_end = ia[i+1];
+ for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
+ IType j;
- for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row
-
- if (jj == jj_end) { // if we're in the last entry for this row:
+ if (jj == ia[i+1]) { // if we're in the last entry for this row:
if (!diaga || i >= minmn) continue;
j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
v = a[i];
} else {
j = ja[jj]; // if it's not the last entry for this row, get the column (j) and entry (ajj)
v = a[jj];
}
- kk_start = ib[j]; // Find the first entry of row j of matrix B
- kk_end = ib[j+1];
- for (kk = kk_start; kk <= kk_end; ++kk) {
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
- if (kk == kk_end) { // Get the column id for that entry
- if (!diagb || j >= minmn) continue;
+ IType k;
+
+ if (kk == ib[j+1]) { // Get the column id for that entry
+ if (!diagb || j >= minlm) continue;
k = j;
sums[k] += v*b[k];
} else {
k = jb[kk];
sums[k] += v*b[kk];
@@ -1077,14 +1075,14 @@
if (next[k] == std::numeric_limits<IType>::max()) {
next[k] = head;
head = k;
++length;
}
- }
- }
+ } // end of kk loop
+ } // end of jj loop
- for (jj = 0; jj < length; ++jj) {
+ for (IType jj = 0; jj < length; ++jj) {
if (sums[head] != 0) {
if (diagc && head == i) {
c[head] = sums[head];
} else {
jc[n+1+ndnz] = head;
@@ -1103,26 +1101,68 @@
ic[i+1] = n+1+ndnz;
}
} /* numbmm_ */
+/*
+template <typename DType, typename IType>
+inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
+ unsigned int n = c_storage->shape[0],
+ l = c_storage->shape[1];
+ // Create a working vector of dimension max(m,l,n) and initial value IType::max():
+ std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
+
+ for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
+
+ IType j, k;
+ size_t ndnz;
+
+ for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
+ j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last)
+
+ if (j >= m) {
+ if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
+ else break;
+ }
+
+ for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
+ if (j >= m) continue; // first of all, does B *have* a row j?
+ k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last)
+
+ if (k >= l) {
+ if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
+ else break;
+ }
+
+ if (mask[k] == )
+ }
+
+ }
+ }
+}
+*/
+
// Yale: Symbolic matrix multiply c=a*b
template <typename IType>
-inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
+inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
- IType mask[m];
- IType j, k, ndnz = n; /* Local variables */
+ unsigned int max_lmn = std::max(std::max(m,n), l);
+ IType mask[max_lmn]; // INDEX in the SMMP paper.
+ IType j, k; /* Local variables */
+ size_t ndnz = n;
+ for (IType idx = 0; idx < max_lmn; ++idx)
+ mask[idx] = std::numeric_limits<IType>::max();
- for (j = 0; j < m; ++j)
- mask[j] = std::numeric_limits<IType>::max();
+ if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
+ if (diagc) ic[0] = n+1;
+ else ic[0] = 0;
+ }
- if (diagc) ic[0] = n+1;
- else ic[0] = 0;
-
IType minmn = std::min(m,n);
+ IType minlm = std::min(l,m);
for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // merge row lists, walking through columns in each row
@@ -1130,82 +1170,155 @@
if (jj == ia[i+1]) { // Don't really do it the last time -- just handle diagonals in a new yale matrix.
if (!diaga || i >= minmn) continue;
j = i;
} else j = ja[jj];
- for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
if (kk == ib[j+1]) {
- if (!diagb || j >= minmn) continue;
+ if (!diagb || j >= minlm) continue;
k = j;
} else k = jb[kk];
if (mask[k] != i) {
mask[k] = i;
++ndnz;
}
}
}
- if (diagc && !mask[i]) --ndnz;
+ if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
- ic[i+1] = ndnz;
+ if (ic) ic[i+1] = ndnz;
}
+
+ return ndnz;
} /* symbmm_ */
-//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
-// Remember, we're dealing with unique keys, which simplifies things.
-// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
-template <typename DType, typename IType>
-inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
- IType jj, min, min_jj;
- DType temp_val;
+// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
+namespace smmp_sort {
+ const size_t THRESHOLD = 4; // switch to insertion sort for 4 elements or fewer
- for (size_t i = 0; i < n; ++i) {
- // No need to sort if there are 0 or 1 entries in the row
- if (ia[i+1] - ia[i] < 2) continue;
+ template <typename DType, typename IType>
+ void print_array(DType* vals, IType* array, IType left, IType right) {
+ for (IType i = left; i <= right; ++i) {
+ std::cerr << array[i] << ":" << vals[i] << " ";
+ }
+ std::cerr << std::endl;
+ }
- for (IType jj_start = ia[i]; jj_start < ia[i+1]; ++jj_start) {
+ template <typename DType, typename IType>
+ IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
+ IType pivotJ = array[pivot];
+ DType pivotV = vals[pivot];
- // If the previous min is just current-1, this key/value pair is already in sorted order.
- // This follows from the unique condition on our column keys.
- if (jj_start > ia[i] && min+1 == ja[jj_start]) {
- min = ja[jj_start];
- continue;
+ // Swap pivot and right
+ array[pivot] = array[right];
+ vals[pivot] = vals[right];
+ array[right] = pivotJ;
+ vals[right] = pivotV;
+
+ IType store = left;
+ for (IType idx = left; idx < right; ++idx) {
+ if (array[idx] <= pivotJ) {
+ // Swap i and store
+ std::swap(array[idx], array[store]);
+ std::swap(vals[idx], vals[store]);
+ ++store;
}
+ }
- // find the minimum key (column index) between jj_start and ia[i+1]
- min = ja[jj_start];
- min_jj = jj_start;
- for (jj = jj_start+1; jj < ia[i+1]; ++jj) {
- if (ja[jj] < min) {
- min_jj = jj;
- min = ja[jj];
- }
+ std::swap(array[store], array[right]);
+ std::swap(vals[store], vals[right]);
+
+ return store;
+ }
+
+ // Recommended to use the median of left, right, and mid for the pivot.
+ template <typename IType>
+ IType median(IType a, IType b, IType c) {
+ if (a < b) {
+ if (b < c) return b; // a b c
+ if (a < c) return c; // a c b
+ return a; // c a b
+
+ } else { // a > b
+ if (a < c) return a; // b a c
+ if (b < c) return c; // b c a
+ return b; // c b a
+ }
+ }
+
+
+ // Insertion sort is more efficient than quicksort for small N
+ template <typename DType, typename IType>
+ void insertion_sort(DType* vals, IType* array, IType left, IType right) {
+ for (IType idx = left; idx <= right; ++idx) {
+ IType col_to_insert = array[idx];
+ DType val_to_insert = vals[idx];
+
+ IType hole_pos = idx;
+ for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
+ array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up
+ vals[hole_pos] = vals[hole_pos - 1]; // value goes along with it
}
- // if min is already first, skip this iteration
- if (min_jj == jj_start) continue;
+ array[hole_pos] = col_to_insert;
+ vals[hole_pos] = val_to_insert;
+ }
+ }
- for (jj = jj_start; jj < ia[i+1]; ++jj) {
- // swap minimum key/value pair with key/value pair in the first position.
- if (min_jj != jj) {
- // min already = ja[min_jj], so use this as temp_key
- temp_val = a[min_jj];
- ja[min_jj] = ja[jj];
- a[min_jj] = a[jj];
+ template <typename DType, typename IType>
+ void quicksort(DType* vals, IType* array, IType left, IType right) {
- ja[jj] = min;
- a[jj] = temp_val;
- }
+ if (left < right) {
+ if (right - left < THRESHOLD) {
+ insertion_sort(vals, array, left, right);
+ } else {
+ // choose any pivot such that left < pivot < right
+ IType pivot = median(left, right, (IType)(((unsigned long)left + (unsigned long)right) / 2));
+ pivot = partition(vals, array, left, right, pivot);
+
+ // recursively sort elements smaller than the pivot
+ quicksort<DType,IType>(vals, array, left, pivot-1);
+
+ // recursively sort elements at least as big as the pivot
+ quicksort<DType,IType>(vals, array, pivot+1, right);
}
}
}
+
+
+}; // end of namespace smmp_sort
+
+
+/*
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
+ * about stability).
+ *
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
+ *
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
+ * sort.
+ */
+template <typename DType, typename IType>
+inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
+ for (size_t i = 0; i < n; ++i) {
+ if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements.
+ else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
+ smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
+ } else {
+ smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well)
+ }
+ }
}
+
/*
* Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
*
* Based on transp from SMMP (same as symbmm and numbmm).
*
@@ -2023,10 +2136,197 @@
}
template <typename DType, typename CSDType>
inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
- rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+ rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
+ *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+}
+
+/*
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
+ #
+ * Based on input types, these are the valid return types:
+ * int -> int
+ * float -> float or double
+ * double -> double
+ * complex64 -> float or double
+ * complex128 -> double
+ * rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+ReturnDType nrm2(const int N, const DType* X, const int incX) {
+ const DType ONE = 1, ZERO = 0;
+ typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
+
+
+ if ((N < 1) || (incX < 1)) return ZERO;
+ else if (N == 1) return std::abs(X[0]);
+
+ for (int i = 0; i < N; ++i) {
+ absxi = std::abs(X[i*incX]);
+ if (scale < absxi) {
+ temp = scale / absxi;
+ scale = absxi;
+ ssq = ONE + ssq * (temp * temp);
+ } else {
+ temp = absxi / scale;
+ ssq += temp * temp;
+ }
+ }
+
+ return scale * std::sqrt( ssq );
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float nrm2(const int N, const float* X, const int incX) {
+ return cblas_snrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const double* X, const int incX) {
+ return cblas_dnrm2(N, X, incX);
+}
+
+template <>
+inline float nrm2(const int N, const Complex64* X, const int incX) {
+ return cblas_scnrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const Complex128* X, const int incX) {
+ return cblas_dznrm2(N, X, incX);
+}
+#else
+template <typename FloatDType>
+static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
+ double absx = std::abs(xr);
+ if (scale < absx) {
+ double temp = scale / absx;
+ scale = absx;
+ ssq = 1.0 + ssq * (temp * temp);
+ } else {
+ double temp = absx / scale;
+ ssq += temp * temp;
+ }
+
+ absx = std::abs(xi);
+ if (scale < absx) {
+ double temp = scale / absx;
+ scale = absx;
+ ssq = 1.0 + ssq * (temp * temp);
+ } else {
+ double temp = absx / scale;
+ ssq += temp * temp;
+ }
+}
+
+template <>
+float nrm2(const int N, const Complex64* X, const int incX) {
+ double scale = 0, ssq = 1, temp;
+
+ if ((N < 1) || (incX < 1)) return 0.0;
+
+ for (int i = 0; i < N; ++i) {
+ nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
+ }
+
+ return scale * std::sqrt( ssq );
+}
+
+template <>
+double nrm2(const int N, const Complex128* X, const int incX) {
+ double scale = 0, ssq = 1, temp;
+
+ if ((N < 1) || (incX < 1)) return 0.0;
+
+ for (int i = 0; i < N; ++i) {
+ nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
+ }
+
+ return scale * std::sqrt( ssq );
+}
+#endif
+
+template <typename ReturnDType, typename DType>
+inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
+ *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
+}
+
+/*
+ * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
+ * the routine sums the absolute values of the real and imaginary components as well.
+ *
+ * So, based on input types, these are the valid return types:
+ * int -> int
+ * float -> float or double
+ * double -> double
+ * complex64 -> float or double
+ * complex128 -> double
+ * rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+inline ReturnDType asum(const int N, const DType* X, const int incX) {
+ ReturnDType sum = 0;
+ if ((N > 0) && (incX > 0)) {
+ for (int i = 0; i < N; ++i) {
+ sum += std::abs(X[i*incX]);
+ }
+ }
+ return sum;
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float asum(const int N, const float* X, const int incX) {
+ return cblas_sasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const double* X, const int incX) {
+ return cblas_dasum(N, X, incX);
+}
+
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+ return cblas_scasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+ return cblas_dzasum(N, X, incX);
+}
+#else
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+ float sum = 0;
+ if ((N > 0) && (incX > 0)) {
+ for (int i = 0; i < N; ++i) {
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+ }
+ }
+ return sum;
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+ double sum = 0;
+ if ((N > 0) && (incX > 0)) {
+ for (int i = 0; i < N; ++i) {
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+ }
+ }
+ return sum;
+}
+#endif
+
+
+template <typename ReturnDType, typename DType>
+inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
+ *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
}
template <bool is_complex, typename DType>
inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, DType* A, const int lda) {