ext/nmatrix/util/math.h in nmatrix-0.0.4 vs ext/nmatrix/util/math.h in nmatrix-0.0.5

- old
+ new

@@ -1024,51 +1024,49 @@ } // Yale: numeric matrix multiply c=a*b template <typename DType, typename IType> -inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga, +inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga, const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) { - IType next[m]; - DType sums[m]; + const unsigned int max_lmn = std::max(std::max(m, n), l); + IType next[max_lmn]; + DType sums[max_lmn]; DType v; IType head, length, temp, ndnz = 0; - IType jj_start, jj_end, kk_start, kk_end; - IType i, j, k, kk, jj; IType minmn = std::min(m,n); + IType minlm = std::min(l,m); - for (i = 0; i < m; ++i) { // initialize scratch arrays - next[i] = std::numeric_limits<IType>::max(); - sums[i] = 0; + for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays + next[idx] = std::numeric_limits<IType>::max(); + sums[idx] = 0; } - for (i = 0; i < n; ++i) { // walk down the rows + for (IType i = 0; i < n; ++i) { // walk down the rows head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited length = 0; - jj_start = ia[i]; - jj_end = ia[i+1]; + for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row + IType j; - for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row - - if (jj == jj_end) { // if we're in the last entry for this row: + if (jj == ia[i+1]) { // if we're in the last entry for this row: if (!diaga || i >= minmn) continue; j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj) v = a[i]; } else { j = ja[jj]; // if it's not the last entry for this row, get the column (j) and entry (ajj) v = a[jj]; } - kk_start = ib[j]; // Find the first entry of row j of matrix B - kk_end = ib[j+1]; - for (kk = kk_start; kk <= kk_end; ++kk) { + for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { - if (kk == kk_end) { // Get the column id for that entry - if (!diagb || j >= minmn) continue; + IType k; + + if (kk == ib[j+1]) { // Get the column id for that entry + if (!diagb || j >= minlm) continue; k = j; sums[k] += v*b[k]; } else { k = jb[kk]; sums[k] += v*b[kk]; @@ -1077,14 +1075,14 @@ if (next[k] == std::numeric_limits<IType>::max()) { next[k] = head; head = k; ++length; } - } - } + } // end of kk loop + } // end of jj loop - for (jj = 0; jj < length; ++jj) { + for (IType jj = 0; jj < length; ++jj) { if (sums[head] != 0) { if (diagc && head == i) { c[head] = sums[head]; } else { jc[n+1+ndnz] = head; @@ -1103,26 +1101,68 @@ ic[i+1] = n+1+ndnz; } } /* numbmm_ */ +/* +template <typename DType, typename IType> +inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) { + unsigned int n = c_storage->shape[0], + l = c_storage->shape[1]; + // Create a working vector of dimension max(m,l,n) and initial value IType::max(): + std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max()); + + for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i| + + IType j, k; + size_t ndnz; + + for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A + j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last) + + if (j >= m) { + if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j); + else break; + } + + for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B + if (j >= m) continue; // first of all, does B *have* a row j? + k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last) + + if (k >= l) { + if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k); + else break; + } + + if (mask[k] == ) + } + + } + } +} +*/ + // Yale: Symbolic matrix multiply c=a*b template <typename IType> -inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga, +inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga, const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) { - IType mask[m]; - IType j, k, ndnz = n; /* Local variables */ + unsigned int max_lmn = std::max(std::max(m,n), l); + IType mask[max_lmn]; // INDEX in the SMMP paper. + IType j, k; /* Local variables */ + size_t ndnz = n; + for (IType idx = 0; idx < max_lmn; ++idx) + mask[idx] = std::numeric_limits<IType>::max(); - for (j = 0; j < m; ++j) - mask[j] = std::numeric_limits<IType>::max(); + if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting. + if (diagc) ic[0] = n+1; + else ic[0] = 0; + } - if (diagc) ic[0] = n+1; - else ic[0] = 0; - IType minmn = std::min(m,n); + IType minlm = std::min(l,m); for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // merge row lists, walking through columns in each row @@ -1130,82 +1170,155 @@ if (jj == ia[i+1]) { // Don't really do it the last time -- just handle diagonals in a new yale matrix. if (!diaga || i >= minmn) continue; j = i; } else j = ja[jj]; - for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B. + for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B. if (kk == ib[j+1]) { - if (!diagb || j >= minmn) continue; + if (!diagb || j >= minlm) continue; k = j; } else k = jb[kk]; if (mask[k] != i) { mask[k] = i; ++ndnz; } } } - if (diagc && !mask[i]) --ndnz; + if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz; - ic[i+1] = ndnz; + if (ic) ic[i+1] = ndnz; } + + return ndnz; } /* symbmm_ */ -//TODO: More efficient sorting algorithm than selection sort would be nice, probably. -// Remember, we're dealing with unique keys, which simplifies things. -// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place. -template <typename DType, typename IType> -inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) { - IType jj, min, min_jj; - DType temp_val; +// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right. +namespace smmp_sort { + const size_t THRESHOLD = 4; // switch to insertion sort for 4 elements or fewer - for (size_t i = 0; i < n; ++i) { - // No need to sort if there are 0 or 1 entries in the row - if (ia[i+1] - ia[i] < 2) continue; + template <typename DType, typename IType> + void print_array(DType* vals, IType* array, IType left, IType right) { + for (IType i = left; i <= right; ++i) { + std::cerr << array[i] << ":" << vals[i] << " "; + } + std::cerr << std::endl; + } - for (IType jj_start = ia[i]; jj_start < ia[i+1]; ++jj_start) { + template <typename DType, typename IType> + IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) { + IType pivotJ = array[pivot]; + DType pivotV = vals[pivot]; - // If the previous min is just current-1, this key/value pair is already in sorted order. - // This follows from the unique condition on our column keys. - if (jj_start > ia[i] && min+1 == ja[jj_start]) { - min = ja[jj_start]; - continue; + // Swap pivot and right + array[pivot] = array[right]; + vals[pivot] = vals[right]; + array[right] = pivotJ; + vals[right] = pivotV; + + IType store = left; + for (IType idx = left; idx < right; ++idx) { + if (array[idx] <= pivotJ) { + // Swap i and store + std::swap(array[idx], array[store]); + std::swap(vals[idx], vals[store]); + ++store; } + } - // find the minimum key (column index) between jj_start and ia[i+1] - min = ja[jj_start]; - min_jj = jj_start; - for (jj = jj_start+1; jj < ia[i+1]; ++jj) { - if (ja[jj] < min) { - min_jj = jj; - min = ja[jj]; - } + std::swap(array[store], array[right]); + std::swap(vals[store], vals[right]); + + return store; + } + + // Recommended to use the median of left, right, and mid for the pivot. + template <typename IType> + IType median(IType a, IType b, IType c) { + if (a < b) { + if (b < c) return b; // a b c + if (a < c) return c; // a c b + return a; // c a b + + } else { // a > b + if (a < c) return a; // b a c + if (b < c) return c; // b c a + return b; // c b a + } + } + + + // Insertion sort is more efficient than quicksort for small N + template <typename DType, typename IType> + void insertion_sort(DType* vals, IType* array, IType left, IType right) { + for (IType idx = left; idx <= right; ++idx) { + IType col_to_insert = array[idx]; + DType val_to_insert = vals[idx]; + + IType hole_pos = idx; + for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) { + array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up + vals[hole_pos] = vals[hole_pos - 1]; // value goes along with it } - // if min is already first, skip this iteration - if (min_jj == jj_start) continue; + array[hole_pos] = col_to_insert; + vals[hole_pos] = val_to_insert; + } + } - for (jj = jj_start; jj < ia[i+1]; ++jj) { - // swap minimum key/value pair with key/value pair in the first position. - if (min_jj != jj) { - // min already = ja[min_jj], so use this as temp_key - temp_val = a[min_jj]; - ja[min_jj] = ja[jj]; - a[min_jj] = a[jj]; + template <typename DType, typename IType> + void quicksort(DType* vals, IType* array, IType left, IType right) { - ja[jj] = min; - a[jj] = temp_val; - } + if (left < right) { + if (right - left < THRESHOLD) { + insertion_sort(vals, array, left, right); + } else { + // choose any pivot such that left < pivot < right + IType pivot = median(left, right, (IType)(((unsigned long)left + (unsigned long)right) / 2)); + pivot = partition(vals, array, left, right, pivot); + + // recursively sort elements smaller than the pivot + quicksort<DType,IType>(vals, array, left, pivot-1); + + // recursively sort elements at least as big as the pivot + quicksort<DType,IType>(vals, array, pivot+1, right); } } } + + +}; // end of namespace smmp_sort + + +/* + * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index. + * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care + * about stability). + * + * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place. + * + * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random + * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion + * sort. + */ +template <typename DType, typename IType> +inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) { + for (size_t i = 0; i < n; ++i) { + if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements. + else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) { + smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows + } else { + smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well) + } + } } + /* * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true. * * Based on transp from SMMP (same as symbmm and numbmm). * @@ -2023,10 +2136,197 @@ } template <typename DType, typename CSDType> inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) { - rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s)); + rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, + *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s)); +} + +/* + * Level 1 BLAS routine which returns the 2-norm of an n-vector x. + # + * Based on input types, these are the valid return types: + * int -> int + * float -> float or double + * double -> double + * complex64 -> float or double + * complex128 -> double + * rational -> rational + */ +template <typename ReturnDType, typename DType> +ReturnDType nrm2(const int N, const DType* X, const int incX) { + const DType ONE = 1, ZERO = 0; + typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp; + + + if ((N < 1) || (incX < 1)) return ZERO; + else if (N == 1) return std::abs(X[0]); + + for (int i = 0; i < N; ++i) { + absxi = std::abs(X[i*incX]); + if (scale < absxi) { + temp = scale / absxi; + scale = absxi; + ssq = ONE + ssq * (temp * temp); + } else { + temp = absxi / scale; + ssq += temp * temp; + } + } + + return scale * std::sqrt( ssq ); +} + + +#ifdef HAVE_CBLAS_H +template <> +inline float nrm2(const int N, const float* X, const int incX) { + return cblas_snrm2(N, X, incX); +} + +template <> +inline double nrm2(const int N, const double* X, const int incX) { + return cblas_dnrm2(N, X, incX); +} + +template <> +inline float nrm2(const int N, const Complex64* X, const int incX) { + return cblas_scnrm2(N, X, incX); +} + +template <> +inline double nrm2(const int N, const Complex128* X, const int incX) { + return cblas_dznrm2(N, X, incX); +} +#else +template <typename FloatDType> +static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) { + double absx = std::abs(xr); + if (scale < absx) { + double temp = scale / absx; + scale = absx; + ssq = 1.0 + ssq * (temp * temp); + } else { + double temp = absx / scale; + ssq += temp * temp; + } + + absx = std::abs(xi); + if (scale < absx) { + double temp = scale / absx; + scale = absx; + ssq = 1.0 + ssq * (temp * temp); + } else { + double temp = absx / scale; + ssq += temp * temp; + } +} + +template <> +float nrm2(const int N, const Complex64* X, const int incX) { + double scale = 0, ssq = 1, temp; + + if ((N < 1) || (incX < 1)) return 0.0; + + for (int i = 0; i < N; ++i) { + nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp); + } + + return scale * std::sqrt( ssq ); +} + +template <> +double nrm2(const int N, const Complex128* X, const int incX) { + double scale = 0, ssq = 1, temp; + + if ((N < 1) || (incX < 1)) return 0.0; + + for (int i = 0; i < N; ++i) { + nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp); + } + + return scale * std::sqrt( ssq ); +} +#endif + +template <typename ReturnDType, typename DType> +inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) { + *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX ); +} + +/* + * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values, + * the routine sums the absolute values of the real and imaginary components as well. + * + * So, based on input types, these are the valid return types: + * int -> int + * float -> float or double + * double -> double + * complex64 -> float or double + * complex128 -> double + * rational -> rational + */ +template <typename ReturnDType, typename DType> +inline ReturnDType asum(const int N, const DType* X, const int incX) { + ReturnDType sum = 0; + if ((N > 0) && (incX > 0)) { + for (int i = 0; i < N; ++i) { + sum += std::abs(X[i*incX]); + } + } + return sum; +} + + +#ifdef HAVE_CBLAS_H +template <> +inline float asum(const int N, const float* X, const int incX) { + return cblas_sasum(N, X, incX); +} + +template <> +inline double asum(const int N, const double* X, const int incX) { + return cblas_dasum(N, X, incX); +} + +template <> +inline float asum(const int N, const Complex64* X, const int incX) { + return cblas_scasum(N, X, incX); +} + +template <> +inline double asum(const int N, const Complex128* X, const int incX) { + return cblas_dzasum(N, X, incX); +} +#else +template <> +inline float asum(const int N, const Complex64* X, const int incX) { + float sum = 0; + if ((N > 0) && (incX > 0)) { + for (int i = 0; i < N; ++i) { + sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i); + } + } + return sum; +} + +template <> +inline double asum(const int N, const Complex128* X, const int incX) { + double sum = 0; + if ((N > 0) && (incX > 0)) { + for (int i = 0; i < N; ++i) { + sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i); + } + } + return sum; +} +#endif + + +template <typename ReturnDType, typename DType> +inline void cblas_asum(const int N, const void* X, const int incX, void* sum) { + *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX ); } template <bool is_complex, typename DType> inline void lauum(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo, const int N, DType* A, const int lda) {