vendor/scs/linsys/gpu/indirect/private.c in scs-0.2.3 vs vendor/scs/linsys/gpu/indirect/private.c in scs-0.3.0
- old
+ new
@@ -1,80 +1,103 @@
#include "private.h"
+#include "linsys.h"
-#define CG_BEST_TOL 1e-9
-#define CG_MIN_TOL 1e-1
+/* norm to use when deciding convergence */
+/* should be consistent with CG_NORM in glbopts.h */
+#define USE_L2_NORM (0)
-/* do not use within pcg, reuses memory */
-void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
- const scs_float *x, scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-
- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
- SCS(_accum_by_atrans_gpu)(
- p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
-
- cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
-}
-
-/* do not use within pcg, reuses memory */
-void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
- scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
-
- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
-#if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
+ scs_int n) {
+#if USE_L2_NORM > 0
+ scs_float nrm;
+ CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
#else
- SCS(_accum_by_a_gpu)(
- p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ scs_int idx;
+ scs_float nrm;
+ CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
+ /* NOTE: we take idx - 1 here since the routine above returns 1-based (Fortran) indices */
+ cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
+ nrm = ABS(nrm);
#endif
-
- cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ return nrm;
}
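/*
 * The convergence test uses either the l2-norm or the max-norm of the
 * residual, matching CG_NORM in glbopts.h. A minimal host-side sketch of the
 * two choices (illustrative, not SCS code; double stands in for scs_float):
 */
#include <math.h>

static double cg_norm_host(const double *r, int n, int use_l2) {
  int i;
  double nrm = 0.0;
  if (use_l2) { /* ||r||_2 */
    for (i = 0; i < n; ++i)
      nrm += r[i] * r[i];
    return sqrt(nrm);
  }
  for (i = 0; i < n; ++i) /* ||r||_inf, the entry CUBLASI(amax) locates above */
    if (fabs(r[i]) > nrm)
      nrm = fabs(r[i]);
  return nrm;
}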
-char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
- char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
- (long)A->p[A->n], stgs->cg_rate);
- return str;
+const char *SCS(get_lin_sys_method)() {
+ return "sparse-indirect GPU";
}
+/*
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str,
- "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
- (scs_float)p->tot_cg_its / (info->iter + 1),
- p->total_solve_time / (info->iter + 1) / 1e3);
+ sprintf(str, "lin-sys: avg cg its: %2.2f\n",
+ (scs_float)p->tot_cg_its / (info->iter + 1));
p->tot_cg_its = 0;
- p->total_solve_time = 0;
return str;
}
+*/
+/* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
+static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
+ scs_int i, k;
+ const ScsMatrix *A = p->A;
+ const ScsMatrix *P = p->P;
+ scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+#if VERBOSITY > 0
+ scs_printf("getting pre-conditioner\n");
+#endif
+
+ for (i = 0; i < A->n; ++i) { /* cols */
+ M[i] = p->rho_x;
+ /* diag(A' R_y^{-1} A) */
+ for (k = A->p[i]; k < A->p[i + 1]; ++k) {
+ /* A->i[k] is row of entry k with value A->x[k] */
+ M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
+ }
+ if (P) {
+ for (k = P->p[i]; k < P->p[i + 1]; k++) {
+ /* diagonal element only */
+ if (P->i[k] == i) { /* row == col */
+ M[i] += P->x[k];
+ break;
+ }
+ }
+ }
+ M[i] = 1. / M[i];
+ }
+ cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+ scs_free(M);
+#if VERBOSITY > 0
+ scs_printf("finished getting pre-conditioner\n");
+#endif
+}
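/*
 * Worked example of the preconditioner diagonal on a tiny instance
 * (illustrative, not SCS code; double stands in for scs_float). For
 * A = [1 0; 2 3] in CSC form, rho_x = 1e-6, rho_y_vec = [1, 1] and no P,
 * M[0] = 1/(rho_x + 1 + 4) ~= 0.2 and M[1] = 1/(rho_x + 9) ~= 0.111.
 */
#include <stdio.h>

int main(void) {
  const int n = 2;
  const int Ap[] = {0, 2, 3};          /* column pointers */
  const int Ai[] = {0, 1, 1};          /* row indices */
  const double Ax[] = {1.0, 2.0, 3.0}; /* nonzero values */
  const double rho_x = 1e-6;
  const double rho_y_vec[] = {1.0, 1.0};
  double M[2];
  int i, k;
  for (i = 0; i < n; ++i) { /* same loop structure as set_preconditioner */
    M[i] = rho_x;
    for (k = Ap[i]; k < Ap[i + 1]; ++k)
      M[i] += Ax[k] * Ax[k] / rho_y_vec[Ai[k]];
    M[i] = 1. / M[i];
  }
  printf("M = [%g, %g]\n", M[0], M[1]);
  return 0;
}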
+
+/* refresh the cached R_y^{-1} (host and GPU copies) and the preconditioner */
+void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
+ scs_int i;
+ for (i = 0; i < p->m; ++i)
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+ set_preconditioner(p, rho_y_vec);
+}
+
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
if (p) {
+ scs_free(p->inv_rho_y_vec);
cudaFree(p->p);
cudaFree(p->r);
cudaFree(p->Gp);
cudaFree(p->bg);
cudaFree(p->tmp_m);
cudaFree(p->z);
cudaFree(p->M);
+ cudaFree(p->inv_rho_y_vec_gpu);
+ if (p->Pg) {
+ SCS(free_gpu_matrix)(p->Pg);
+ scs_free(p->Pg);
+ }
if (p->Ag) {
SCS(free_gpu_matrix)(p->Ag);
scs_free(p->Ag);
}
if (p->Agt) {
@@ -84,92 +107,118 @@
if (p->buffer != SCS_NULL) {
cudaFree(p->buffer);
}
cusparseDestroyDnVec(p->dn_vec_m);
cusparseDestroyDnVec(p->dn_vec_n);
+ cusparseDestroyDnVec(p->dn_vec_n_p);
cusparseDestroy(p->cusparse_handle);
cublasDestroy(p->cublas_handle);
/* Don't reset because it interferes with other GPU programs. */
/* cudaDeviceReset(); */
scs_free(p);
}
}
-/*y = (RHO_X * I + A'A)x */
-static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
- ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+/* z = M * z elementwise in place, assumes M, z on GPU */
+static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
+ scs_float *z, scs_int n) {
+ CUBLAS(tbmv)
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+ 0, M, 1, z, 1);
+}
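/*
 * CUBLAS(tbmv) with zero super-diagonals (k = 0) treats M as a diagonal
 * matrix stored as a length-n band, so the call reduces to z[i] *= M[i].
 * Host-side equivalent (illustrative, not SCS code):
 */
static void scale_by_diag_host(const double *M, double *z, int n) {
  int i;
  for (i = 0; i < n; ++i)
    z[i] *= M[i]; /* elementwise diagonal scaling, in place */
}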
+
+/* y = (rho_x * I + P + A' R_y^{-1} A) x */
+static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
/* x and y MUST already be loaded to GPU */
- scs_float *tmp_m = p->tmp_m; /* temp memory */
- cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+ scs_float *z = p->tmp_m; /* temp memory */
+ cudaMemset(y, 0, p->n * sizeof(scs_float));
+ cudaMemset(z, 0, p->m * sizeof(scs_float));
- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
+ cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
+
+ /* y = rho_x * x */
+ CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
+
+ if (p->Pg) {
+ /* y = rho_x * x + Px */
+ SCS(accum_by_p_gpu)
+ (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ }
+
+ /* z = Ax */
#if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_atrans_gpu)
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
#else
- SCS(_accum_by_a_gpu)(
- A, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_a_gpu)
+ (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
#endif
+ /* z = R_y^{-1} A x */
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
- cudaMemset(y, 0, A->n * sizeof(scs_float));
-
- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) y);
- SCS(_accum_by_atrans_gpu)(
- A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
-
- CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
+ SCS(accum_by_atrans_gpu)
+ (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
}
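/*
 * Host-side reference for the same operator, handy for checking the GPU path
 * on small problems (illustrative, not SCS code; double stands in for
 * scs_float). A is m x n and P is the full symmetric n x n matrix, both in
 * CSC form; inv_rho_y holds the diagonal of R_y^{-1}.
 */
static void mat_vec_host(const int *Ap, const int *Ai, const double *Ax,
                         int m, int n, const int *Pp, const int *Pi,
                         const double *Px, const double *inv_rho_y,
                         double rho_x, const double *x, double *y,
                         double *z /* workspace, length m */) {
  int i, k;
  for (i = 0; i < m; ++i)
    z[i] = 0.;
  for (i = 0; i < n; ++i) /* y = rho_x * x */
    y[i] = rho_x * x[i];
  if (Pp) /* y += P x */
    for (i = 0; i < n; ++i)
      for (k = Pp[i]; k < Pp[i + 1]; ++k)
        y[Pi[k]] += Px[k] * x[i];
  for (i = 0; i < n; ++i) /* z = A x */
    for (k = Ap[i]; k < Ap[i + 1]; ++k)
      z[Ai[k]] += Ax[k] * x[i];
  for (i = 0; i < m; ++i) /* z = R_y^{-1} A x */
    z[i] *= inv_rho_y[i];
  for (i = 0; i < n; ++i) /* y += A' z */
    for (k = Ap[i]; k < Ap[i + 1]; ++k)
      y[i] += Ax[k] * z[Ai[k]];
}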
-/* M = inv ( diag ( RHO_X * I + A'A ) ) */
-static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *p) {
- scs_int i;
- scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
-
-#if EXTRA_VERBOSE > 0
- scs_printf("getting pre-conditioner\n");
-#endif
-
- for (i = 0; i < A->n; ++i) {
- M[i] = 1 / (stgs->rho_x +
- SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
- /* M[i] = 1; */
+/* P comes in upper-triangular form; expand it to the full symmetric matrix.
+ * First build a triplet version of the full matrix, then compress to CSC.
+ */
+static csc *fill_p_matrix(const ScsMatrix *P) {
+ scs_int i, j, k, kk;
+ scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
+ csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
+ csc *P_full;
+ kk = 0;
+ for (j = 0; j < P->n; j++) { /* cols */
+ for (k = P->p[j]; k < P->p[j + 1]; k++) {
+ i = P->i[k]; /* row */
+ if (i > j) { /* skip entries below the diagonal (P is upper triangular) */
+ break;
+ }
+ P_tmp->i[kk] = i;
+ P_tmp->p[kk] = j;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ if (i == j) { /* diagonal */
+ continue;
+ }
+ P_tmp->i[kk] = j;
+ P_tmp->p[kk] = i;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ }
}
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- scs_free(M);
-
-#if EXTRA_VERBOSE > 0
- scs_printf("finished getting pre-conditioner\n");
-#endif
+ P_tmp->nz = kk; /* set number of nonzeros */
+ P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
+ SCS(cs_spfree)(P_tmp);
+ return P_full;
}
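/*
 * Tiny worked example of the expansion (illustrative, not SCS code):
 * P = [4 1; 1 5] stored upper-triangular as triplets (0,0,4), (0,1,1),
 * (1,1,5) becomes the full symmetric set by mirroring the strict upper
 * entry, while diagonal entries are emitted once.
 */
#include <stdio.h>

int main(void) {
  const int n = 2;
  const int Pp[] = {0, 1, 3};          /* column pointers */
  const int Pi[] = {0, 0, 1};          /* row indices (upper triangular) */
  const double Px[] = {4.0, 1.0, 5.0};
  int Ti[6], Tj[6];                    /* triplet output, 2 * nnz bound */
  double Tx[6];
  int i, j, k, kk = 0;
  for (j = 0; j < n; j++) {
    for (k = Pp[j]; k < Pp[j + 1]; k++) {
      i = Pi[k];
      Ti[kk] = i; Tj[kk] = j; Tx[kk] = Px[k]; kk++;
      if (i != j) { /* mirror strict upper entries into the lower triangle */
        Ti[kk] = j; Tj[kk] = i; Tx[kk] = Px[k]; kk++;
      }
    }
  }
  for (k = 0; k < kk; k++) /* prints (0,0)=4 (0,1)=1 (1,0)=1 (1,1)=5 */
    printf("(%d,%d) = %g\n", Ti[k], Tj[k], Tx[k]);
  return 0;
}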
-ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
- const ScsSettings *stgs) {
+ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
+ scs_float *rho_y_vec, scs_float rho_x) {
cudaError_t err;
+ scs_int i;
+ csc *P_full;
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
-
- /* Used for initializing dense vectors */
- scs_float *tmp_null_n = SCS_NULL;
- scs_float *tmp_null_m = SCS_NULL;
+ ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ ScsGpuMatrix *Pg = SCS_NULL;
#if GPU_TRANSPOSE_MAT > 0
size_t new_buffer_size = 0;
#endif
+ p->rho_x = rho_x;
p->cublas_handle = 0;
p->cusparse_handle = 0;
- p->total_solve_time = 0;
p->tot_cg_its = 0;
p->buffer_size = 0;
p->buffer = SCS_NULL;
@@ -179,234 +228,291 @@
/* Get handle to the CUSPARSE context */
cusparseCreate(&p->cusparse_handle);
Ag->n = A->n;
Ag->m = A->m;
- Ag->Annz = A->p[A->n];
+ Ag->nnz = A->p[A->n];
Ag->descr = 0;
- /* Matrix description */
-
- p->Ag = Ag;
- p->Agt = SCS_NULL;
-
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
- cudaMalloc((void **)&p->tmp_m,
- A->m * sizeof(scs_float)); /* intermediate result */
+ cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
cudaMemcpyHostToDevice);
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
cudaMemcpyHostToDevice);
- cusparseCreateCsr
- (&Ag->descr, Ag->n, Ag->m, Ag->Annz, Ag->p, Ag->i, Ag->x,
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+ p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
+ for (i = 0; i < A->m; ++i)
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
- cudaMalloc((void **)&tmp_null_n, A->n * sizeof(scs_float));
- cudaMalloc((void **)&tmp_null_m, A->m * sizeof(scs_float));
- cusparseCreateDnVec(&p->dn_vec_n, Ag->n, tmp_null_n, SCS_CUDA_FLOAT);
- cusparseCreateDnVec(&p->dn_vec_m, Ag->m, tmp_null_m, SCS_CUDA_FLOAT);
- cudaFree(tmp_null_n);
- cudaFree(tmp_null_m);
+ cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
- get_preconditioner(A, stgs, p);
+ if (P) {
+ Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ P_full = fill_p_matrix(P);
+ Pg->n = P_full->n;
+ Pg->m = P_full->m;
+ Pg->nnz = P_full->p[P_full->n];
+ Pg->descr = 0;
+ cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
+ cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ SCS(cs_spfree)(P_full);
+ } else {
+ Pg = SCS_NULL;
+ }
+
+ p->Ag = Ag;
+ p->Pg = Pg;
+ p->Agt = SCS_NULL;
+
+ /* initialize with tmp_m; the pointer is always reset via
+  * cusparseDnVecSetValues before each use, so the initial value is irrelevant */
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
+
+ set_preconditioner(p, rho_y_vec);
+
#if GPU_TRANSPOSE_MAT > 0
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
p->Agt->n = A->m;
p->Agt->m = A->n;
- p->Agt->Annz = A->p[A->n];
+ p->Agt->nnz = A->p[A->n];
p->Agt->descr = 0;
/* Matrix description */
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
/* transpose Ag into Agt for faster multiplies */
/* TODO: memory intensive, could perform transpose on the CPU and copy to GPU */
- cusparseCsr2cscEx2_bufferSize
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
- Ag->x, Ag->p, Ag->i,
- p->Agt->x, p->Agt->p, p->Agt->i,
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
- &new_buffer_size);
+ cusparseCsr2cscEx2_bufferSize(
+ p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
+ p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
if (new_buffer_size > p->buffer_size) {
if (p->buffer != SCS_NULL) {
cudaFree(p->buffer);
}
cudaMalloc(&p->buffer, new_buffer_size);
p->buffer_size = new_buffer_size;
}
- cusparseCsr2cscEx2
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
- Ag->x, Ag->p, Ag->i,
- p->Agt->x, p->Agt->p, p->Agt->i,
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
- p->buffer);
+ cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
+ Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CSR2CSC_ALG, p->buffer);
- cusparseCreateCsr
- (&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
- p->Agt->p, p->Agt->i, p->Agt->x,
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+ cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
+ p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
+ SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CUDA_FLOAT);
#endif
err = cudaGetLastError();
if (err != cudaSuccess) {
- printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+ printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
cudaGetErrorString(err));
SCS(free_lin_sys_work)(p);
return SCS_NULL;
}
return p;
}
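/*
 * The buffer handling above follows the usual two-phase cuSPARSE pattern:
 * query the required workspace, grow the shared buffer only if it is too
 * small, then run the operation. A generic sketch of the idiom (illustrative,
 * not SCS code; the query/run callbacks are placeholders):
 */
#include <stddef.h>
#include <cuda_runtime.h>

static void grow_buffer_and_run(void **buffer, size_t *buffer_size,
                                void (*query)(size_t *),
                                void (*run)(void *)) {
  size_t new_buffer_size = 0;
  query(&new_buffer_size); /* phase 1: ask how much workspace is needed */
  if (new_buffer_size > *buffer_size) {
    if (*buffer != NULL)
      cudaFree(*buffer); /* release the old, too-small workspace */
    cudaMalloc(buffer, new_buffer_size);
    *buffer_size = new_buffer_size;
  }
  run(*buffer); /* phase 2: execute with a large-enough workspace */
}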
-static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
- scs_float *z, scs_float *r, scs_int n) {
- cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- CUBLAS(tbmv)
- (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
- 0, M, 1, z, 1);
-}
-
-/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
-static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+/* solves (rho_x * I + P + A' R_y^{-1} A) x = b on the GPU; s is an optional
+ * warm start, and the solution is stored in bg */
+static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
scs_int max_its, scs_float tol) {
- scs_int i, n = A->n;
- scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+ scs_int i, n = pr->n;
+ scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
scs_float onef = 1.0, neg_onef = -1.0;
scs_float *p = pr->p; /* cg direction */
scs_float *Gp = pr->Gp; /* updated CG direction */
scs_float *r = pr->r; /* cg residual */
scs_float *z = pr->z; /* preconditioned */
- scs_float *M = pr->M; /* preconditioner */
cublasHandle_t cublas_handle = pr->cublas_handle;
- if (s == SCS_NULL) {
+ if (!s) {
+ /* take s = 0 */
+ /* r = b */
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ /* b = 0 */
cudaMemset(bg, 0, n * sizeof(scs_float));
} else {
/* p contains bg temporarily */
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- /* bg contains s */
+ /* bg = s */
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
- mat_vec(A, stgs, pr, bg, r);
+ /* r = Mat * s */
+ mat_vec(pr, bg, r);
+ /* r = Mat * s - b */
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+ /* r = b - Mat * s */
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
}
- /* for some reason nrm2 is VERY slow */
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
- nrm_r = SQRTF(nrm_r);
/* check to see if we need to run CG at all */
- if (nrm_r < MIN(tol, 1e-18)) {
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
return 0;
}
- apply_pre_conditioner(cublas_handle, M, z, r, n);
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
- /* put z in p, replacing temp mem */
+ /* z = M r */
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ scale_by_diag(cublas_handle, pr->M, z, n);
+ /* ztr = z'r */
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+ /* p = z */
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
for (i = 0; i < max_its; ++i) {
- mat_vec(A, stgs, pr, p, Gp);
-
- CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
-
- alpha = ipzr / p_gp;
+ /* Gp = Mat * p */
+ mat_vec(pr, p, Gp);
+ /* ptGp = p'Gp */
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
+ /* alpha = z'r / p'G p */
+ alpha = ztr / ptGp;
neg_alpha = -alpha;
-
+ /* b += alpha * p */
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+ /* r -= alpha * G p */
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
- /* for some reason nrm2 is VERY slow */
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
- nrm_r = SQRTF(nrm_r);
- if (nrm_r < tol) {
- i++;
- break;
- }
- ipzr_old = ipzr;
- apply_pre_conditioner(cublas_handle, M, z, r, n);
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+#if VERBOSITY > 3
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
+ cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
+#endif
- beta = ipzr / ipzr_old;
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
+ return i + 1;
+ }
+ /* z = M r */
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ scale_by_diag(cublas_handle, pr->M, z, n);
+ ztr_prev = ztr;
+ /* ztr = z'r */
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+ beta = ztr / ztr_prev;
+ /* p = beta * p, where beta = ztr / ztr_prev */
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+ /* p = z + beta * p */
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
}
-#if EXTRA_VERBOSE > 0
- scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
-#endif
return i;
}
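/*
 * For reference, a minimal host-side PCG with a Jacobi (diagonal)
 * preconditioner on a dense SPD matrix G, using the same recurrences and
 * max-norm stopping rule as above (illustrative, not SCS code; double stands
 * in for scs_float):
 */
#include <math.h>
#include <string.h>

static int pcg_host(int n, const double *G /* row-major SPD */,
                    const double *Minv /* Minv[i] = 1 / G[i][i] */,
                    double *b /* in: rhs, out: solution */, int max_its,
                    double tol, double *p, double *r, double *z,
                    double *Gp /* four length-n workspaces */) {
  int i, j, it;
  double ztr, ztr_prev, alpha, ptGp, beta, nrm;
  memcpy(r, b, n * sizeof(double)); /* x0 = 0, so r = b */
  memset(b, 0, n * sizeof(double));
  for (i = 0; i < n; ++i)
    p[i] = z[i] = Minv[i] * r[i]; /* z = M r, p = z */
  ztr = 0.;
  for (i = 0; i < n; ++i)
    ztr += z[i] * r[i];
  for (it = 0; it < max_its; ++it) {
    ptGp = 0.;
    for (i = 0; i < n; ++i) { /* Gp = G p, ptGp = p'Gp */
      Gp[i] = 0.;
      for (j = 0; j < n; ++j)
        Gp[i] += G[i * n + j] * p[j];
      ptGp += p[i] * Gp[i];
    }
    alpha = ztr / ptGp;
    nrm = 0.;
    for (i = 0; i < n; ++i) {
      b[i] += alpha * p[i];  /* b += alpha * p */
      r[i] -= alpha * Gp[i]; /* r -= alpha * Gp */
      if (fabs(r[i]) > nrm)
        nrm = fabs(r[i]);
    }
    if (nrm < tol)
      return it + 1;
    ztr_prev = ztr;
    ztr = 0.;
    for (i = 0; i < n; ++i) { /* z = M r, ztr = z'r */
      z[i] = Minv[i] * r[i];
      ztr += z[i] * r[i];
    }
    beta = ztr / ztr_prev;
    for (i = 0; i < n; ++i) /* p = z + beta * p */
      p[i] = z[i] + beta * p[i];
  }
  return max_its;
}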
-scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *p, scs_float *b, const scs_float *s,
- scs_int iter) {
- scs_int cg_its;
- SCS(timer) linsys_timer;
- scs_float *bg = p->bg;
+/* solves Mx = b for x, storing the result in b */
+/* s contains a warm start (if available) */
+/*
+ * [x] = [rho_x I + P A' ]^{-1} [rx]
+ * [y] [ A -R_y ] [ry]
+ *
+ * R_y = diag(rho_y_vec)
+ *
+ * becomes:
+ *
+ * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
+ * y = R_y^{-1} (Ax - ry)
+ *
+ */
+scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
+ scs_float tol) {
+ scs_int cg_its, max_iters;
scs_float neg_onef = -1.0;
+
+ /* these are on GPU */
+ scs_float *bg = p->bg;
+ scs_float *tmp_m = p->tmp_m;
ScsGpuMatrix *Ag = p->Ag;
- scs_float cg_tol =
- SCS(norm)(b, Ag->n) *
- (iter < 0 ? CG_BEST_TOL
- : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
- SCS(tic)(&linsys_timer);
- /* all on GPU */
- cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
+ ScsGpuMatrix *Pg = p->Pg;
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
- SCS(_accum_by_atrans_gpu)(
- Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ if (CG_NORM(b, p->n + p->m) <= 1e-12) {
+ memset(b, 0, (p->n + p->m) * sizeof(scs_float));
+ return 0;
+ }
- /* solves (I+A'A)x = b, s warm start, solution stored in b */
- cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
+ if (tol <= 0.) {
+ scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
+ "INDIRECT flag.\n",
+ tol);
+ }
+
+ /* bg = b = [rx; ry] */
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+ /* tmp = ry */
+ cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
+ cudaMemcpyDeviceToDevice);
+ /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R_y^{-1} ry */
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg);    /* rx */
+ /* bg[:n] = rx + A' R_y^{-1} ry */
+ SCS(accum_by_atrans_gpu)
+ (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+
+ /* set max_iters to 10 * n (in exact arithmetic n iterations suffice) */
+ max_iters = 10 * Ag->n;
+
+ /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
+ * in bg */
+ cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
+
+ /* bg[n:] = -ry */
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
+ /* b[n:] = Ax - ry */
#if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_atrans_gpu)
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
#else
- SCS(_accum_by_a_gpu)(
- Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_a_gpu)
+ (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
#endif
- cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
- if (iter >= 0) {
- p->tot_cg_its += cg_its;
- }
-
- p->total_solve_time += SCS(tocq)(&linsys_timer);
-#if EXTRAVERBOSE > 0
- scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ /* copy bg = [x; y] back to b */
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
+ cudaMemcpyDeviceToHost);
+ p->tot_cg_its += cg_its;
+#if VERBOSITY > 1
+ scs_printf("tol %.3e\n", tol);
+ scs_printf("cg_its %i\n", (int)cg_its);
#endif
return 0;
}
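/*
 * Scalar (n = m = 1) check of the block elimination used above (illustrative,
 * not SCS code): solve [rho_x + P, A'; A, -R_y][x; y] = [rx; ry] via the
 * reduced formulas and confirm the residuals of the 2x2 system vanish.
 */
#include <stdio.h>

int main(void) {
  const double rho_x = 1e-6, P = 2.0, A = 3.0, R = 0.1; /* R = rho_y_vec[0] */
  const double rx = 1.0, ry = -2.0;
  /* x = (rho_x + P + A' R^{-1} A)^{-1} (rx + A' R^{-1} ry) */
  const double x = (rx + A * ry / R) / (rho_x + P + A * A / R);
  /* y = R^{-1} (A x - ry) */
  const double y = (A * x - ry) / R;
  /* residuals of the original 2x2 system; both should print ~0 */
  printf("res1 = %g\n", (rho_x + P) * x + A * y - rx);
  printf("res2 = %g\n", A * x - R * y - ry);
  return 0;
}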