vendor/scs/linsys/gpu/indirect/private.c in scs-0.3.1 vs vendor/scs/linsys/gpu/indirect/private.c in scs-0.3.2
- old
+ new
@@ -33,67 +33,81 @@
p->tot_cg_its = 0;
return str;
}
*/
-/* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
-static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
+/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
+/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
+static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
scs_int i, k;
const ScsMatrix *A = p->A;
const ScsMatrix *P = p->P;
- scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+ scs_float *M = p->M;
#if VERBOSITY > 0
scs_printf("getting pre-conditioner\n");
#endif
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
for (i = 0; i < A->n; ++i) { /* cols */
- M[i] = p->rho_x;
- /* diag(A' R_y^{-1} A) */
+ /* M_ii = (R_x)_i */
+ M[i] = diag_r[i];
+ /* M_ii += a_i' (R_y)^-1 a_i */
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
/* A->i[k] is row of entry k with value A->x[k] */
- M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
}
if (P) {
for (k = P->p[i]; k < P->p[i + 1]; k++) {
/* diagonal element only */
if (P->i[k] == i) { /* row == col */
+ /* M_ii += P_ii */
M[i] += P->x[k];
break;
}
}
}
+ /* finally invert for pre-conditioner */
M[i] = 1. / M[i];
}
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- scs_free(M);
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
#if VERBOSITY > 0
scs_printf("finished getting pre-conditioner\n");
#endif
}
/* no need to update anything in this case */
-void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
+void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
scs_int i;
+
+ /* R_x to gpu */
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ /* 1/R_y to gpu */
for (i = 0; i < p->m; ++i)
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
cudaMemcpyHostToDevice);
- set_preconditioner(p, rho_y_vec);
+
+ /* set preconditioner M on gpu */
+ set_preconditioner(p, diag_r);
}
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
if (p) {
- scs_free(p->inv_rho_y_vec);
+ scs_free(p->M);
+ scs_free(p->inv_r_y);
cudaFree(p->p);
cudaFree(p->r);
cudaFree(p->Gp);
cudaFree(p->bg);
cudaFree(p->tmp_m);
cudaFree(p->z);
- cudaFree(p->M);
- cudaFree(p->inv_rho_y_vec_gpu);
+ cudaFree(p->M_gpu);
+ cudaFree(p->r_x_gpu);
+ cudaFree(p->inv_r_y_gpu);
if (p->Pg) {
SCS(free_gpu_matrix)(p->Pg);
scs_free(p->Pg);
}
if (p->Ag) {
@@ -124,26 +138,27 @@
CUBLAS(tbmv)
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
0, M, 1, z, 1);
}
-/* y = (rho_x * I + P + A' R_y^{-1} A) x */
+/* y = (R_x + P + A' R_y^{-1} A) x */
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
/* x and y MUST already be loaded to GPU */
scs_float *z = p->tmp_m; /* temp memory */
- cudaMemset(y, 0, p->n * sizeof(scs_float));
cudaMemset(z, 0, p->m * sizeof(scs_float));
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
- /* y = rho_x * x */
- CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
+  /* y = x (both x and y are device pointers per the precondition above,
+   * so this must be a device-to-device copy) */
+  cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ /* y = R_x * x */
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
if (p->Pg) {
- /* y = rho_x * x + Px */
+ /* y = R_x * x + P x */
SCS(accum_by_p_gpu)
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
&p->buffer);
}
@@ -156,13 +171,13 @@
SCS(accum_by_a_gpu)
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
&p->buffer);
#endif
/* z = R_y^{-1} A x */
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
- /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
SCS(accum_by_atrans_gpu)
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
&p->buffer);
}
@@ -199,23 +214,39 @@
SCS(cs_spfree)(P_tmp);
return P_full;
}
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
- scs_float *rho_y_vec, scs_float rho_x) {
+ const scs_float *diag_r) {
cudaError_t err;
- scs_int i;
csc *P_full;
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ ScsLinSysWork *p = SCS_NULL;
+ ScsGpuMatrix *Ag = SCS_NULL;
ScsGpuMatrix *Pg = SCS_NULL;
+ int device_count;
+ err = cudaGetDeviceCount(&device_count);
+ if (err > 0) {
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
+ return SCS_NULL;
+ }
+
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+ p->A = A;
+ p->P = P;
+ p->m = A->m;
+ p->n = A->n;
+
#if GPU_TRANSPOSE_MAT > 0
size_t new_buffer_size = 0;
#endif
- p->rho_x = rho_x;
p->cublas_handle = 0;
p->cusparse_handle = 0;
p->tot_cg_its = 0;
@@ -240,25 +271,20 @@
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
- cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
cudaMemcpyHostToDevice);
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
cudaMemcpyHostToDevice);
- p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
- for (i = 0; i < A->m; ++i)
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
- cudaMemcpyHostToDevice);
-
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
if (P) {
@@ -295,11 +321,12 @@
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
- set_preconditioner(p, rho_y_vec);
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
+ SCS(update_lin_sys_diag_r)(p, diag_r);
#if GPU_TRANSPOSE_MAT > 0
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
p->Agt->n = A->m;
p->Agt->m = A->n;
@@ -344,13 +371,12 @@
return SCS_NULL;
}
return p;
}
-/* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
- * b */
-/* on GPU */
+/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
+ * b, on GPU */
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
scs_int max_its, scs_float tol) {
scs_int i, n = pr->n;
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
scs_float onef = 1.0, neg_onef = -1.0;
@@ -384,11 +410,11 @@
return 0;
}
/* z = M r */
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- scale_by_diag(cublas_handle, pr->M, z, n);
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
/* ztr = z'r */
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
/* p = z */
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
@@ -413,11 +439,11 @@
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
return i + 1;
}
/* z = M r */
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- scale_by_diag(cublas_handle, pr->M, z, n);
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
ztr_prev = ztr;
/* ztr = z'r */
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
beta = ztr / ztr_prev;
/* p = beta * p, where beta = ztr / ztr_prev */
@@ -429,18 +455,16 @@
}
/* solves Mx = b, for x but stores result in b */
/* s contains warm-start (if available) */
/*
- * [x] = [rho_x I + P A' ]^{-1} [rx]
+ * [x] = [R_x + P A' ]^{-1} [rx]
* [y] [ A -R_y ] [ry]
*
- * R_y = diag(rho_y_vec)
- *
* becomes:
*
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
* y = R_y^{-1} (Ax - ry)
*
*/
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
scs_float tol) {
@@ -449,11 +473,10 @@
/* these are on GPU */
scs_float *bg = p->bg;
scs_float *tmp_m = p->tmp_m;
ScsGpuMatrix *Ag = p->Ag;
- ScsGpuMatrix *Pg = p->Pg;
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
return 0;
}
@@ -469,11 +492,11 @@
cudaMemcpyHostToDevice);
/* tmp = ry */
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
cudaMemcpyDeviceToDevice);
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
/* bg[:n] = rx + A' R ry */
SCS(accum_by_atrans_gpu)
@@ -481,11 +504,11 @@
&p->buffer);
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
max_iters = 10 * Ag->n;
- /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
* in bg */
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
/* bg[n:] = -ry */
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
@@ -502,10 +525,10 @@
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
&p->buffer);
#endif
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
/* copy bg = [x; y] back to b */
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
cudaMemcpyDeviceToHost);
p->tot_cg_its += cg_its;