vendor/scs/linsys/gpu/indirect/private.c in scs-0.2.2 vs vendor/scs/linsys/gpu/indirect/private.c in scs-0.2.3
- old (scs-0.2.2)
+ new (scs-0.2.3)
@@ -8,26 +8,43 @@
const scs_float *x, scs_float *y) {
scs_float *v_m = p->tmp_m;
scs_float *v_n = p->r;
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
+ SCS(_accum_by_atrans_gpu)(
+ p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
}
/* do not use within pcg, reuses memory */
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
scs_float *y) {
scs_float *v_m = p->tmp_m;
scs_float *v_n = p->r;
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
#if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ SCS(_accum_by_atrans_gpu)(
+ p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
#else
- SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ SCS(_accum_by_a_gpu)(
+ p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
#endif
+
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
}
char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
char *str = (char *)scs_malloc(sizeof(char) * 128);
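
Every multiply in the new version threads the shared scratch space (&p->buffer_size, &p->buffer) through the helper. A minimal sketch of the pattern such a helper presumably reduces to under the generic cuSPARSE API follows; accum_spmv is an illustrative name, not the actual SCS wrapper, and scs_float, SCS_CUDA_FLOAT, and SCS_NULL are the SCS definitions already used in this file:

#include <cuda_runtime.h>
#include <cusparse.h>

/* Sketch only: y += op(M) x with a caller-owned, grow-only scratch
 * buffer. alpha = beta = 1 gives the accumulate semantics. Since the
 * cusparseCreateCsr call later in this diff registers Ag with
 * dimensions (n, m), a NON_TRANSPOSE SpMV on Ag computes A' x. */
static void accum_spmv(cusparseSpMatDescr_t mat, cusparseDnVecDescr_t x,
                       cusparseDnVecDescr_t y, cusparseHandle_t handle,
                       size_t *buffer_size, void **buffer) {
  const scs_float onef = 1.0;
  size_t required = 0;
  /* Phase 1: query how much scratch space this SpMV needs. */
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef,
                          mat, x, &onef, y, SCS_CUDA_FLOAT,
                          CUSPARSE_MV_ALG_DEFAULT, &required);
  /* Grow-only reuse: cudaFree(SCS_NULL) is a no-op, so the first call
   * that needs space simply allocates. (Newer toolkits spell the alg
   * CUSPARSE_SPMV_ALG_DEFAULT.) */
  if (required > *buffer_size) {
    cudaFree(*buffer);
    cudaMalloc(buffer, required);
    *buffer_size = required;
  }
  /* Phase 2: the multiply itself, against the shared buffer. */
  cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, mat, x,
               &onef, y, SCS_CUDA_FLOAT, CUSPARSE_MV_ALG_DEFAULT, *buffer);
}
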
@@ -62,10 +79,15 @@
}
if (p->Agt) {
SCS(free_gpu_matrix)(p->Agt);
scs_free(p->Agt);
}
+ if (p->buffer != SCS_NULL) {
+ cudaFree(p->buffer);
+ }
+ cusparseDestroyDnVec(p->dn_vec_m);
+ cusparseDestroyDnVec(p->dn_vec_n);
cusparseDestroy(p->cusparse_handle);
cublasDestroy(p->cublas_handle);
/* Don't reset because it interferes with other GPU programs. */
/* cudaDeviceReset(); */
scs_free(p);
@@ -76,13 +98,34 @@
static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
ScsLinSysWork *p, const scs_float *x, scs_float *y) {
/* x and y MUST already be loaded to GPU */
scs_float *tmp_m = p->tmp_m; /* temp memory */
cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
- SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
+#if GPU_TRANSPOSE_MAT > 0
+ SCS(_accum_by_atrans_gpu)(
+ p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+#else
+ SCS(_accum_by_a_gpu)(
+ A, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+#endif
+
cudaMemset(y, 0, A->n * sizeof(scs_float));
- SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) y);
+ SCS(_accum_by_atrans_gpu)(
+ A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
}
/* M = inv ( diag ( RHO_X * I + A'A ) ) */
static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
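
The operator mat_vec applies is unchanged by the rewrite (the A x product can merely be routed through the explicit transpose Agt now): y = \rho_x x + A^\top A x. Its diagonal is exactly what get_preconditioner below inverts, per the M = inv(diag(RHO_X * I + A'A)) comment.
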
@@ -110,17 +153,28 @@
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
const ScsSettings *stgs) {
cudaError_t err;
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+ /* Used for initializing dense vectors */
+ scs_float *tmp_null_n = SCS_NULL;
+ scs_float *tmp_null_m = SCS_NULL;
+#if GPU_TRANSPOSE_MAT > 0
+ size_t new_buffer_size = 0;
+#endif
+
p->cublas_handle = 0;
p->cusparse_handle = 0;
p->total_solve_time = 0;
p->tot_cg_its = 0;
+ p->buffer_size = 0;
+ p->buffer = SCS_NULL;
+
/* Get handle to the CUBLAS context */
cublasCreate(&p->cublas_handle);
/* Get handle to the CUSPARSE context */
cusparseCreate(&p->cusparse_handle);
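
Zero-initializing p->buffer_size and p->buffer is what makes the grow-only reallocation safe: the first call that reports a nonzero requirement exceeds the zero bytes held and triggers the initial cudaMalloc, while a zero requirement leaves buffer at SCS_NULL, which is fine when no scratch space was requested. new_buffer_size is declared only under GPU_TRANSPOSE_MAT because only the csr2csc conversion below uses it.
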
@@ -128,13 +182,11 @@
Ag->n = A->n;
Ag->m = A->m;
Ag->Annz = A->p[A->n];
Ag->descr = 0;
/* Matrix description */
- cusparseCreateMatDescr(&Ag->descr);
- cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+
p->Ag = Ag;
p->Agt = SCS_NULL;
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
@@ -153,31 +205,66 @@
cudaMemcpyHostToDevice);
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
cudaMemcpyHostToDevice);
+ cusparseCreateCsr
+ (&Ag->descr, Ag->n, Ag->m, Ag->Annz, Ag->p, Ag->i, Ag->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ cudaMalloc((void **)&tmp_null_n, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&tmp_null_m, A->m * sizeof(scs_float));
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, tmp_null_n, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, tmp_null_m, SCS_CUDA_FLOAT);
+ cudaFree(tmp_null_n);
+ cudaFree(tmp_null_m);
+
get_preconditioner(A, stgs, p);
#if GPU_TRANSPOSE_MAT > 0
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
p->Agt->n = A->m;
p->Agt->m = A->n;
p->Agt->Annz = A->p[A->n];
p->Agt->descr = 0;
/* Matrix description */
- cusparseCreateMatDescr(&p->Agt->descr);
- cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
/* transpose Ag into Agt for faster multiplies */
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
- CUSPARSE(csr2csc)
- (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
- p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ cusparseCsr2cscEx2_bufferSize
+ (p->cusparse_handle, A->n, A->m, A->p[A->n],
+ Ag->x, Ag->p, Ag->i,
+ p->Agt->x, p->Agt->p, p->Agt->i,
+ SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
+ &new_buffer_size);
+
+ if (new_buffer_size > p->buffer_size) {
+ if (p->buffer != SCS_NULL) {
+ cudaFree(p->buffer);
+ }
+ cudaMalloc(&p->buffer, new_buffer_size);
+ p->buffer_size = new_buffer_size;
+ }
+
+ cusparseCsr2cscEx2
+ (p->cusparse_handle, A->n, A->m, A->p[A->n],
+ Ag->x, Ag->p, Ag->i,
+ p->Agt->x, p->Agt->p, p->Agt->i,
+ SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
+ p->buffer);
+
+ cusparseCreateCsr
+ (&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
+ p->Agt->p, p->Agt->i, p->Agt->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
#endif
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
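
Two idioms in this hunk are easy to misread. First, cusparseCreateDnVec is handed throwaway device arrays (tmp_null_n, tmp_null_m) that are freed immediately afterward, presumably because a dense-vector descriptor needs a valid pointer at creation time; this is safe only because every use site rebinds the descriptor via cusparseDnVecSetValues before calling into cuSPARSE. Second, the deprecated one-shot CUSPARSE(csr2csc) becomes a two-phase call: cusparseCsr2cscEx2_bufferSize reports the scratch requirement, the shared buffer grows if needed, and cusparseCsr2cscEx2 then performs the conversion in that buffer.
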
@@ -283,14 +370,35 @@
(iter < 0 ? CG_BEST_TOL
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
SCS(tic)(&linsys_timer);
/* all on GPU */
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
- SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
+ SCS(_accum_by_atrans_gpu)(
+ Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+
/* solves (I+A'A)x = b, s warm start, solution stored in b */
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
- SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
+ cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
+#if GPU_TRANSPOSE_MAT > 0
+ SCS(_accum_by_atrans_gpu)(
+ p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+#else
+ SCS(_accum_by_a_gpu)(
+ Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+ &p->buffer_size, &p->buffer
+ );
+#endif
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
if (iter >= 0) {
p->tot_cg_its += cg_its;
}
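
For orientation, the algebra these reshuffled calls implement is unchanged. With the stacked GPU vector bg = (b_x, b_y) and writing \rho_x for stgs->rho_x (the in-code comment abbreviates the system as (I+A'A)):

    x = (\rho_x I + A^\top A)^{-1} (b_x + A^\top b_y)
    y = A x - b_y

The first accumulate forms b_x + A^\top b_y in place, pcg solves for x, and the negate-then-accumulate pair produces y in the m-block of bg before the single copy back to the host.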