vendor/scs/linsys/gpu/indirect/private.c in scs-0.2.2 vs vendor/scs/linsys/gpu/indirect/private.c in scs-0.2.3

- old
+ new
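
Summary of the change: scs-0.2.3 ports this file from the legacy cuSPARSE API (cusparseCreateMatDescr / cusparseSetMatType / CUSPARSE(csr2csc)) to the generic API introduced in CUDA 10.1 (cusparseCreateCsr, cusparseCreateDnVec, cusparseCsr2cscEx2). Generic-API calls take a caller-managed workspace, so ScsLinSysWork gains buffer / buffer_size fields (grown on demand, freed at teardown) plus a pair of reusable dense-vector descriptors, dn_vec_m / dn_vec_n, that are rebound with cusparseDnVecSetValues before every multiply.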

@@ -8,26 +8,43 @@ const scs_float *x, scs_float *y) {
   scs_float *v_m = p->tmp_m;
   scs_float *v_n = p->r;
   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-  SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
+  SCS(_accum_by_atrans_gpu)(
+      p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+
   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
 }
 
 /* do not use within pcg, reuses memory */
 void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
                      scs_float *y) {
   scs_float *v_m = p->tmp_m;
   scs_float *v_n = p->r;
   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
 #if GPU_TRANSPOSE_MAT > 0
-  SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+  SCS(_accum_by_atrans_gpu)(
+      p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
 #else
-  SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+  SCS(_accum_by_a_gpu)(
+      p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
 #endif
+
   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
 }
 
 char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
   char *str = (char *)scs_malloc(sizeof(char) * 128);
@@ -62,10 +79,15 @@
   }
   if (p->Agt) {
     SCS(free_gpu_matrix)(p->Agt);
    scs_free(p->Agt);
   }
+  if (p->buffer != SCS_NULL) {
+    cudaFree(p->buffer);
+  }
+  cusparseDestroyDnVec(p->dn_vec_m);
+  cusparseDestroyDnVec(p->dn_vec_n);
   cusparseDestroy(p->cusparse_handle);
   cublasDestroy(p->cublas_handle);
   /* Don't reset because it interferes with other GPU programs. */
   /* cudaDeviceReset(); */
   scs_free(p);
@@ -76,13 +98,34 @@
 static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
   /* x and y MUST already be loaded to GPU */
   scs_float *tmp_m = p->tmp_m; /* temp memory */
   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
-  SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
+#if GPU_TRANSPOSE_MAT > 0
+  SCS(_accum_by_atrans_gpu)(
+      p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+#else
+  SCS(_accum_by_a_gpu)(
+      A, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+#endif
+
   cudaMemset(y, 0, A->n * sizeof(scs_float));
-  SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) y);
+  SCS(_accum_by_atrans_gpu)(
+      A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+
   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
 }
 
 /* M = inv ( diag ( RHO_X * I + A'A ) ) */
 static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
@@ -110,17 +153,28 @@
 ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
                                       const ScsSettings *stgs) {
   cudaError_t err;
   ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+  /* Used for initializing dense vectors */
+  scs_float *tmp_null_n = SCS_NULL;
+  scs_float *tmp_null_m = SCS_NULL;
+#if GPU_TRANSPOSE_MAT > 0
+  size_t new_buffer_size = 0;
+#endif
+
   p->cublas_handle = 0;
   p->cusparse_handle = 0;
   p->total_solve_time = 0;
   p->tot_cg_its = 0;
+  p->buffer_size = 0;
+  p->buffer = SCS_NULL;
+
   /* Get handle to the CUBLAS context */
   cublasCreate(&p->cublas_handle);
   /* Get handle to the CUSPARSE context */
   cusparseCreate(&p->cusparse_handle);
@@ -128,13 +182,11 @@
   Ag->n = A->n;
   Ag->m = A->m;
   Ag->Annz = A->p[A->n];
   Ag->descr = 0;
   /* Matrix description */
-  cusparseCreateMatDescr(&Ag->descr);
-  cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+
   p->Ag = Ag;
   p->Agt = SCS_NULL;
   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
@@ -153,31 +205,66 @@
              cudaMemcpyHostToDevice);
   cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int),
              cudaMemcpyHostToDevice);
   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
              cudaMemcpyHostToDevice);
+  cusparseCreateCsr
+  (&Ag->descr, Ag->n, Ag->m, Ag->Annz, Ag->p, Ag->i, Ag->x,
+   SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+   CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+  cudaMalloc((void **)&tmp_null_n, A->n * sizeof(scs_float));
+  cudaMalloc((void **)&tmp_null_m, A->m * sizeof(scs_float));
+  cusparseCreateDnVec(&p->dn_vec_n, Ag->n, tmp_null_n, SCS_CUDA_FLOAT);
+  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, tmp_null_m, SCS_CUDA_FLOAT);
+  cudaFree(tmp_null_n);
+  cudaFree(tmp_null_m);
+
   get_preconditioner(A, stgs, p);
 #if GPU_TRANSPOSE_MAT > 0
   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
   p->Agt->n = A->m;
   p->Agt->m = A->n;
   p->Agt->Annz = A->p[A->n];
   p->Agt->descr = 0;
   /* Matrix description */
-  cusparseCreateMatDescr(&p->Agt->descr);
-  cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
   /* transpose Ag into Agt for faster multiplies */
   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
-  CUSPARSE(csr2csc)
-  (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
-   p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+  cusparseCsr2cscEx2_bufferSize
+  (p->cusparse_handle, A->n, A->m, A->p[A->n],
+   Ag->x, Ag->p, Ag->i,
+   p->Agt->x, p->Agt->p, p->Agt->i,
+   SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+   CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
+   &new_buffer_size);
+
+  if (new_buffer_size > p->buffer_size) {
+    if (p->buffer != SCS_NULL) {
+      cudaFree(p->buffer);
+    }
+    cudaMalloc(&p->buffer, new_buffer_size);
+    p->buffer_size = new_buffer_size;
+  }
+
+  cusparseCsr2cscEx2
+  (p->cusparse_handle, A->n, A->m, A->p[A->n],
+   Ag->x, Ag->p, Ag->i,
+   p->Agt->x, p->Agt->p, p->Agt->i,
+   SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+   CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
+   p->buffer);
+
+  cusparseCreateCsr
+  (&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
+   p->Agt->p, p->Agt->i, p->Agt->x,
+   SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+   CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
 #endif
   err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
@@ -283,14 +370,35 @@
       (iter < 0 ? CG_BEST_TOL
                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
   SCS(tic)(&linsys_timer);
   /* all on GPU */
   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
              cudaMemcpyHostToDevice);
-  SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
+  SCS(_accum_by_atrans_gpu)(
+      Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+
   /* solves (I+A'A)x = b, s warm start, solution stored in b */
   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
-  SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
+  cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
+#if GPU_TRANSPOSE_MAT > 0
+  SCS(_accum_by_atrans_gpu)(
+      p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+#else
+  SCS(_accum_by_a_gpu)(
+      Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+      &p->buffer_size, &p->buffer
+  );
+#endif
+
   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
              cudaMemcpyDeviceToHost);
   if (iter >= 0) {
     p->tot_cg_its += cg_its;
   }
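
For reference, the pattern behind the new "&p->buffer_size, &p->buffer" arguments threaded through every multiply looks roughly like the sketch below. This is a hypothetical sketch, not the actual body of SCS(_accum_by_a_gpu) (which lives elsewhere in the vendored SCS sources): it assumes double precision (CUDA_R_64F) and the CUDA 10.1-era algorithm enum CUSPARSE_MV_ALG_DEFAULT, and the name accum_spmv is made up. It accumulates y += A*x with cusparseSpMV, querying the required workspace first and growing the shared buffer only when it is too small, the same grow-only strategy the diff applies to the csr2cscEx2 buffer.

#include <cuda_runtime.h>
#include <cusparse.h>

/* y += A * x via the generic cuSPARSE API, reusing one grow-only workspace.
 * Hypothetical sketch; the real SCS helper takes ScsGpuMatrix arguments. */
static void accum_spmv(cusparseHandle_t handle, cusparseSpMatDescr_t A,
                       cusparseDnVecDescr_t x, cusparseDnVecDescr_t y,
                       size_t *buffer_size, void **buffer) {
  const double alpha = 1.0;
  const double beta = 1.0; /* beta == 1 makes SpMV accumulate into y */
  size_t needed = 0;
  /* ask cuSPARSE how much scratch this (matrix, op, alg) combination needs */
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A,
                          x, &beta, y, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT,
                          &needed);
  if (needed > *buffer_size) { /* grow once, free only at teardown */
    if (*buffer) {
      cudaFree(*buffer);
    }
    cudaMalloc(buffer, needed);
    *buffer_size = needed;
  }
  cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A, x, &beta,
               y, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, *buffer);
}

Because the two dense-vector descriptors are created once in init_lin_sys_work and are only rebound via cusparseDnVecSetValues, the hot pcg loop performs no descriptor allocation; the only mid-solve allocation that can occur is the one-time growth of the shared workspace.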