/* * Anderson acceleration. * * x: input iterate * x_prev: previous input iterate * f: f(x) output of map f applied to x * g: x - f (error) * g_prev: previous error * s: x - x_prev * y: g - g_prev * d: s - y = f - f_prev * * capital letters are the variables stacked columnwise * idx tracks current index where latest quantities written * idx cycles from left to right columns in matrix * * Type-I: * return f = f - (S - Y) * ( S'Y + r I)^{-1} ( S'g ) * * Type-II: * return f = f - (S - Y) * ( Y'Y + r I)^{-1} ( Y'g ) * */ #include "aa.h" #include "scs_blas.h" #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define FILL_MEMORY_BEFORE_SOLVE (1) #ifndef USE_LAPACK typedef void *ACCEL_WORK; AaWork *aa_init(aa_int dim, aa_int mem, aa_int type1, aa_float regularization, aa_float relaxation, aa_float safeguard_factor, aa_float max_weight_norm, aa_int verbosity) { return SCS_NULL; } aa_float aa_apply(aa_float *f, const aa_float *x, AaWork *a) { return 0; } aa_int aa_safeguard(aa_float *f_new, aa_float *x_new, AaWork *a) { return 0; } void aa_finish(AaWork *a) { } void aa_reset(AaWork *a) { } #else #if PROFILING > 0 #define TIME_TIC \ timer __t; \ tic(&__t); #define TIME_TOC toc(__func__, &__t); #include typedef struct timer { struct timespec tic; struct timespec toc; } timer; void tic(timer *t) { clock_gettime(CLOCK_MONOTONIC, &t->tic); } aa_float tocq(timer *t) { struct timespec temp; clock_gettime(CLOCK_MONOTONIC, &t->toc); if ((t->toc.tv_nsec - t->tic.tv_nsec) < 0) { temp.tv_sec = t->toc.tv_sec - t->tic.tv_sec - 1; temp.tv_nsec = 1e9 + t->toc.tv_nsec - t->tic.tv_nsec; } else { temp.tv_sec = t->toc.tv_sec - t->tic.tv_sec; temp.tv_nsec = t->toc.tv_nsec - t->tic.tv_nsec; } return (aa_float)temp.tv_sec * 1e3 + (aa_float)temp.tv_nsec / 1e6; } aa_float toc(const char *str, timer *t) { aa_float time = tocq(t); printf("%s - time: %8.4f milli-seconds.\n", str, time); return time; } #else #define TIME_TIC #define TIME_TOC #endif #ifdef __cplusplus extern "C" { #endif /* BLAS functions used */ aa_float BLAS(nrm2)(blas_int *n, aa_float *x, blas_int *incx); void BLAS(axpy)(blas_int *n, aa_float *a, const aa_float *x, blas_int *incx, aa_float *y, blas_int *incy); void BLAS(gemv)(const char *trans, const blas_int *m, const blas_int *n, const aa_float *alpha, const aa_float *a, const blas_int *lda, const aa_float *x, const blas_int *incx, const aa_float *beta, aa_float *y, const blas_int *incy); void BLAS(gesv)(blas_int *n, blas_int *nrhs, aa_float *a, blas_int *lda, blas_int *ipiv, aa_float *b, blas_int *ldb, blas_int *info); void BLAS(gemm)(const char *transa, const char *transb, blas_int *m, blas_int *n, blas_int *k, aa_float *alpha, aa_float *a, blas_int *lda, aa_float *b, blas_int *ldb, aa_float *beta, aa_float *c, blas_int *ldc); void BLAS(scal)(const blas_int *n, const aa_float *a, aa_float *x, const blas_int *incx); #ifdef __cplusplus } #endif /* This file uses Anderson acceleration to improve the convergence of * a fixed point mapping. * At each iteration we need to solve a (small) linear system, we * do this using LAPACK ?gesv. */ /* contains the necessary parameters to perform aa at each step */ struct ACCEL_WORK { aa_int type1; /* bool, if true type 1 aa otherwise type 2 */ aa_int mem; /* aa memory */ aa_int dim; /* variable dimension */ aa_int iter; /* current iteration */ aa_int verbosity; /* verbosity level, 0 is no printing */ aa_int success; /* was the last AA step successful or not */ aa_float relaxation; /* relaxation x and f, beta in some papers */ aa_float regularization; /* regularization */ aa_float safeguard_factor; /* safeguard tolerance factor */ aa_float max_weight_norm; /* maximum norm of AA weights */ aa_float *x; /* x input to map*/ aa_float *f; /* f(x) output of map */ aa_float *g; /* x - f(x) */ aa_float norm_g; /* ||x - f(x)|| */ /* from previous iteration */ aa_float *g_prev; /* x_prev - f(x_prev) */ aa_float *y; /* g - g_prev */ aa_float *s; /* x - x_prev */ aa_float *d; /* f - f_prev */ aa_float *Y; /* matrix of stacked y values */ aa_float *S; /* matrix of stacked s values */ aa_float *D; /* matrix of stacked d values = (S-Y) */ aa_float *M; /* S'Y or Y'Y depending on type of aa */ /* workspace variables */ aa_float *work; /* scratch space */ blas_int *ipiv; /* permutation variable, not used after solve */ aa_float *x_work; /* workspace (= x) for when relaxation != 1.0 */ }; /* add regularization dependent on Y and S matrices */ static aa_float compute_regularization(AaWork *a, aa_int len) { /* typically type-I does better with higher regularization than type-II */ TIME_TIC aa_float r, nrm_m; blas_int btotal = (blas_int)(len * len), one = 1; nrm_m = BLAS(nrm2)(&btotal, a->M, &one); r = a->regularization * nrm_m; if (a->verbosity > 2) { printf("iter: %i, norm: M %.2e, r: %.2e\n", (int)a->iter, nrm_m, r); } TIME_TOC return r; } /* sets a->M to S'Y or Y'Y depending on type of aa used */ /* M is len x len after this */ static void set_m(AaWork *a, aa_int len) { TIME_TIC aa_int i; blas_int bdim = (blas_int)(a->dim); blas_int blen = (blas_int)len; aa_float onef = 1.0, zerof = 0.0, r; /* if len < mem this only uses len cols */ BLAS(gemm) ("Trans", "No", &blen, &blen, &bdim, &onef, a->type1 ? a->S : a->Y, &bdim, a->Y, &bdim, &zerof, a->M, &blen); if (a->regularization > 0) { r = compute_regularization(a, len); for (i = 0; i < len; ++i) { a->M[i + len * i] += r; } } TIME_TOC return; } /* initialize accel params, in particular x_prev, f_prev, g_prev */ static void init_accel_params(const aa_float *x, const aa_float *f, AaWork *a) { TIME_TIC blas_int bdim = (blas_int)a->dim; aa_float neg_onef = -1.0; blas_int one = 1; /* x_prev = x */ memcpy(a->x, x, sizeof(aa_float) * a->dim); /* f_prev = f */ memcpy(a->f, f, sizeof(aa_float) * a->dim); /* g_prev = x */ memcpy(a->g_prev, x, sizeof(aa_float) * a->dim); /* g_prev = x_prev - f_prev */ BLAS(axpy)(&bdim, &neg_onef, f, &one, a->g_prev, &one); TIME_TOC } /* updates the workspace parameters for aa for this iteration */ static void update_accel_params(const aa_float *x, const aa_float *f, AaWork *a, aa_int len) { /* at the start a->x = x_prev and a->f = f_prev */ TIME_TIC aa_int idx = (a->iter - 1) % a->mem; blas_int one = 1; blas_int bdim = (blas_int)a->dim; aa_float neg_onef = -1.0; /* g = x */ memcpy(a->g, x, sizeof(aa_float) * a->dim); /* s = x */ memcpy(a->s, x, sizeof(aa_float) * a->dim); /* d = f */ memcpy(a->d, f, sizeof(aa_float) * a->dim); /* g = x - f */ BLAS(axpy)(&bdim, &neg_onef, f, &one, a->g, &one); /* s = x - x_prev */ BLAS(axpy)(&bdim, &neg_onef, a->x, &one, a->s, &one); /* d = f - f_prev */ BLAS(axpy)(&bdim, &neg_onef, a->f, &one, a->d, &one); /* g, s, d correct here */ /* y = g */ memcpy(a->y, a->g, sizeof(aa_float) * a->dim); /* y = g - g_prev */ BLAS(axpy)(&bdim, &neg_onef, a->g_prev, &one, a->y, &one); /* y correct here */ /* copy y into idx col of Y */ memcpy(&(a->Y[idx * a->dim]), a->y, sizeof(aa_float) * a->dim); /* copy s into idx col of S */ memcpy(&(a->S[idx * a->dim]), a->s, sizeof(aa_float) * a->dim); /* copy d into idx col of D */ memcpy(&(a->D[idx * a->dim]), a->d, sizeof(aa_float) * a->dim); /* Y, S, D correct here */ /* set a->f and a->x for next iter (x_prev and f_prev) */ memcpy(a->f, f, sizeof(aa_float) * a->dim); memcpy(a->x, x, sizeof(aa_float) * a->dim); /* workspace for when relaxation != 1.0 */ if (a->x_work) { memcpy(a->x_work, x, sizeof(aa_float) * a->dim); } /* x, f correct here */ memcpy(a->g_prev, a->g, sizeof(aa_float) * a->dim); /* g_prev set for next iter here */ /* compute ||g|| = ||f - x|| */ a->norm_g = BLAS(nrm2)(&bdim, a->g, &one); TIME_TOC return; } /* f = (1-relaxation) * \sum_i a_i x_i + relaxation * \sum_i a_i f_i */ static void relax(aa_float *f, AaWork *a, aa_int len) { TIME_TIC /* x_work = x initially */ blas_int bdim = (blas_int)(a->dim), one = 1, blen = (blas_int)len; aa_float onef = 1.0, neg_onef = -1.0; aa_float one_m_relaxation = 1. - a->relaxation; /* x_work = x - S * work */ BLAS(gemv) ("NoTrans", &bdim, &blen, &neg_onef, a->S, &bdim, a->work, &one, &onef, a->x_work, &one); /* f = relaxation * f */ BLAS(scal)(&bdim, &a->relaxation, f, &one); /* f += (1 - relaxation) * x_work */ BLAS(axpy)(&bdim, &one_m_relaxation, a->x_work, &one, f, &one); TIME_TOC } /* solves the system of equations to perform the AA update * at the end f contains the next iterate to be returned */ static aa_float solve(aa_float *f, AaWork *a, aa_int len) { TIME_TIC blas_int info = -1, bdim = (blas_int)(a->dim), one = 1, blen = (blas_int)len; aa_float onef = 1.0, zerof = 0.0, neg_onef = -1.0, aa_norm; /* work = S'g or Y'g */ BLAS(gemv) ("Trans", &bdim, &blen, &onef, a->type1 ? a->S : a->Y, &bdim, a->g, &one, &zerof, a->work, &one); /* work = M \ work, where update_accel_params has set M = S'Y or M = Y'Y */ BLAS(gesv)(&blen, &one, a->M, &blen, a->ipiv, a->work, &blen, &info); aa_norm = BLAS(nrm2)(&blen, a->work, &one); if (a->verbosity > 1) { printf("AA type %i, iter: %i, len %i, info: %i, aa_norm %.2e\n", a->type1 ? 1 : 2, (int)a->iter, (int)len, (int)info, aa_norm); } /* info < 0 input error, input > 0 matrix is singular */ if (info != 0 || aa_norm >= a->max_weight_norm) { if (a->verbosity > 0) { printf("Error in AA type %i, iter: %i, len %i, info: %i, aa_norm %.2e\n", a->type1 ? 1 : 2, (int)a->iter, (int)len, (int)info, aa_norm); } a->success = 0; /* reset aa for stability */ aa_reset(a); TIME_TOC return -aa_norm; } /* here work = gamma, ie, the correct AA shifted weights */ /* if solve was successful compute new point */ /* first set f -= D * work */ BLAS(gemv) ("NoTrans", &bdim, &blen, &neg_onef, a->D, &bdim, a->work, &one, &onef, f, &one); /* if relaxation is not 1 then need to incorporate */ if (a->relaxation != 1.0) { relax(f, a, len); } a->success = 1; /* this should be the only place we set success = 1 */ TIME_TOC return aa_norm; } /* * API functions below this line, see aa.h for descriptions. */ AaWork *aa_init(aa_int dim, aa_int mem, aa_int type1, aa_float regularization, aa_float relaxation, aa_float safeguard_factor, aa_float max_weight_norm, aa_int verbosity) { TIME_TIC AaWork *a = (AaWork *)calloc(1, sizeof(AaWork)); if (!a) { printf("Failed to allocate memory for AA.\n"); return (AaWork *)0; } a->type1 = type1; a->iter = 0; a->dim = dim; a->mem = MIN(mem, dim); /* for rank stability */ a->regularization = regularization; a->relaxation = relaxation; a->safeguard_factor = safeguard_factor; a->max_weight_norm = max_weight_norm; a->success = 0; a->verbosity = verbosity; if (a->mem <= 0) { return a; } a->x = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->f = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->g = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->g_prev = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->y = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->s = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->d = (aa_float *)calloc(a->dim, sizeof(aa_float)); a->Y = (aa_float *)calloc(a->dim * a->mem, sizeof(aa_float)); a->S = (aa_float *)calloc(a->dim * a->mem, sizeof(aa_float)); a->D = (aa_float *)calloc(a->dim * a->mem, sizeof(aa_float)); a->M = (aa_float *)calloc(a->mem * a->mem, sizeof(aa_float)); a->work = (aa_float *)calloc(MAX(a->mem, a->dim), sizeof(aa_float)); a->ipiv = (blas_int *)calloc(a->mem, sizeof(blas_int)); if (relaxation != 1.0) { a->x_work = (aa_float *)calloc(a->dim, sizeof(aa_float)); } else { a->x_work = 0; } TIME_TOC return a; } aa_float aa_apply(aa_float *f, const aa_float *x, AaWork *a) { TIME_TIC aa_float aa_norm = 0; aa_int len = MIN(a->iter, a->mem); a->success = 0; /* if we make an AA step we set this to 1 later */ if (a->mem <= 0) { TIME_TOC return aa_norm; /* 0 */ } if (a->iter == 0) { /* if first iteration then seed params for next iter */ init_accel_params(x, f, a); a->iter++; TIME_TOC return aa_norm; /* 0 */ } /* set various accel quantities */ update_accel_params(x, f, a, len); /* only perform solve steps when the memory is full */ if (!FILL_MEMORY_BEFORE_SOLVE || a->iter >= a->mem) { /* set M = S'Y or Y'Y depending on type of aa used */ set_m(a, len); /* solve linear system, new point overwrites f if successful */ aa_norm = solve(f, a, len); } a->iter++; TIME_TOC return aa_norm; } aa_int aa_safeguard(aa_float *f_new, aa_float *x_new, AaWork *a) { TIME_TIC blas_int bdim = (blas_int)a->dim; blas_int one = 1; aa_float neg_onef = -1.0; aa_float norm_diff; if (!a->success) { /* last AA update was not successful, no need for safeguarding */ TIME_TOC return 0; } /* reset success indicator in case safeguarding called multiple times */ a->success = 0; /* work = x_new */ memcpy(a->work, x_new, a->dim * sizeof(aa_float)); /* work = x_new - f_new */ BLAS(axpy)(&bdim, &neg_onef, f_new, &one, a->work, &one); /* norm_diff = || f_new - x_new || */ norm_diff = BLAS(nrm2)(&bdim, a->work, &one); /* g = f - x */ if (norm_diff > a->safeguard_factor * a->norm_g) { /* in this case we reject the AA step and reset */ memcpy(f_new, a->f, a->dim * sizeof(aa_float)); memcpy(x_new, a->x, a->dim * sizeof(aa_float)); if (a->verbosity > 0) { printf("AA rejection, iter: %i, norm_diff %.4e, prev_norm_diff %.4e\n", (int)a->iter, norm_diff, a->norm_g); } aa_reset(a); TIME_TOC return -1; } TIME_TOC return 0; } void aa_finish(AaWork *a) { if (a) { free(a->x); free(a->f); free(a->g); free(a->g_prev); free(a->y); free(a->s); free(a->d); free(a->Y); free(a->S); free(a->D); free(a->M); free(a->work); free(a->ipiv); if (a->x_work) { free(a->x_work); } free(a); } return; } void aa_reset(AaWork *a) { /* to reset we simply set a->iter = 0 */ if (a->verbosity > 0) { printf("AA reset.\n"); } a->iter = 0; return; } #endif