lib/lda-inference.c in ealdent-lda-ruby-0.2.2 vs lib/lda-inference.c in ealdent-lda-ruby-0.2.3

- old
+ new

@@ -15,14 +15,10 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 // USA -#ifndef USE_RUBY -#define USE_RUBY -#endif - #include <stdlib.h> #include <stdio.h> #include <math.h> #include <float.h> #include <string.h> @@ -48,17 +44,21 @@ /* * variational inference */ -double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) { +double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) { double converged = 1; double phisum = 0, likelihood = 0; double likelihood_old = 0, oldphi[model->num_topics]; - int k, n, var_iter; + int k = 0, n = 0, var_iter = 0, index = 0; double digamma_gam[model->num_topics]; + /* zero'em out */ + memset(digamma_gam,0.0,sizeof(digamma_gam)); + memset(oldphi,0.0,sizeof(oldphi)); + // compute posterior dirichlet for (k = 0; k < model->num_topics; k++) { var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics)); @@ -76,13 +76,20 @@ { phisum = 0; for (k = 0; k < model->num_topics; k++) { oldphi[k] = phi[n][k]; - phi[n][k] = - digamma_gam[k] + - model->log_prob_w[k][doc->words[n]]; + index = doc->words[n]; + if( index < 0 || index > model->num_terms ) { + printf("phi for term: %d of %d\n", index, model->num_terms); + phi[n][k] = 0.0; + } + else { + phi[n][k] = + digamma_gam[k] + + model->log_prob_w[k][index]; + } if (k > 0) phisum = log_sum(phisum, phi[n][k]); else phisum = phi[n][k]; // note, phi is in log space @@ -98,11 +105,12 @@ digamma_gam[k] = digamma(var_gamma[k]); } } likelihood = compute_likelihood(doc, model, phi, var_gamma); - assert(!isnan(likelihood)); + //assert(!isnan(likelihood)); + if( isnan(likelihood) ) { *errors = 1; } converged = (likelihood_old - likelihood) / likelihood_old; likelihood_old = likelihood; // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged); } @@ -114,47 +122,55 @@ * compute likelihood bound */ double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) { double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics]; - int k, n; + int k = 0, n = 0, index = 0; + memset(dig,0.0,sizeof(dig)); for (k = 0; k < model->num_topics; k++) { dig[k] = digamma(var_gamma[k]); var_gamma_sum += var_gamma[k]; } digsum = digamma(var_gamma_sum); - likelihood = lgamma(model->alpha * model -> num_topics) - model -> num_topics * lgamma(model->alpha) - (lgamma(var_gamma_sum)); + likelihood = lgamma(model->alpha * model->num_topics) - + model->num_topics * + lgamma(model->alpha) - + lgamma(var_gamma_sum); for (k = 0; k < model->num_topics; k++) { likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum); for (n = 0; n < doc->length; n++) { if (phi[n][k] > 0) { + index = doc->words[n]; likelihood += doc->counts[n]* (phi[n][k]*((dig[k] - digsum) - log(phi[n][k]) - + model->log_prob_w[k][doc->words[n]])); + + model->log_prob_w[k][index])); } } } return(likelihood); } double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) { double likelihood; int n, k; + short error = 0; - // posterior inference + // posterior inference - likelihood = lda_inference(doc, model, gamma, phi); + likelihood = lda_inference(doc, model, gamma, phi, &error); + if (error) { likelihood = 0.0; } + // update sufficient statistics double gamma_sum = 0; for (k = 0; k < model->num_topics; k++) { @@ -219,10 +235,11 @@ lda_model *model = NULL; double **var_gamma, **phi; // allocate variational parameters + var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); for (d = 0; d < corpus->num_docs; d++) var_gamma[d] = malloc(sizeof(double) * NTOPICS); int max_length = max_corpus_length(corpus); @@ -277,27 +294,26 @@ printf("**** em iteration %d ****\n", i); likelihood = 0; zero_initialize_ss(ss, model); // e-step + printf("e-step\n"); for (d = 0; d < corpus->num_docs; d++) { if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d); likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); } + printf("m-step\n"); // m-step + if (VERBOSE) { + lda_mle(model, ss, ESTIMATE_ALPHA); + } else { + quiet_lda_mle(model, ss, ESTIMATE_ALPHA); + } - if (VERBOSE) { - lda_mle(model, ss, ESTIMATE_ALPHA); - } else { - quiet_lda_mle(model, ss, ESTIMATE_ALPHA); - } - - // check for convergence - converged = (likelihood_old - likelihood) / (likelihood_old); if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; likelihood_old = likelihood; // output model and likelihood @@ -322,14 +338,19 @@ // output the word assignments (for visualization) sprintf(filename, "%s/word-assignments.dat", directory); FILE* w_asgn_file = fopen(filename, "w"); + short error = 0; + double tl = 0.0; for (d = 0; d < corpus->num_docs; d++) { if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d); - likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi); + error = 0; + tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error); + if( error ) { continue; } + likelihood += tl; write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model); } fclose(w_asgn_file); fclose(likelihood_file); } @@ -386,11 +407,12 @@ doc = &(corpus->docs[d]); phi = (double**) malloc(sizeof(double*) * doc->length); for (n = 0; n < doc->length; n++) phi[n] = (double*) malloc(sizeof(double) * model->num_topics); - likelihood = lda_inference(doc, model, var_gamma[d], phi); + short error = 0; + likelihood = lda_inference(doc, model, var_gamma[d], phi, &error); fprintf(fileptr, "%5.5f\n", likelihood); } fclose(fileptr); sprintf(filename, "%s-gamma.dat", save); @@ -446,62 +468,72 @@ #ifdef USE_RUBY /* */ void run_quiet_em(char* start, corpus* corpus) { - int d, n; + int d = 0, n = 0; lda_model *model = NULL; - double **var_gamma, **phi; + double **var_gamma = NULL, **phi = NULL; + // last_gamma is a double[num_docs][num_topics] // allocate variational parameters - var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); - for (d = 0; d < corpus->num_docs; d++) - var_gamma[d] = malloc(sizeof(double) * NTOPICS); + var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs)); + memset(var_gamma, 0.0, corpus->num_docs); + + for (d = 0; d < corpus->num_docs; ++d) { + var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS); + memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS); + } + int max_length = max_corpus_length(corpus); - phi = malloc(sizeof(double*)*max_length); - for (n = 0; n < max_length; n++) - phi[n] = malloc(sizeof(double) * NTOPICS); + phi = (double**)malloc(sizeof(double*)*max_length); + memset(phi, 0.0, max_length); + for (n = 0; n < max_length; ++n) { + phi[n] = (double*)malloc(sizeof(double) * NTOPICS); + memset(phi[n], 0.0, sizeof(double)*NTOPICS); + } + // initialize model lda_suffstats* ss = NULL; - if (strcmp(start, "seeded")==0) { + if (strncmp(start, "seeded",6)==0) { model = new_lda_model(corpus->num_terms, NTOPICS); + model->alpha = INITIAL_ALPHA; ss = new_lda_suffstats(model); if (VERBOSE) { - corpus_initialize_ss(ss, model, corpus); - } else { - quiet_corpus_initialize_ss(ss, model, corpus); - } + corpus_initialize_ss(ss, model, corpus); + } else { + quiet_corpus_initialize_ss(ss, model, corpus); + } if (VERBOSE) { - lda_mle(model, ss, 0); + lda_mle(model, ss, 0); } else { - quiet_lda_mle(model, ss, 0); + quiet_lda_mle(model, ss, 0); } + } else if (strncmp(start, "fixed",5)==0) { + model = new_lda_model(corpus->num_terms, NTOPICS); + model->alpha = INITIAL_ALPHA; + ss = new_lda_suffstats(model); + corpus_initialize_fixed_ss(ss, model, corpus); + if (VERBOSE) { + lda_mle(model, ss, 0); + } else { + quiet_lda_mle(model, ss, 0); + } + } else if (strncmp(start, "random",6)==0) { + model = new_lda_model(corpus->num_terms, NTOPICS); model->alpha = INITIAL_ALPHA; - } else if (strcmp(start, "fixed")==0) { - model = new_lda_model(corpus->num_terms, NTOPICS); ss = new_lda_suffstats(model); - corpus_initialize_fixed_ss(ss, model, corpus); - if (VERBOSE) { - lda_mle(model, ss, 0); - } else { - quiet_lda_mle(model, ss, 0); - } - model->alpha = INITIAL_ALPHA; - } else if (strcmp(start, "random")==0) { - model = new_lda_model(corpus->num_terms, NTOPICS); - ss = new_lda_suffstats(model); random_initialize_ss(ss, model); if (VERBOSE) { - lda_mle(model, ss, 0); + lda_mle(model, ss, 0); } else { - quiet_lda_mle(model, ss, 0); + quiet_lda_mle(model, ss, 0); } - model->alpha = INITIAL_ALPHA; } else { model = load_lda_model(start); ss = new_lda_suffstats(model); } @@ -510,16 +542,15 @@ model_loaded = TRUE; // run expectation maximization int i = 0; - double likelihood, likelihood_old = 0, converged = 1; + double likelihood = 0.0, likelihood_old = 0, converged = 1; while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) { i++; - if (VERBOSE) - printf("**** em iteration %d ****\n", i); + if (VERBOSE) printf("**** em iteration %d ****\n", i); likelihood = 0; zero_initialize_ss(ss, model); // e-step @@ -527,36 +558,37 @@ if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d); likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); } // m-step + if (VERBOSE) { + lda_mle(model, ss, ESTIMATE_ALPHA); + } else { + quiet_lda_mle(model, ss, ESTIMATE_ALPHA); + } - if (VERBOSE) { - lda_mle(model, ss, ESTIMATE_ALPHA); - } else { - quiet_lda_mle(model, ss, ESTIMATE_ALPHA); - } - // check for convergence converged = (likelihood_old - likelihood) / (likelihood_old); if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; likelihood_old = likelihood; // store model and likelihood last_model = model; last_gamma = var_gamma; - last_phi = phi; + last_phi = phi; } // output the final model last_model = model; last_gamma = var_gamma; - last_phi = phi; + last_phi = phi; + free_lda_suffstats(model,ss); + // output the word assignments (for visualization) /* char filename[100]; sprintf(filename, "%s/word-assignments.dat", directory); FILE* w_asgn_file = fopen(filename, "w"); @@ -583,10 +615,11 @@ * * est_alpha */ static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) { INITIAL_ALPHA = NUM2DBL(init_alpha); NTOPICS = NUM2INT(num_topics); + if( NTOPICS < 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); } VAR_MAX_ITER = NUM2INT(max_iter); VAR_CONVERGED = (float)NUM2DBL(convergence); EM_MAX_ITER = NUM2INT(em_max_iter); EM_CONVERGED = (float)NUM2DBL(em_convergence); ESTIMATE_ALPHA = NUM2INT(est_alpha); @@ -796,12 +829,15 @@ c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length")); c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total")); c->docs[i].words = malloc(sizeof(int) * c->docs[i].length); c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length); for (j = 0; j < c->docs[i].length; j++) { - VALUE one_word = NUM2INT(rb_ary_entry(words, j)); - VALUE one_count = NUM2INT(rb_ary_entry(counts, j)); + int one_word = NUM2INT(rb_ary_entry(words, j)); + int one_count = NUM2INT(rb_ary_entry(counts, j)); + if( one_word > c->num_terms ) { + rb_raise(rb_eRuntimeError, "error term count(%d) less then word index(%d)", c->num_terms, one_word); + } c->docs[i].words[j] = one_word; c->docs[i].counts[j] = one_count; } } @@ -848,16 +884,17 @@ return Qnil; VALUE arr = rb_ary_new2(last_corpus->num_docs); int i = 0, j = 0, k = 0; - int max_length = max_corpus_length(last_corpus); + //int max_length = max_corpus_length(last_corpus); + short error = 0; for (i = 0; i < last_corpus->num_docs; i++) { VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length); - lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi); + lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error); for (j = 0; j < last_corpus->docs[i].length; j++) { VALUE arr2 = rb_ary_new2(last_model->num_topics); for (k = 0; k < last_model->num_topics; k++) { @@ -966,6 +1003,6 @@ rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0); rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0); rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0); } -#endif \ No newline at end of file +#endif