lda-inference.c in ealdent-lda-ruby-0.2.3

- old
+ new

@@ -15,14 +15,10 @@
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 // USA
 
-#ifndef USE_RUBY
-#define USE_RUBY
-#endif
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <float.h>
 #include <string.h>
@@ -48,17 +44,21 @@
 
 /*
  * variational inference
  */
 
-double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) {
+double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
 	double converged = 1;
 	double phisum = 0, likelihood = 0;
 	double likelihood_old = 0, oldphi[model->num_topics];
-	int k, n, var_iter;
+	int k = 0, n = 0, var_iter = 0, index = 0;
 	double digamma_gam[model->num_topics];
 
+  /* zero'em out */
+  memset(digamma_gam,0.0,sizeof(digamma_gam));
+  memset(oldphi,0.0,sizeof(oldphi));
+
 		// compute posterior dirichlet
 
 	for (k = 0; k < model->num_topics; k++)
 	{
 		var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
@@ -76,13 +76,20 @@
 		{
 			phisum = 0;
 			for (k = 0; k < model->num_topics; k++)
 			{
 				oldphi[k] = phi[n][k];
-				phi[n][k] =
-					digamma_gam[k] +
-					model->log_prob_w[k][doc->words[n]];
+        index = doc->words[n];
+        if( index < 0 || index > model->num_terms ) {
+          printf("phi for term: %d of %d\n", index, model->num_terms);
+				  phi[n][k] = 0.0;
+        }
+        else { 
+				  phi[n][k] =
+				  	digamma_gam[k] +
+				  	model->log_prob_w[k][index];
+        }
 
 				if (k > 0)
 					phisum = log_sum(phisum, phi[n][k]);
 				else
 					phisum = phi[n][k]; // note, phi is in log space
@@ -98,11 +105,12 @@
 				digamma_gam[k] = digamma(var_gamma[k]);
 			}
 		}
 
 		likelihood = compute_likelihood(doc, model, phi, var_gamma);
-		assert(!isnan(likelihood));
+		//assert(!isnan(likelihood));
+    if( isnan(likelihood) ) { *errors = 1; }
 		converged = (likelihood_old - likelihood) / likelihood_old;
 		likelihood_old = likelihood;
 
 				// printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
 	}
@@ -114,47 +122,55 @@
  * compute likelihood bound
  */
 
 double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
 	double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
-	int k, n;
+	int k = 0, n = 0, index = 0;
+  memset(dig,0.0,sizeof(dig));
 
 	for (k = 0; k < model->num_topics; k++)
 	{
 		dig[k] = digamma(var_gamma[k]);
 		var_gamma_sum += var_gamma[k];
 	}
 	digsum = digamma(var_gamma_sum);
 
-	likelihood = lgamma(model->alpha * model -> num_topics) - model -> num_topics * lgamma(model->alpha) - (lgamma(var_gamma_sum));
+	likelihood = lgamma(model->alpha * model->num_topics) -
+               model->num_topics *
+               lgamma(model->alpha) -
+               lgamma(var_gamma_sum);
 
 	for (k = 0; k < model->num_topics; k++)
 	{
 		likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum);
 		
 		for (n = 0; n < doc->length; n++)
 		{
 			if (phi[n][k] > 0)
 			{
+        index = doc->words[n];
 				likelihood += doc->counts[n]*
 					(phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
-					+ model->log_prob_w[k][doc->words[n]]));
+					+ model->log_prob_w[k][index]));
 			}
 		}
 	}
 	return(likelihood);
 }
 
 
 double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
 	double likelihood;
 	int n, k;
+  short error = 0;
 
-		// posterior inference
+  // posterior inference
 
-	likelihood = lda_inference(doc, model, gamma, phi);
+	likelihood = lda_inference(doc, model, gamma, phi, &error);
+  if (error) { likelihood = 0.0; }
 
+
 		// update sufficient statistics
 
 	double gamma_sum = 0;
 	for (k = 0; k < model->num_topics; k++)
 	{
@@ -219,10 +235,11 @@
 	lda_model *model = NULL;
 	double **var_gamma, **phi;
 
 	// allocate variational parameters
 
+
 	var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
 	for (d = 0; d < corpus->num_docs; d++)
 		var_gamma[d] = malloc(sizeof(double) * NTOPICS);
 
 	int max_length = max_corpus_length(corpus);
@@ -277,27 +294,26 @@
 		    printf("**** em iteration %d ****\n", i);
 		likelihood = 0;
 		zero_initialize_ss(ss, model);
 
 		// e-step
+    printf("e-step\n");
 
 		for (d = 0; d < corpus->num_docs; d++) {
 			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
 			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
 		}
+    printf("m-step\n");
 
 		// m-step
+    if (VERBOSE) {
+      lda_mle(model, ss, ESTIMATE_ALPHA);
+    } else {
+      quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+    }
 
-        if (VERBOSE) {
-            lda_mle(model, ss, ESTIMATE_ALPHA);
-        } else {
-            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
-		
-
 		// check for convergence
-
 		converged = (likelihood_old - likelihood) / (likelihood_old);
 		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
 		likelihood_old = likelihood;
 
 		// output model and likelihood
@@ -322,14 +338,19 @@
 
 		// output the word assignments (for visualization)
 
 	sprintf(filename, "%s/word-assignments.dat", directory);
 	FILE* w_asgn_file = fopen(filename, "w");
+  short error = 0;
+  double tl = 0.0;
 	for (d = 0; d < corpus->num_docs; d++)
 	{
 		if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
-		likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
+    error = 0;
+    tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
+    if( error ) { continue; }
+		likelihood += tl;
 		write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
 	}
 	fclose(w_asgn_file);
 	fclose(likelihood_file);
 }
@@ -386,11 +407,12 @@
 
 		doc = &(corpus->docs[d]);
 		phi = (double**) malloc(sizeof(double*) * doc->length);
 		for (n = 0; n < doc->length; n++)
 			phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
-		likelihood = lda_inference(doc, model, var_gamma[d], phi);
+    short error = 0;
+		likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);
 
 		fprintf(fileptr, "%5.5f\n", likelihood);
 	}
 	fclose(fileptr);
 	sprintf(filename, "%s-gamma.dat", save);
@@ -446,62 +468,72 @@
 
 #ifdef USE_RUBY
 
 /* */
 void run_quiet_em(char* start, corpus* corpus) {
-	int d, n;
+	int d = 0, n = 0;
 	lda_model *model = NULL;
-	double **var_gamma, **phi;
+	double **var_gamma = NULL, **phi = NULL;
+	// last_gamma is a double[num_docs][num_topics]
 
 	// allocate variational parameters
 
-	var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
-	for (d = 0; d < corpus->num_docs; d++)
-		var_gamma[d] = malloc(sizeof(double) * NTOPICS);
 
+	var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
+  memset(var_gamma, 0.0, corpus->num_docs);
+
+	for (d = 0; d < corpus->num_docs; ++d) {
+		var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
+    memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS);
+  }
+
 	int max_length = max_corpus_length(corpus);
-	phi = malloc(sizeof(double*)*max_length);
-	for (n = 0; n < max_length; n++)
-		phi[n] = malloc(sizeof(double) * NTOPICS);
 
+	phi = (double**)malloc(sizeof(double*)*max_length);
+  memset(phi, 0.0, max_length);
+	for (n = 0; n < max_length; ++n) {
+		phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
+    memset(phi[n], 0.0, sizeof(double)*NTOPICS);
+  }
+
 	// initialize model
 
 	lda_suffstats* ss = NULL;
-	if (strcmp(start, "seeded")==0) {
+	if (strncmp(start, "seeded",6)==0) {
 		model = new_lda_model(corpus->num_terms, NTOPICS);
+		model->alpha = INITIAL_ALPHA;
 		ss = new_lda_suffstats(model);
 		if (VERBOSE) {
-		    corpus_initialize_ss(ss, model, corpus);
-	    } else {
-            quiet_corpus_initialize_ss(ss, model, corpus);
-	    }
+      corpus_initialize_ss(ss, model, corpus);
+    } else {
+      quiet_corpus_initialize_ss(ss, model, corpus);
+    }
 		if (VERBOSE) {
-		    lda_mle(model, ss, 0);
+      lda_mle(model, ss, 0);
 		} else {
-		    quiet_lda_mle(model, ss, 0);
+      quiet_lda_mle(model, ss, 0);
 		}
+	} else if (strncmp(start, "fixed",5)==0) {
+	  model = new_lda_model(corpus->num_terms, NTOPICS);
+    model->alpha = INITIAL_ALPHA;
+	  ss = new_lda_suffstats(model);
+	  corpus_initialize_fixed_ss(ss, model, corpus);
+    if (VERBOSE) {
+      lda_mle(model, ss, 0);
+    } else {
+      quiet_lda_mle(model, ss, 0);
+    }
+	} else if (strncmp(start, "random",6)==0) {
+		model = new_lda_model(corpus->num_terms, NTOPICS);
 		model->alpha = INITIAL_ALPHA;
-	} else if (strcmp(start, "fixed")==0) {
-	    model = new_lda_model(corpus->num_terms, NTOPICS);
 		ss = new_lda_suffstats(model);
-		corpus_initialize_fixed_ss(ss, model, corpus);
-		if (VERBOSE) {
-		    lda_mle(model, ss, 0);
-		} else {
-		    quiet_lda_mle(model, ss, 0);
-		}
-		model->alpha = INITIAL_ALPHA;
-	} else if (strcmp(start, "random")==0) {
-		model = new_lda_model(corpus->num_terms, NTOPICS);
-		ss = new_lda_suffstats(model);
 		random_initialize_ss(ss, model);
 		if (VERBOSE) {
-		    lda_mle(model, ss, 0);
+      lda_mle(model, ss, 0);
 		} else {
-		    quiet_lda_mle(model, ss, 0);
+      quiet_lda_mle(model, ss, 0);
 		}
-		model->alpha = INITIAL_ALPHA;
 	} else {
 		model = load_lda_model(start);
 		ss = new_lda_suffstats(model);
 	}
 
@@ -510,16 +542,15 @@
 	model_loaded = TRUE;
 
 	// run expectation maximization
 
 	int i = 0;
-	double likelihood, likelihood_old = 0, converged = 1;
+	double likelihood = 0.0, likelihood_old = 0, converged = 1;
 
 	while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
 		i++;
-		if (VERBOSE)
-		    printf("**** em iteration %d ****\n", i);
+		if (VERBOSE) printf("**** em iteration %d ****\n", i);
 		likelihood = 0;
 		zero_initialize_ss(ss, model);
 
 		// e-step
 
@@ -527,36 +558,37 @@
 			if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
 			likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
 		}
 
 		// m-step
+    if (VERBOSE) {
+      lda_mle(model, ss, ESTIMATE_ALPHA);
+    } else {
+      quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+    }
 
-        if (VERBOSE) {
-            lda_mle(model, ss, ESTIMATE_ALPHA);
-        } else {
-            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
-
 		// check for convergence
 
 		converged = (likelihood_old - likelihood) / (likelihood_old);
 		if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
 		likelihood_old = likelihood;
 
 		// store model and likelihood
 
 		last_model = model;
 		last_gamma = var_gamma;
-        last_phi = phi;
+    last_phi = phi;
 	}
 
 	// output the final model
 
 	last_model = model;
 	last_gamma = var_gamma;
-    last_phi = phi;
+  last_phi = phi;
 
+  free_lda_suffstats(model,ss);
+
 	// output the word assignments (for visualization)
 	/*
 	char filename[100];
 	sprintf(filename, "%s/word-assignments.dat", directory);
 	FILE* w_asgn_file = fopen(filename, "w");
@@ -583,10 +615,11 @@
  *  * est_alpha
  */
 static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
 	INITIAL_ALPHA = NUM2DBL(init_alpha);
 	NTOPICS = NUM2INT(num_topics);
+  if( NTOPICS < 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
 	VAR_MAX_ITER = NUM2INT(max_iter);
 	VAR_CONVERGED = (float)NUM2DBL(convergence);
 	EM_MAX_ITER = NUM2INT(em_max_iter);
 	EM_CONVERGED = (float)NUM2DBL(em_convergence);
 	ESTIMATE_ALPHA = NUM2INT(est_alpha);
@@ -796,12 +829,15 @@
 		c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
 		c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
 		c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
 		c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
 		for (j = 0; j < c->docs[i].length; j++) {
-			VALUE one_word = NUM2INT(rb_ary_entry(words, j));
-			VALUE one_count = NUM2INT(rb_ary_entry(counts, j));
+			int one_word = NUM2INT(rb_ary_entry(words, j));
+			int one_count = NUM2INT(rb_ary_entry(counts, j));
+      if( one_word > c->num_terms ) {
+        rb_raise(rb_eRuntimeError, "error term count(%d) less then word index(%d)", c->num_terms, one_word);
+      }
 			c->docs[i].words[j] = one_word;
 			c->docs[i].counts[j] = one_count;
 		}
 	}
 	
@@ -848,16 +884,17 @@
         return Qnil;
     
     VALUE arr = rb_ary_new2(last_corpus->num_docs);
     int i = 0, j = 0, k = 0;
     
-    int max_length = max_corpus_length(last_corpus);
+    //int max_length = max_corpus_length(last_corpus);
+    short error = 0;
     
     for (i = 0; i < last_corpus->num_docs; i++) {
         VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
         
-        lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi);
+        lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
         
         for (j = 0; j < last_corpus->docs[i].length; j++) {
             VALUE arr2 = rb_ary_new2(last_model->num_topics);
             
             for (k = 0; k < last_model->num_topics; k++) {
@@ -966,6 +1003,6 @@
   rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
   rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
   rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
 }
 
-#endif
\ No newline at end of file
+#endif