ext/c_levenshtein/levenshtein.c in phonetics-1.5.4 vs ext/c_levenshtein/levenshtein.c in phonetics-1.8.0
- old
+ new
@@ -1,38 +1,50 @@
-#include "ruby.h"
#include <stdbool.h>
+#include "ruby.h"
+#include "ruby/encoding.h"
+#include "ruby/re.h"
+#include "./phonemes.h"
+#include "./next_phoneme_length.h"
#include "./phonetic_cost.h"
#define debug(M, ...) if (verbose) printf(M, ##__VA_ARGS__)
VALUE Binding = Qnil;
/* Function declarations */
void Init_c_levenshtein();
-void set_initial(double *d, int *string1, int string1_length, int *string2, int string2_length, bool verbose);
-void print_matrix(double *d, int *string1, int string1_length, int *string2, int string2_length, bool verbose);
+void set_initial(double *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose);
+void print_matrix(double *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose);
VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _string2, VALUE _verbose);
/* Function implemitations */
void Init_c_levenshtein() {
Binding = rb_define_module("PhoneticsLevenshteinCBinding");
rb_define_method(Binding, "internal_phonetic_distance", method_internal_phonetic_distance, 3);
}
VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _string2, VALUE _verbose){
+ if (!RB_TYPE_P(_string1, T_STRING)) {
+ rb_raise(rb_eArgError, "must pass string as first argument");
+ }
+ if (!RB_TYPE_P(_string2, T_STRING)) {
+ rb_raise(rb_eArgError, "must pass string as second argument");
+ }
- VALUE *string1_ruby = RARRAY_PTR(_string1);
- VALUE *string2_ruby = RARRAY_PTR(_string2);
bool verbose = _verbose;
- int string1_length = (int) RARRAY_LEN(_string1);
- int string2_length = (int) RARRAY_LEN(_string2);
- // We name them as 'strings' but in C-land we're representing our strings as
- // arrays of `int`s, where each int represents a consistent (if unusual)
- // encoding of a grapheme cluster (a symbol for a phoneme).
+ int string1_length = (int) RSTRING_LEN(_string1);
+ int string2_length = (int) RSTRING_LEN(_string2);
+
+ // Given the input strings, we count the phonemes in each and store both the
+ // total and, in a phoneme_sizes array, the length of each.
+ int string1_phoneme_count = 0;
+ int string2_phoneme_count = 0;
+ int string1_phoneme_sizes[string1_length + 1];
+ int string2_phoneme_sizes[string2_length + 1];
int string1[string1_length + 1];
int string2[string2_length + 1];
double *d; // The (flattened) 2-dimensional matrix
// underlying this algorithm
@@ -40,163 +52,179 @@
double distance; // Return value of this function
double min, delete, // Reusable cost calculations
insert, replace,
cost;
int i, j; // Frequently overwritten loop vars
+ int string1_offset = 0;
+ int string2_offset = 0;
- // Guard clause for two empty strings
- if (string1_length == 0 && string2_length == 0)
- return DBL2NUM(0.0);
-
- //
- // Intial data setup
- //
-
for (i = 0; i < string1_length; i++) {
- string1[i] = NUM2INT(string1_ruby[i]);
- debug("string1[%d] = %d\n", i, string1[i]);
+ string1[i] = (RSTRING_PTR(_string1)[i] & 0xff);
}
- for (j = 0; j < string2_length; j++) {
- string2[j] = NUM2INT(string2_ruby[j]);
- debug("string2[%d] = %d\n", i, string2[j]);
+ for (i = 0; i < string2_length; i++) {
+ string2[i] = RSTRING_PTR(_string2)[i] & 0xff;
}
- // one-dimensional representation of 2 dimentional array len(string1)+1 *
- // len(string2)+1
- d = malloc((sizeof(double)) * (string1_length+1) * (string2_length+1));
+ find_phonemes(string1, string1_length, &string1_phoneme_count, string1_phoneme_sizes);
+ find_phonemes(string2, string2_length, &string2_phoneme_count, string2_phoneme_sizes);
- //
- // Fill in the (flattened) matrix using the Levenshtein algorithm so we can
- // pluck the lowest-cost edit distance (stored in the lower-right corner, in
- // this case the last spot in the array)
- //
+ // Guard clauses for empty strings
+ if (string1_phoneme_count == 0 && string2_phoneme_count == 0)
+ return DBL2NUM(0.0);
+ // one-dimensional representation of 2 dimensional array
+ d = malloc((sizeof(double)) * (string1_phoneme_count+1) * (string2_phoneme_count+1));
+
// First, set the top row and left column of the matrix using the sequential
// phonetic edit distance of string1 and string2, respectively
- set_initial(d, string1, string1_length, string2, string2_length, verbose);
+ set_initial(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
- debug("before:\n");
- print_matrix(d, string1, string1_length, string2, string2_length, verbose);
+ print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
- // Then walk through the matrix and fill in each cell with the lowest-cost
- // phonetic edit distance for that matrix cell.
+ // Then Fill in the (flattened) matrix using the Levenshtein algorithm so we can
+ // pluck the lowest-cost edit distance (stored in the lower-right corner, in
+ // this case the last spot in the array).
+ // We'll use phonetic distance instead of '1' as the edit cost.
+ //
// (Skipping i=0 and j=0 because set_initial filled in all cells where i
// or j are zero-valued)
- for (j = 1; j <= string2_length; j++){
- for (i = 1; i <= string1_length; i++){
+ for (j = 1; j <= string2_phoneme_count; j++){
+ for (i = 1; i <= string1_phoneme_count; i++){
+
// The cost of deletion or addition is the Levenshtein distance
// calculation (the value in the cell to the left, upper-left, or above)
// plus the phonetic distance between the sound we're moving from to the
// new one.
- debug("------- %d/%d (%d) \n", i, j, j*(string1_length+1) + i);
+ debug("------- %d/%d (%d) \n", i, j, j*(string1_phoneme_count+1) + i);
- cost = phonetic_cost(string1[i-1], string2[j-1]);
- debug("phonetic cost of %d to %d is %f\n", string1[i-1], string2[j-1], cost);
+ // TODO: increment i and j by the phoneme lengths
+ cost = phonetic_cost(string1, string1_offset, string1_phoneme_sizes[i-1], string2, string2_offset, string2_phoneme_sizes[j-1]);
- insert = d[j*(string1_length+1) + i-1];
+ insert = d[j*(string1_phoneme_count+1) + i-1];
debug("insert proposes cell %d,%d - %f\n", i-1, j, insert);
min = insert;
debug("min (insert): %f\n", min);
- delete = d[(j-1)*(string1_length+1) + i];
+ delete = d[(j-1)*(string1_phoneme_count+1) + i];
debug("delete proposes cell %d,%d - %f\n", i, j-1, delete);
if (delete < min) {
debug("delete is %f, better than %f for %d/%d\n", delete, min, i, j);
min = delete;
}
- replace = d[(j-1)*(string1_length+1) + i-1];
+ replace = d[(j-1)*(string1_phoneme_count+1) + i-1];
debug("replace proposes cell %d,%d - %f\n", i-1, j-1, replace);
if (replace < min) {
debug("replace is %f, better than %f for %d/%d\n", replace, min, i, j);
min = replace;
}
- d[(j * (string1_length+1)) + i] = min + cost;
+ d[(j * (string1_phoneme_count+1)) + i] = min + cost;
debug("\n");
- print_matrix(d, string1, string1_length, string2, string2_length, verbose);
+ print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
+
+ string1_offset += string1_phoneme_sizes[i-1];
}
+ string1_offset = 0;
+ string2_offset += string2_phoneme_sizes[j-1];
}
// The final element in the `d` array is the value of the shortest path from
// the top-left to the bottom-right of the matrix.
- distance = d[(string1_length + 1) * (string2_length + 1) - 1];
+ distance = d[(string1_phoneme_count + 1) * (string2_phoneme_count + 1) - 1];
free(d);
debug("distance: %f\n", distance);
return DBL2NUM(distance);
}
-
// Set the minimum scores equal to the distance between each phoneme,
// sequentially.
//
// The first value is always zero.
// The second value is always the phonetic distance between the first
// phonemes of each string.
// Subsequent values are the cumulative phonetic distance between each
// phoneme within the same string.
// "aek" -> [0.0, 1.0, 1.61, 2.61]
-void set_initial(double *d, int *string1, int string1_length, int *string2, int string2_length, bool verbose) {
+void set_initial(double *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose) {
double initial_distance;
+ int string1_offset = 0;
+ int string2_offset = 0;
int i, j;
- if (string1_length == 0 || string2_length == 0) {
+ if (string1_phoneme_count == 0 || string2_phoneme_count == 0) {
initial_distance = 0.0;
} else {
initial_distance = 1.0;
}
// The top-left is 0, the cell to the right and down are each 1 to start
d[0] = (double) 0.0;
- if (string1_length > 0) {
+ if (string1_phoneme_count > 0) {
d[1] = initial_distance;
}
- if (string2_length > 0) {
- d[string1_length+1] = initial_distance;
+ if (string2_phoneme_count > 0) {
+ d[string1_phoneme_count+1] = initial_distance;
}
- debug("string1 length: %d\n", string1_length);
- for (i=2; i <= string1_length; i++) {
+ debug("string1 phoneme count: %d\n", string1_phoneme_count);
+
+ for (i=2; i <= string1_phoneme_count; i++) {
// The cost of adding the next phoneme is the cost so far plus the phonetic
// distance between the previous one and the current one.
- d[i] = d[i-1] + phonetic_cost(string1[i-2], string1[i-1]);
+ d[i] = d[i-1] +
+ phonetic_cost(string1, string1_offset, string1_phoneme_sizes[i-2], string1, string1_offset + string1_phoneme_sizes[i-2], string1_phoneme_sizes[i-1]);
+ string1_offset += string1_phoneme_sizes[i-2];
}
- debug("string2 length: %d\n", string2_length);
- for (j=2; j <= string2_length; j++) {
+
+ debug("string2 phoneme count: %d\n", string2_phoneme_count);
+
+ for (j=2; j <= string2_phoneme_count; j++) {
// The same exact pattern down the left side of the matrix
- d[j * (string1_length+1)] = d[(j - 1) * (string1_length+1)] + phonetic_cost(string2[j-2], string2[j-1]);
+ d[j * (string1_phoneme_count+1)] = d[(j - 1) * (string1_phoneme_count+1)] +
+ phonetic_cost(string2, string2_offset, string2_phoneme_sizes[j-2], string2, string2_offset + string2_phoneme_sizes[j-2], string2_phoneme_sizes[j-1]);
+ string2_offset += string2_phoneme_sizes[j-1];
}
- // And zero out the rest. If you're reading this please edit this to be
- // faster.
- for (j=1; j <= string2_length; j++) {
- for (i=1; i <= string1_length; i++) {
- d[j * (string1_length+1) + i] = (double) 0.0;
+ // And zero out the rest. If you're reading this please show me a way to do
+ // this faster.
+ for (j=1; j <= string2_phoneme_count; j++) {
+ for (i=1; i <= string1_phoneme_count; i++) {
+ d[j * (string1_phoneme_count+1) + i] = (double) 0.0;
}
}
}
// A handy visualization for developers
-void print_matrix(double *d, int *string1, int string1_length, int *string2, int string2_length, bool verbose) {
+void print_matrix(double *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose) {
+
int i, j;
- debug(" ");
- for (i=0; i < string1_length; i++) {
- debug("%8.d ", string1[i]);
+ int string1_offset = 0;
+ int string2_offset = 0;
+
+ if (!verbose)
+ return;
+
+ printf(" ");
+ for (i=0; i < string1_phoneme_count; i++) {
+ print_phoneme(string1, string1_offset, string1_phoneme_sizes[i], 9);
+ string1_offset += string1_phoneme_sizes[i];
}
- debug("\n");
- for (j=0; j <= string2_length; j++) {
+ printf("\n");
+ for (j=0; j <= string2_phoneme_count; j++) {
if (j==0) {
- debug(" ");
+ printf(" ");
} else {
- debug("%4.d ", string2[j-1]);
+ print_phoneme(string2, string2_offset, string2_phoneme_sizes[j-1], 2);
+ string2_offset += string2_phoneme_sizes[j-1];
+ }
+ for (i=0; i <= string1_phoneme_count; i++) {
+ printf("%f ", d[j * (string1_phoneme_count+1) + i]) ;
}
- for (i=0; i <= string1_length; i++) {
- debug("%f ", d[j * (string1_length+1) + i]) ;
- }
- debug("\n");
+ printf("\n");
}
}