lib/phonetics.rb in phonetics-1.1.1 vs lib/phonetics.rb in phonetics-1.5.0
- old
+ new
@@ -221,26 +221,103 @@
Symbols = Consonants.phonemes.reduce({}) {|acc, p| acc.update p => :consonant }.merge(
Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
)
def distance(phoneme1, phoneme2)
- types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
- if types == [:consonant, :vowel]
- 1.0
- elsif types == [:vowel, :vowel]
- Vowels.distance(phoneme1, phoneme2)
- elsif types == [:consonant, :consonant]
- Consonants.distance(phoneme1, phoneme2)
- end
+ return 0 if phoneme1 == phoneme2
+ distance_map.fetch(phoneme1).fetch(phoneme2)
end
def distance_map
@distance_map ||= (
Vowels.phonemes + Consonants.phonemes
).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} } ) do |pair, scores|
p1, p2 = *pair
- score = distance(p1, p2)
+ score = _distance(p1, p2)
scores[p1][p2] = score
scores[p2][p1] = score
+ end
+ end
+
+ # as_utf_8_long("aɰ̊ h")
+ # => [97, 8404, 32, 104]
+ def as_utf_8_long(string)
+ string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
+ end
+
+ # Encode individual multi-byte strings as a single integer.
+ #
+ # "ɰ̊".unpack('U*')
+ # => [624, 778]
+ #
+ # grapheme_as_utf_8_long("ɰ̊")
+ # => 1413 (624 + (10 * 778))
+ def grapheme_as_utf_8_long(grapheme)
+ grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
+ total += (10**i) * byte
+ end
+ end
+
+ # This will print a C code file with a function that implements a two-level C
+ # switch like the following:
+ #
+ # switch (a) {
+ # case 100: // 'd'
+ # switch (b) {
+ # case 618: // 'ɪ'
+ # return (float) 0.73827;
+ # break;
+ # }
+ # }
+ #
+ def generate_phonetic_cost_c_code(writer = STDOUT)
+ # First, flatten the bytes of the runes (unicode codepoints encoded via
+ # UTF-8) into single integers. We do this by adding the utf-8 values, each
+ # multiplied by 10 * their byte number. The specific encoding doesn't
+ # matter so long as it's:
+ # * consistent
+ # * has no collisions
+ # * produces a value that's a valid C case conditional
+ # * can be applied to runes of input strings later
+ integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
+ acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
+ acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
+ end)
+ end
+
+ # Then we print out C code full of switches
+
+ writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
+ float phonetic_cost(int a, int b) {
+ // This is compiled from Ruby, using `String#unpack("U")` on each character
+ // to retrieve the UTF-8 codepoint as a C long value.
+ if (a == b) { return 0.0; };
+ FUNC
+ writer.puts ' switch (a) {'
+ integer_distance_map.each do |(a, a_i), distances|
+ writer.puts " case #{a_i}: // #{a}"
+ writer.puts ' switch (b) {'
+ distances.each do |(b, b_i), distance|
+ writer.puts " case #{b_i}: // #{a}->#{b}"
+ writer.puts " return (float) #{distance};"
+ writer.puts " break;"
+ end
+ writer.puts ' }'
+ end
+ writer.puts ' }'
+ writer.puts ' return 1.0;'
+ writer.puts '}'
+ end
+
+ private
+
+ def _distance(phoneme1, phoneme2)
+ types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
+ if types == [:consonant, :vowel]
+ 1.0
+ elsif types == [:vowel, :vowel]
+ Vowels.distance(phoneme1, phoneme2)
+ elsif types == [:consonant, :consonant]
+ Consonants.distance(phoneme1, phoneme2)
end
end
end