lib/phonetics.rb in phonetics-1.5.4 vs lib/phonetics.rb in phonetics-1.8.0
- old
+ new
@@ -110,26 +110,10 @@
end
module Consonants
extend self
- # Plosives and fricatives are less similar than trills and flaps, or
- # sibilant fricatives and non-sibilant fricatives
- # TODO: this is unfinished and possibly a bad idea
- MannerDistances = {
- 'Nasal' => %w[continuant],
- 'Stop' => %w[],
- 'Sibilant fricative' => %w[continuant fricative],
- 'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
- 'Approximant' => %w[],
- 'Tap/Flap' => %w[],
- 'Trill' => %w[],
- 'Lateral fricative' => %w[continuant fricative],
- 'Lateral approximant' => %w[],
- 'Lateral tap/flap' => %w[],
- }.freeze
-
# This chart (columns 2 through the end, anyway) is a direct port of
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
# We store the consonant table in this format to make updating it easier.
#
# rubocop:disable Layout/TrailingWhitespace
@@ -216,11 +200,11 @@
penalty
end
end
def phonemes
- Consonants.phonemes + Vowels.phonemes
+ Vowels.phonemes + Consonants.phonemes
end
Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
)
@@ -230,87 +214,15 @@
distance_map.fetch(phoneme1).fetch(phoneme2)
end
def distance_map
- @distance_map ||= (
- Vowels.phonemes + Consonants.phonemes
- ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
+ @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
p1, p2 = *pair
score = _distance(p1, p2)
scores[p1][p2] = score
scores[p2][p1] = score
end
- end
-
- # as_utf_8_long("aɰ̊ h")
- # => [97, 8404, 32, 104]
- def as_utf_8_long(string)
- string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
- end
-
- # Encode individual multi-byte strings as a single integer.
- #
- # "ɰ̊".unpack('U*')
- # => [624, 778]
- #
- # grapheme_as_utf_8_long("ɰ̊")
- # => 1413 (624 + (10 * 778))
- def grapheme_as_utf_8_long(grapheme)
- grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
- total + (10**i) * byte
- end
- end
-
- # This will print a C code file with a function that implements a two-level C
- # switch like the following:
- #
- # switch (a) {
- # case 100: // 'd'
- # switch (b) {
- # case 618: // 'ɪ'
- # return (float) 0.73827;
- # break;
- # }
- # }
- #
- def generate_phonetic_cost_c_code(writer = STDOUT)
- # First, flatten the bytes of the runes (unicode codepoints encoded via
- # UTF-8) into single integers. We do this by adding the utf-8 values, each
- # multiplied by 10 * their byte number. The specific encoding doesn't
- # matter so long as it's:
- # * consistent
- # * has no collisions
- # * produces a value that's a valid C case conditional
- # * can be applied to runes of input strings later
- integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
- acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
- acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
- end)
- end
-
- # Then we print out C code full of switches
-
- writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
- float phonetic_cost(int a, int b) {
- // This is compiled from Ruby, using `String#unpack("U")` on each character
- // to retrieve the UTF-8 codepoint as a C long value.
- if (a == b) { return 0.0; };
- FUNC
- writer.puts ' switch (a) {'
- integer_distance_map.each do |(a, a_i), distances|
- writer.puts " case #{a_i}: // #{a}"
- writer.puts ' switch (b) {'
- distances.each do |(b, b_i), distance|
- writer.puts " case #{b_i}: // #{a}->#{b}"
- writer.puts " return (float) #{distance};"
- writer.puts ' break;'
- end
- writer.puts ' }'
- end
- writer.puts ' }'
- writer.puts ' return 1.0;'
- writer.puts '}'
end
private
def _distance(phoneme1, phoneme2)