# frozen_string_literal: true require_relative '../phonetics' require 'json' module Phonetics class CodeGenerator attr_reader :writer def initialize(writer = STDOUT) @writer = writer end def generate_phonetic_cost_c_code generator = PhoneticCost.new(writer) generator.generate writer.flush end def generate_next_phoneme_length_c_code generator = NextPhonemeLength.new(writer) generator.generate writer.flush end private # Turn the bytes of all phonemes into a lookup trie where a sequence of # bytes can find a phoneme in linear time. def phoneme_byte_trie phoneme_byte_trie_for(Phonetics.phonemes) end def phoneme_byte_trie_for(phonemes) phonemes.each_with_object({}) do |phoneme, trie| phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)| subtrie[byte] ||= {} # If we've reached the end of the byte string if phoneme.bytes.length - 1 == idx # Check if this is a duplicate lookup path. If there's a collision # then this whole approach makes no sense. if subtrie[byte].key?(:source) source = subtrie[byte][:source] raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})" else subtrie[byte][:source] = phoneme end end subtrie[byte] end end end def ruby_source location = caller_locations.first "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}" end def describe(phoneme, depth) indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}" if Phonetics::Consonants.features.key?(phoneme) indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}" else indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}" end end def indent(depth, line) write " #{' ' * depth}#{line}" end def write(line) writer.puts line end end class PhoneticCost < CodeGenerator # We find the phonetic distance between two phonemes using a compiled # lookup table. This is implemented as a set of nested switch statements. # Hard to read when compiled, but simple to generate and fast at runtime. # # We generate a `phonetic_cost` function that takes four arguments: Two # strings, and the lengths of those strings. Each string should be exactly # one valid phoneme, which is possible thanks to the (also generated) # next_phoneme_length() function. # # This will print a C code file with a function that implements a multil-level C # switch like the following: # # switch (phoneme1_length) { # case 2: # switch(string1[1]) { # case 201: // first byte of "ɪ" # switch(string1[3]) { # case 170: // second and final byte of "ɪ" # // Phoneme: "ɪ", bytes: [201, 170] # // vowel features: {"F1":300,"F2":2100,"rounded":false} # switch(string2[6]) { # case 105: // first and only byte of "i" # // Phoneme: "i", bytes: [105] # // vowel features: {"F1":240,"F2":2400,"rounded":false} # return (float) 0.14355381904337383; # break; # # the distance of ("ɪ", "i")2 is therefore 0.14355 # def generate write(<<-HEADER.gsub(/^ {6}/, '')) // This is compiled from Ruby, in #{ruby_source} #include #include #include "./phonemes.h" float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) { HEADER write ' switch (phoneme1_length) {' by_byte_length.each do |length, phonemes| write " case #{length}:" switch_phoneme1(phoneme_byte_trie_for(phonemes), 0) write ' break;' end write ' }' write ' return (float) 1.0;' write '};' write '' end def switch_phoneme1(trie, depth = 0) indent depth, "switch(string1[string1_offset + #{depth}]) {" trie.each do |key, subtrie| next if key == :source next if subtrie.empty? indent depth + 1, "case #{key}:" phoneme1 = subtrie[:source] # If this could be a match of a phoneme1 then find phoneme2 if phoneme1 # Add a comment to help understand the dataset describe(phoneme1, depth + 2) if phoneme1 by_byte_length.each do |_, phonemes| byte_trie = phoneme_byte_trie_for(phonemes) next if byte_trie.empty? switch_phoneme2(byte_trie, phoneme1, 0) end else switch_phoneme1(subtrie, depth + 1) end indent depth + 2, 'break;' end indent depth, '}' end def switch_phoneme2(trie, previous_phoneme, depth = 0) indent depth, "switch(string2[string2_offset + #{depth}]) {" trie.each do |key, subtrie| next if key == :source next if subtrie.empty? phoneme2 = subtrie[:source] indent depth + 1, "case #{key}:" if phoneme2 value = if previous_phoneme == phoneme2 0.0 else distance(previous_phoneme, phoneme2) end # Add a comment to help understand the dataset describe(phoneme2, depth + 2) indent depth + 2, "return (float) #{value};" else switch_phoneme2(subtrie, previous_phoneme, depth + 1) end indent depth + 2, 'break;' end indent depth, '}' end def by_byte_length Phonetics.phonemes.group_by do |phoneme| phoneme.bytes.length end.sort_by(&:first) end def distance(p1, p2) Phonetics.distance_map[p1][p2] end end class NextPhonemeLength < CodeGenerator # There's no simple way to break a string of IPA characters into phonemes. # We do it by generating a function that, given a string of IPA characters, # the starting index in that string, and the length of the string, returns # the length of the next phoneme, or zero if none is found. # # Pseudocode: # - return 0 if length - index == 0 # - switch on first byte, matching on possible first bytes of phonemes # within the selected case statement: # - return 1 if length - index == 1 # - switch on second byte, matching on possible second bytes of phonemes # within the selected case statement: # - return 2 if length - index == 1 # ... # - default case: return 2 iff a phoneme terminates here # - default case: return 1 iff a phoneme terminates here # - return 0 # def generate write(<<-HEADER.gsub(/^ {6}/, '')) // This is compiled from Ruby, in #{ruby_source} int next_phoneme_length(int *string, int cursor, int length) { int max_length; max_length = length - cursor; HEADER next_phoneme_switch(phoneme_byte_trie, 0) # If we fell through all the cases, return 0 write ' return 0;' write '}' end private # Recursively build switch statements for the body of next_phoneme_length def next_phoneme_switch(trie, depth) # switch (string[cursor + depth]) { # case N: // for N in subtrie.keys # // if a case statement matches the current byte AND there's chance # // that a longer string might match, recurse. # if (max_length >= depth) { # // recurse # } # break; # // if there's a :source key here then a phoneme terminates at this # // point and this depth is a valid return value. # default: # return depth; # break; # } indent depth, "switch(string[cursor + #{depth}]) {" write '' trie.each do |key, subtrie| next if key == :source next if subtrie.empty? indent depth, "case #{key}:" # Add a comment to help understand the dataset describe(subtrie[:source], depth + 1) if subtrie[:source] if subtrie.keys == [:source] indent depth, " return #{depth + 1};" else indent depth, " if (max_length > #{depth + 1}) {" next_phoneme_switch(subtrie, depth + 1) indent depth, ' } else {' indent depth, " return #{depth + 1};" indent depth, ' }' end indent depth, ' break;' end if trie.key?(:source) indent depth, ' default:' indent depth, " return #{depth};" end indent depth, '}' end end end