lib/ssml2mp3/builder.rb in ssml2mp3-0.1.0 vs lib/ssml2mp3/builder.rb in ssml2mp3-0.1.1

- old
+ new

@@ -1,9 +1,8 @@ require "aws-sdk-polly" require "logger" require "nokogiri" -require "htmlentities" require "expeditor" require "concurrent" require "tmpdir" module Ssml2mp3 @@ -24,11 +23,10 @@ executor: Concurrent::ThreadPoolExecutor.new( min_threads: 0, max_threads: @max_threads, ) ) - @htmlentities = HTMLEntities.new end def synthesize_file(ssml_path, mp3_path) basename = File.basename(mp3_path, ".mp3") ssml = File.read(ssml_path) @@ -84,43 +82,46 @@ def split_ssml(ssml) doc = Nokogiri::XML.parse(tweak_ssml(ssml)) elements = doc.root.children header = (%r((.+<speak[^>]+>))m === ssml && $1) - split_ssml_(elements, "", []).map do |body_ssml| - header + body_ssml + "</speak>" - end - end - def split_ssml_(elements, buffer, results) - if elements.empty? - return buffer.size > 0 ? results << buffer : results - end + results = [] + buffer = "" - element = elements.shift + while elements.size > 0 do + element = elements.shift - case element - when Nokogiri::XML::Text - text = @htmlentities.encode(element.text) - when String - text = @htmlentities.encode(element) - else - return split_ssml_(elements, buffer + element.to_s, results) - end + case element + when Nokogiri::XML::Text + text = html_encode(element.text) + when String + text = html_encode(element) + else + buffer += element.to_s + next + end - if text_size(text) > POLLY_TEXT_LENGTH_LIMIT - split_texts = text.chars.each_slice(POLLY_TEXT_LENGTH_LIMIT).map(&:join) - elements = split_texts + elements - return split_ssml_(split_texts + elements, buffer, results) - end + if text.size > POLLY_TEXT_LENGTH_LIMIT + split_texts = text.chars.each_slice(POLLY_TEXT_LENGTH_LIMIT).map(&:join) + elements = split_texts + elements + next + end - if text_size(buffer + text) > POLLY_TEXT_LENGTH_LIMIT - results << buffer - buffer = "" + if text_size(buffer + text) > POLLY_TEXT_LENGTH_LIMIT + results << buffer + buffer = "" + end + + buffer += text end - split_ssml_(elements, buffer + text, results) + results << buffer if buffer.size > 0 + + results.map do |body_ssml| + header + body_ssml + "</speak>" + end end def text_size(text) text.gsub("</?[^>]+>", '').size end @@ -129,8 +130,12 @@ ssml. gsub("\n", ""). gsub("<p>", ""). gsub("</p>", '<break strength="strong"/>'). gsub(/([」】)』])/, '\1<break strength="strong"/>') + end + + def html_encode(text) + text.gsub(/</, "&lt;").gsub(/>/, "&gt;") end end end