# encoding: utf-8

module Ebooks
  # This generator uses data identical to the markov model, but
  # instead of making a chain by looking up bigrams it uses the
  # positions to randomly replace suffixes in one sentence with
  # matching suffixes in another.
  #
  # Relies on the INTERIM boundary marker and on NLP.subseq?, both of
  # which are resolved from the surrounding Ebooks namespace at runtime.
  class SuffixGenerator
    # Convenience constructor, equivalent to SuffixGenerator.new.
    #
    # @param sentences [Array<Array>] tokenized sentences (arrays of tokens)
    # @return [SuffixGenerator]
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    # Indexes every sentence of at least two tokens.
    #
    # @unigrams maps a token to the [sentence index, position] pairs of
    # the tokens that follow it; @bigrams maps token => following token =>
    # [sentence index, position after the pair]. A position of INTERIM
    # marks a sentence ending.
    #
    # @param sentences [Array<Array>] tokenized sentences
    def initialize(sentences)
      @sentences = sentences.reject { |s| s.length < 2 }
      @unigrams = {}
      @bigrams = {}

      @sentences.each_with_index do |tikis, i|
        last_tiki = INTERIM
        tikis.each_with_index do |tiki, j|
          (@unigrams[last_tiki] ||= []) << [i, j]
          followers = ((@bigrams[last_tiki] ||= {})[tiki] ||= [])

          if j == tikis.length - 1 # Mark sentence endings
            (@unigrams[tiki] ||= []) << [i, INTERIM]
            followers << [i, INTERIM]
          else
            followers << [i, j + 1]
          end

          last_tiki = tiki
        end
      end
    end

    # Picks a random indexed sentence and repeatedly splices the suffix of
    # another, not yet used sentence onto it.
    #
    # @param passes [Integer] maximum number of suffix replacements
    # @param n [Symbol] :unigrams to match splice points on a single token,
    #   anything else to match on a pair of adjacent tokens
    # @return [Array] the resulting token sequence; the untouched starting
    #   sentence if no acceptable splice exists; [] if nothing was indexed
    def generate(passes=5, n=:unigrams)
      # rand(0) yields a Float, which would index to nil and crash below.
      return [] if @sentences.empty?

      index = rand(@sentences.length)
      tikis = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tikis] # Verbatim sentences to avoid reproducing

      passes.times do
        variant = find_variant(tikis, variation_sites(tikis, used, n), used, verbatim)
        # Every alternative was tried and refused; since verbatim only
        # grows, further passes over the same tokens cannot succeed either.
        break unless variant
        tikis = variant
      end

      tikis
    end

    private

    # Maps each splice position in tikis to the usable [sentence, position]
    # alternatives found in the index for that position.
    def variation_sites(tikis, used, n)
      sites = {}

      tikis.each_with_index do |tiki, i|
        next_tiki = tikis[i + 1]
        break if next_tiki.nil?

        candidates = if n == :unigrams
                       @unigrams[next_tiki]
                     else
                       (@bigrams[tiki] || {})[next_tiki]
                     end

        # Filter out sentence endings and suffixes from previous sentences.
        # Non-destructive on purpose: these arrays ARE the index, and
        # filtering them in place would erode the model on every call.
        viable = (candidates || []).reject { |a| a[1] == INTERIM || used.include?(a[0]) }
        sites[i] = viable unless viable.empty?
      end

      sites
    end

    # Tries every alternative of every site in random order and returns the
    # first spliced sequence that passes the verbatim check, or nil.
    # Appends to used and verbatim as a side effect.
    def find_variant(tikis, sites, used, verbatim)
      sites.to_a.shuffle.each do |start, alternatives|
        alternatives.shuffle.each do |alt|
          donor = @sentences[alt[0]]
          verbatim << donor
          potential = tikis[0..start + 1] + donor[alt[1]..-1]

          # Ensure we're not just rebuilding some segment of another sentence
          next if verbatim.any? { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }

          used << alt[0]
          return potential
        end
      end

      nil
    end
  end
end