Sha256: a36151b26b6c895c23db1094f44499a0a8f2724bfa05579ff4e566193372c57d
Contents?: true
Size: 1.79 KB
Versions: 4
Compression:
Stored size: 1.79 KB
Contents
module Ebooks
  class MarkovModel
    # Special token marking newline/^/$ boundaries
    INTERIM = :interim

    attr_accessor :tokens
    attr_reader :depth

    # Map nil, newline, and empty tokens to the INTERIM boundary token
    def represent(token)
      if token.nil? || token == "\n" || token.empty?
        INTERIM
      else
        token
      end
    end

    # Build the n-gram model (up to `depth`-grams) from an array of
    # token sequences, inserting INTERIM at each sequence boundary
    def consume(tokenized, depth=2)
      @tokens = [INTERIM]
      @depth = depth

      tokenized.each do |tokens|
        @tokens += tokens
        @tokens << INTERIM
      end

      @model = {}
      @tokens.each_with_index do |token, i|
        # Collect up to `depth` preceding tokens, oldest first
        prev_tokens = []
        @depth.downto(1) do |j|
          next if i - j < 0
          prev_tokens << represent(@tokens[i - j])
        end

        # Record this token as a possible successor of each suffix n-gram
        1.upto(@depth) do |j|
          break if j > prev_tokens.length
          ngram = prev_tokens.last(j)
          # NOTE: ngram is an Array, so `ngram == INTERIM` is always
          # false and this guard never filters anything
          unless ngram == INTERIM && prev_tokens[-1] == INTERIM
            @model[ngram] ||= []
            @model[ngram] << represent(token)
          end
        end
      end

      self
    end

    # Extend `tokens` by sampling a successor, preferring the longest
    # matching n-gram; stops when an INTERIM boundary is sampled
    def chain(tokens)
      next_token = nil
      @depth.downto(1) do |i|
        next if tokens.length < i
        matches = @model[tokens.last(i)]
        if matches
          next_token = matches.sample
          break
        end
      end

      raise ArgumentError, "no continuation found" if next_token.nil?

      if next_token == INTERIM
        tokens
      else
        chain(tokens + [next_token])
      end
    end

    # Seed with a token that follows a boundary, chain until another
    # boundary, then reassemble the tokens into text
    def generate
      tokens = chain([@model[[INTERIM]].sample])
      NLP.reconstruct(tokens)
    end

    def serialize
      { 'model' => @model, 'depth' => @depth }
    end

    def deserialize(data)
      @model = data['model']
      @depth = data['depth']
      self
    end
  end
end
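A minimal usage sketch, not part of the stored file: it assumes the tokenized input is an array of sentences, each an array of word strings, and stubs NLP.reconstruct (defined elsewhere in the gem) as a plain space-join.

# Usage sketch (assumptions: sentences are arrays of word strings;
# NLP.reconstruct lives elsewhere in the gem and is stubbed here).
module Ebooks
  module NLP
    def self.reconstruct(tokens)
      tokens.join(' ') # stand-in for the gem's real detokenizer
    end
  end
end

sentences = [
  %w[the cat sat on the mat],
  %w[the dog sat on the log],
]

model = Ebooks::MarkovModel.new.consume(sentences, 2)
puts model.generate       # e.g. "the dog sat on the mat"

data = model.serialize    # plain Hash of model + depth, easy to dump
restored = Ebooks::MarkovModel.new.deserialize(data)
puts restored.generate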
Version data entries
4 entries across 4 versions & 1 rubygem