lib/stamina/sample.rb in stamina-0.3.1 vs lib/stamina/sample.rb in stamina-0.4.0

- old
+ new

@@ -26,13 +26,16 @@
     def self.[](*args) Sample.new << args end
 
     #
-    # Creates an empty sample.
+    # Creates a sample. It is empty unless _strings_ is given, in which case
+    # each element of _strings_ is added to the new sample with <code><<</code>,
+    # in iteration order.
     #
-    def initialize()
+    def initialize(strings = nil)
       @strings = []
       @size, @positive_count, @negative_count = 0, 0, 0
+      strings.each{|s| self << s } unless strings.nil?
     end
 
     #
     # Returns true if this sample does not contain any string,
     # false otherwise.
@@ -173,18 +176,100 @@
       each do |str|
         signature << (str.unlabeled? ? '?' : str.positive? ? '1' : '0')
       end
       signature
     end
+
+    #
+    # Takes only a given proportion of this sample and returns it as a new Sample.
+    #
+    # Every string yielded by each_positive, then by each_negative, is kept
+    # independently when <code>Kernel.rand < proportion</code>, so the size of
+    # the result varies from one call to the next.
+    #
+    def take(proportion = 0.5)
+      taken = Stamina::Sample.new
+      each_positive{|s| taken << s if Kernel.rand < proportion}
+      each_negative{|s| taken << s if Kernel.rand < proportion}
+      taken
+    end
 
     #
     # Prints an ADL description of this sample on the buffer.
     #
     def to_adl(buffer="")
       self.inject(buffer) {|memo,str| memo << "\n" << str.to_adl}
     end
     alias :to_s :to_adl
     alias :inspect :to_adl
+
+    #
+    # Converts a Sample to an (augmented) prefix tree acceptor. This method ensures
+    # that the states of the PTA are in lexical order, according to the <code><=></code>
+    # operator defined on symbols. States reached by negative strings are tagged as
+    # non accepting and error.
+    #
+    # The state reached by a string is flagged accepting when the string is
+    # positive and error otherwise. Raises an InconsistencyError if a state ends
+    # up flagged both accepting and error. Returns a new Automaton whose states
+    # are numbered in breadth-first order from the initial state, out edges
+    # being followed in increasing symbol order.
+    #
+    def self.to_pta(sample)
+      thepta = Automaton.new do |pta|
+        initial_state = pta.add_state(:initial => true, :accepting => false)
+
+        # Fill the PTA with each string
+        sample.each do |str|
+          # split string using the dfa
+          parsed, reached, remaining = pta.dfa_split(str, initial_state)
 
-  end # class Sample
+          # remaining symbols are not empty -> build the PTA
+          unless remaining.empty?
+            remaining.each do |symbol|
+              newone = pta.add_state(:initial => false, :accepting => false, :error => false)
+              pta.connect(reached, newone, symbol)
+              reached = newone
+            end
+          end
+
+          # flag state
+          str.positive? ? reached.accepting! : reached.error!
+
+          # check consistency, should not happen as Sample does not allow
+          # inconsistencies. Should appear only if _sample_ is not a Sample
+          # instance but some other enumerable.
+          raise(InconsistencyError, "Inconsistent sample on #{str}", caller)\
+            if (reached.error? && reached.accepting?)
+        end
+        # Reindex states by applying BFS
+        to_index, index = [initial_state], 0
+        until to_index.empty?
+          state = to_index.shift
+          state[:__index__] = index
+          state.out_edges.sort{|e,f| e.symbol<=>f.symbol}.each{|e| to_index << e.target}
+          index += 1
+        end
+      end
+
+      # Now we rebuild a fresh one with states in order.
+      # This looks more efficient than reordering states of the PTA
+      Automaton.new do |ordered|
+        ordered.add_n_states(thepta.state_count)
+        thepta.each_state do |pta_state|
+          source = ordered.ith_state(pta_state[:__index__])
+          source.initial! if pta_state.initial?
+          source.accepting! if pta_state.accepting?
+          source.error! if pta_state.error?
+          pta_state.out_edges.each do |e|
+            target = ordered.ith_state(e.target[:__index__])
+            ordered.connect(source, target, e.symbol)
+          end
+        end
+      end
+
+    end
+
+    # Convenient shortcut for Sample.to_pta(sample_instance)
+    def to_pta
+      Sample.to_pta(self)
+    end
+
+  end # class Sample
 
 end # module Stamina