lib/stamina/sample.rb in stamina-0.3.1 vs lib/stamina/sample.rb in stamina-0.4.0
- old
+ new
@@ -26,13 +26,14 @@
def self.[](*args) Sample.new << args end # factory shortcut: creates a new Sample and pushes the whole argument list to it with <<
#
# Creates a sample, empty unless _strings_ is given (each element is then added with <<).
#
- def initialize()
+ def initialize(strings = nil)
@strings = [] # starts with no strings
@size, @positive_count, @negative_count = 0, 0, 0 # all counters start at zero
+ strings.each{|s| self << s } unless strings.nil? # optional pre-fill: every given element goes through <<
end
#
# Returns true if this sample does not contain any string,
# false otherwise.
@@ -173,18 +174,94 @@
each do |str|
signature << (str.unlabeled? ? '?' : str.positive? ? '1' : '0')
end
signature
end
+
+ #
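+ # Randomly takes a proportion of this sample and returns it as a new Sample.
+ # Each string visited by each_positive and each_negative is kept independently
+ # when Kernel.rand < _proportion_, so the size of the result is only
+ # approximately _proportion_ times the size of this sample.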
+ #
+ def take(proportion = 0.5)
+ taken = Stamina::Sample.new # the receiver is left untouched; strings are copied into a fresh sample
+ each_positive{|s| taken << s if Kernel.rand < proportion} # one independent random draw per positive string
+ each_negative{|s| taken << s if Kernel.rand < proportion} # same draw for each negative string
+ taken
+ end
#
# Prints an ADL description of this sample on the buffer.
#
def to_adl(buffer="")
self.inject(buffer) {|memo,str| memo << "\n" << str.to_adl} # fold over the strings, appending a newline then each string's ADL form
end
alias :to_s :to_adl
alias :inspect :to_adl
+
+ #
+ # Converts a Sample to an (augmented) prefix tree acceptor. This method ensures
+ # that the states of the PTA are in lexical order, according to the <code><=></code>
+ # operator defined on symbols. States reached by negative strings are tagged as
+ # non accepting and error.
+ #
+ def self.to_pta(sample) # builds a PTA from _sample_, then returns a fresh Automaton whose states follow the BFS index
+ thepta = Automaton.new do |pta|
+ initial_state = add_state(:initial => true, :accepting => false) # NOTE(review): no explicit receiver, unlike pta.add_state below -- presumably the block runs with the automaton as self; confirm
+
+ # Fill the PTA with each string of the sample
+ sample.each do |str|
+ # walk the string through the PTA built so far (result unpacked as parsed / reached / remaining)
+ parsed, reached, remaining = pta.dfa_split(str, initial_state)
- end # class Sample
+ # some symbols are left over -> extend the PTA with one fresh state per remaining symbol
+ unless remaining.empty?
+ remaining.each do |symbol|
+ newone = pta.add_state(:initial => false, :accepting => false, :error => false)
+ pta.connect(reached, newone, symbol)
+ reached = newone
+ end
+ end
+
+ # flag the last state: accepting for a positive string, error otherwise (NOTE(review): this includes unlabeled strings -- confirm intended)
+ str.positive? ? reached.accepting! : reached.error!
+
+ # Check consistency. This should not happen, as Sample does not allow
+ # inconsistencies; it should occur only if _sample_ is not a Sample
+ # instance but some other enumerable.
+ raise(InconsistencyError, "Inconsistent sample on #{str}", caller)\
+ if (reached.error? and reached.accepting?)
+ end
+ # Reindex states in breadth-first order from the initial state, following out edges sorted by symbol
+ to_index, index = [initial_state], 0
+ until to_index.empty?
+ state = to_index.shift
+ state[:__index__] = index
+ state.out_edges.sort{|e,f| e.symbol<=>f.symbol}.each{|e| to_index << e.target}
+ index += 1
+ end
+ end
+
+ # Now we rebuild a fresh automaton with the states in index order.
+ # This looks more efficient than reordering the states of the PTA.
+ Automaton.new do |ordered|
+ ordered.add_n_states(thepta.state_count)
+ thepta.each_state do |pta_state|
+ source = ordered.ith_state(pta_state[:__index__])
+ source.initial! if pta_state.initial?
+ source.accepting! if pta_state.accepting?
+ source.error! if pta_state.error?
+ pta_state.out_edges.each do |e|
+ target = ordered.ith_state(e.target[:__index__])
+ ordered.connect(source, target, e.symbol)
+ end
+ end
+ end
+
+ end
+
+ # Convenient shortcut for Sample.to_pta(sample_instance)
+ def to_pta
+ Sample.to_pta(self) # delegates to the class-level builder, passing this sample
+ end
+
+ end # class Sample
end # module Stamina