lib/huge_enumerable.rb in huge_enumerable-0.0.1 vs lib/huge_enumerable.rb in huge_enumerable-0.0.2

- old
+ new

@@ -1,357 +1,362 @@ -require "huge_enumerable/version" - -require 'backports' if RUBY_VERSION < '1.9' -require 'prime' -require 'prime_miller_rabin' - -Prime::MillerRabin.speed_intercept - -# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite) -# data sets without requiring them to be in memory. -# In addition to enumerable, abilities it also allows for shuffling, sampling, shifting, and popping as if it were -# an array. These actions also do not require for the entire data set to be in memory. Nor do they alter the original -# data set in any fashion. -# -# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch. -# collection_size should return the size of the full data set. -# fetch should return the value at the given index. -# It is guaranteed that fetch will always be called with values in the range of (0...collection_size) -# It will never be called with a negative index or with an index >= collection_size -class HugeEnumerable - - include Enumerable - - # Currently 100,000 elements - DEFAULT_MAX_ARRAY_SIZE=100000 - - # The maximum number of elements to be returned when to_a is called. - # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller. - attr_accessor :max_array_size - - # The random number generator to use for shuffles and samples. Defaults to self#rand. - attr_accessor :rng - - # Create a new HugeEnumerable - # - # ==== Options - # - # * +:max_array_size+ - The default size of arrays when #to_a is called. - # * +:rng+ - The random number generator to use. - def initialize(max_array_size = nil, rng = nil) - @max_array_size = max_array_size ? max_array_size.to_i : nil - @rng = rng || self.method(:rand) - @collection_increment = 1 - @start_of_sequence = 0 - @shuffle_head = 0 - end - - # Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices. - # Negative indices count backward from the end of the collection (-1 is the last element). - # For start and range cases the starting index is just before an element. - # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection. - # Returns nil if the index (or starting index) are out of range. - # ==== Attributes - # - # * +index_or_range+ - Either an integer for single element selection or length selection, or a range. - # - # ==== Options - # - # * +:length+ - The number of elements to return if index_or_range is not a range. - def [](index_or_range, length=nil) - # TODO: Consider changing this to return HugeCollection - if index_or_range.is_a?(Range) - range = index_or_range - index = nil - else - index = index_or_range.to_i - range = nil - end - - if range - index = range.first - index += size if index < 0 - - length = range.last - index + 1 - length += size if range.last < 0 - length = size - index if index + length > size - - if index < 0 || index > size - nil - elsif length < 0 - [] - else - element_or_array(length) { |i| _fetch(i + index) } - end - elsif length - index += size if index < 0 - length = size - index if index + length > size - if index < 0 || length < 0 - nil - else - element_or_array(length) { |i| _fetch(i + index) } - end - else - _fetch(index) - end - - end - - # Calls the given block once for each element remaining in the collection, passing that element as a parameter. - def collection_each(&block) # :yields: element - # TODO: Return an Enumerator if no block is given - size.times { |i| yield _fetch(i) } - end - - # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself. - # If no block is given, an HugeCombination is returned instead. - # === Caveat - # max_array_size is currently inherited by the generated HugeCombination. This may change in the future. - def combination(n) # :yields: element - random_number_generator = rng != self.method(:rand) ? rng : nil - combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator) - if block_given? - combo.each { |x| yield x } - self - else - combo - end - end - - # Calls the given block once for each element in the next array of the collection, passing that element as a parameter. - def each # :yields: element - # TODO: Return an Enumerator if no block is given - remaining_or(max_array_size).times { |i| yield _fetch(i) } - end - - def max_array_size #:nodoc: - @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min - end - - # Shifts max_array_size elements and returns the following array from to_a. - def next_array - shift(max_array_size) - to_a - end - - # Returns true of the collection contains no more elements. - def empty? - @start_of_sequence == @end_of_sequence - end - - # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself. - # If no block is given, a HugePermutation is returned instead. - # === Caveat - # max_array_size is currently inherited by the generated HugePermutation. This may change in the future. - def permutation(n) # :yields: element - random_number_generator = rng != self.method(:rand) ? rng : nil - perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator) - if block_given? - perm.each { |x| yield x } - self - else - perm - end - end - - # Removes the last element from the collection and returns it, or nil if the collection is empty. - # If a number n is given, returns an array of the last n elements (or less). - def pop(n = nil) - result = element_or_array(n) { pop1 } - n ? result.reverse : result - end - - # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself. - # If no block is given, a HugeProduct is returned instead. - # === Caveat - # max_array_size is currently inherited by the generated HugeProduct. This may change in the future. - # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future. - def product(other_enumerable) # :yields: element - other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable) - random_number_generator = rng != self.method(:rand) ? rng : nil - prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, random_number_generator) - if block_given? - prod.each { |x| yield x } - self - else - prod - end - end - - # Choose a random element or n random elements from the collection. - # The elements are chosen by using random and unique indices into the array in order to ensure - # that an element does not repeat itself unless the collection already contained duplicate elements. - # If the collection is empty the first form returns nil and the second form returns an empty array. - # The optional rng argument will be used as the random number generator. - def sample(*args) - if args.size > 2 - raise ArgumentError, "wrong number of arguments (#{args.size} for 2)" - elsif args.size == 2 - n = args.first - rng = args.last - elsif args.size == 1 - arg = args.first - if arg.is_a?(Proc) || arg.is_a?(Method) - n = 1 - rng = arg - else - n = arg - rng = method(:rand) - end - else - n = nil - rng = method(:rand) - end - - element_or_array(n) { sample1(rng) } - end - - # Removes the first element of the collection and returns it (shifting all other elements down by one). - # Returns nil if the collection is empty. - # If a number n is given, returns an array of the first n elements (or less). - # With collection containing only the remainder elements, not including what was shifted to returned array. - # ==== Options - # * +rng+ - The random number generator to use. Defaults to self#rng. - def shift(n = nil) - element_or_array(n) { shift1 } - end - - # Returns a new HugeEnumerable with the order of the elements of the new collection randomized. - # ==== Options - # * +rng+ - The random number generator to use. Defaults to self#rng. - # ==== Side Effects - # The new collection is reset to the current collection's original size and elements before shuffling. - def shuffle(rng=nil) - self.dup.shuffle!(rng) - end - - # Randomly reorders the elements of the collection. - # ==== Options - # * +rng+ - The random number generator to use. Defaults to self#rng. - # ==== Side Effects - # The collection is reset to its original size and elements before shuffling - def shuffle!(rng=nil) - rng ||= self.rng - reset! - @shuffle_head = rng.call(collection_size) - @collection_increment = full_cycle_increment(collection_size) - self - end - - # Returns the current size of the collection. - # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array. - def size - end_of_sequence - start_of_sequence - end - - protected - - def reset! - @start_of_sequence = 0 - @end_of_sequence = nil - self - end - - private - - attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment - - def collection_size - raise NotImplementedError, "not implemented for #{self.class.name}" - end - - def end_of_sequence - @end_of_sequence ||= collection_size - end - - def fetch(x) - raise NotImplementedError, "not implemented for #{self.class.name}" - end - - def miller_rabin - @miller_rabin ||= Prime::MillerRabin.new - end - - def next_prime(x) - if x < 2 - 2 - elsif x < 3 - 3 - elsif x < 5 - 5 - else - x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2 )) - x += (x % 10 == 3 ? 4 : 2 ) until Prime.prime?(x, miller_rabin) - x - end - end - - def pop1 - result = _fetch(end_of_sequence - start_of_sequence - 1) - @end_of_sequence -= 1 - result - end - - def remaining_or(x) - [x, size].min - end - - def shuffle_index(index) - index ? (shuffle_head + collection_increment * index) % collection_size : nil - end - - def relative_index(index) - index = end_of_sequence + index if index < 0 - index += start_of_sequence - index >= 0 && index < end_of_sequence ? index : nil - end - - def shift1 - result = _fetch(0) - @start_of_sequence += 1 - result - end - - def _fetch(index) - index = shuffle_index(relative_index(index)) - index ? fetch(index) : nil - end - - def sample1(rng) - if @sample_position.nil? || @sample_position >= size - @sample_position = rng.call(size) - else - if @last_sample_size != size - @last_sample_size = size - @sample_increment = full_cycle_increment(size) - end - @sample_position = (@sample_position + @sample_increment) % size - end - _fetch(@sample_position) - end - - def full_cycle_increment(domain_size) - increment = next_prime(( 2 * domain_size / (1 + Math.sqrt(5)) ).to_i) - increment == domain_size ? next_prime(increment + 1) : increment - end - - def element_or_array(n = nil) - unless n.nil? - n = n.to_i - raise ArgumentError, 'negative array size' if n < 0 - end - unless empty? - n ? (0...remaining_or(n)).map { |x| yield(x) } : yield - else - n.nil? ? nil : [] - end - end - -end - -require 'huge_enumerable/huge_collection' -require 'huge_enumerable/huge_combination' -require 'huge_enumerable/huge_permutation' -require 'huge_enumerable/huge_product' - - - +require "huge_enumerable/version" + +require 'backports' if RUBY_VERSION < '1.9' +require 'prime' +require 'prime_miller_rabin' + +Prime::MillerRabin.speed_intercept + +# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite) +# data sets without requiring them to be in memory. +# In addition to enumerable, abilities it also allows for shuffling, sampling, shifting, and popping as if it were +# an array. These actions also do not require for the entire data set to be in memory. Nor do they alter the original +# data set in any fashion. +# +# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch. +# collection_size should return the size of the full data set. +# fetch should return the value at the given index. +# It is guaranteed that fetch will always be called with values in the range of (0...collection_size) +# It will never be called with a negative index or with an index >= collection_size +class HugeEnumerable + + include Enumerable + + # Currently 100,000 elements + DEFAULT_MAX_ARRAY_SIZE=100000 + + # The maximum number of elements to be returned when to_a is called. + # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller. + attr_accessor :max_array_size + + # The random number generator to use for shuffles and samples. Defaults to self#rand. + attr_accessor :rng + + # Create a new HugeEnumerable + # + # ==== Options + # + # * +:max_array_size+ - The default size of arrays when #to_a is called. + # * +:rng+ - The random number generator to use. + def initialize(max_array_size = nil, rng = nil) + @max_array_size = max_array_size ? max_array_size.to_i : nil + @rng = rng || self.method(:rand) + @collection_increment = 1 + @start_of_sequence = 0 + @shuffle_head = 0 + end + + # Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices. + # Negative indices count backward from the end of the collection (-1 is the last element). + # For start and range cases the starting index is just before an element. + # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection. + # Returns nil if the index (or starting index) are out of range. + # ==== Attributes + # + # * +index_or_range+ - Either an integer for single element selection or length selection, or a range. + # + # ==== Options + # + # * +:length+ - The number of elements to return if index_or_range is not a range. + def [](index_or_range, length=nil) + # TODO: Consider changing this to return HugeCollection + if index_or_range.is_a?(Range) + range = index_or_range + index = nil + else + index = index_or_range.to_i + range = nil + end + + if range + index = range.first + index += size if index < 0 + + length = range.last - index + 1 + length += size if range.last < 0 + length = size - index if index + length > size + + if index < 0 || index > size + nil + elsif length < 0 + [] + else + element_or_array(length) { |i| _fetch(i + index) } + end + elsif length + index += size if index < 0 + length = size - index if index + length > size + if index < 0 || length < 0 + nil + else + element_or_array(length) { |i| _fetch(i + index) } + end + else + _fetch(index) + end + + end + + # Calls the given block once for each element remaining in the collection, passing that element as a parameter. + def collection_each(&block) # :yields: element + # TODO: Return an Enumerator if no block is given + size.times { |i| yield _fetch(i) } + end + + # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself. + # If no block is given, an HugeCombination is returned instead. + # === Caveat + # max_array_size is currently inherited by the generated HugeCombination. This may change in the future. + def combination(n) # :yields: element + random_number_generator = rng != self.method(:rand) ? rng : nil + combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator) + if block_given? + combo.each { |x| yield x } + self + else + combo + end + end + + # Calls the given block once for each element in the next array of the collection, passing that element as a parameter. + def each # :yields: element + # TODO: Return an Enumerator if no block is given + remaining_or(max_array_size).times { |i| yield _fetch(i) } + end + + def max_array_size #:nodoc: + @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min + end + + # Shifts max_array_size elements and returns the following array from to_a. + def next_array + shift(max_array_size) + to_a + end + + # Returns true of the collection contains no more elements. + def empty? + @start_of_sequence == @end_of_sequence + end + + # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself. + # If no block is given, a HugePermutation is returned instead. + # === Caveat + # max_array_size is currently inherited by the generated HugePermutation. This may change in the future. + def permutation(n) # :yields: element + random_number_generator = rng != self.method(:rand) ? rng : nil + perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator) + if block_given? + perm.each { |x| yield x } + self + else + perm + end + end + + # Removes the last element from the collection and returns it, or nil if the collection is empty. + # If a number n is given, returns an array of the last n elements (or less). + def pop(n = nil) + result = element_or_array(n) { pop1 } + n ? result.reverse : result + end + + # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself. + # If no block is given, a HugeProduct is returned instead. + # === Caveat + # max_array_size is currently inherited by the generated HugeProduct. This may change in the future. + # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future. + def product(other_enumerable) # :yields: element + other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable) + random_number_generator = rng != self.method(:rand) ? rng : nil + prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, random_number_generator) + if block_given? + prod.each { |x| yield x } + self + else + prod + end + end + + # Choose a random element or n random elements from the collection. + # The elements are chosen by using random and unique indices into the array in order to ensure + # that an element does not repeat itself unless the collection already contained duplicate elements. + # If the collection is empty the first form returns nil and the second form returns an empty array. + # The optional rng argument will be used as the random number generator. + def sample(*args) + if args.size > 2 + raise ArgumentError, "wrong number of arguments (#{args.size} for 2)" + elsif args.size == 2 + n = args.first + rng = args.last + elsif args.size == 1 + arg = args.first + if arg.is_a?(Proc) || arg.is_a?(Method) + n = 1 + rng = arg + else + n = arg + rng = method(:rand) + end + else + n = nil + rng = method(:rand) + end + + element_or_array(n) { sample1(rng) } + end + + # Removes the first element of the collection and returns it (shifting all other elements down by one). + # Returns nil if the collection is empty. + # If a number n is given, returns an array of the first n elements (or less). + # With collection containing only the remainder elements, not including what was shifted to returned array. + # ==== Options + # * +rng+ - The random number generator to use. Defaults to self#rng. + def shift(n = nil) + element_or_array(n) { shift1 } + end + + # Returns a new HugeEnumerable with the order of the elements of the new collection randomized. + # ==== Options + # * +rng+ - The random number generator to use. Defaults to self#rng. + # ==== Side Effects + # The new collection is reset to the current collection's original size and elements before shuffling. + def shuffle(rng=nil) + self.dup.shuffle!(rng) + end + + # Randomly reorders the elements of the collection. + # ==== Options + # * +rng+ - The random number generator to use. Defaults to self#rng. + # ==== Side Effects + # The collection is reset to its original size and elements before shuffling + def shuffle!(rng=nil) + rng ||= self.rng + reset! + @shuffle_head = rng.call(collection_size) + @collection_increment = full_cycle_increment(collection_size) + self + end + + # Returns the current size of the collection. + # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array. + def size + end_of_sequence - start_of_sequence + end + + protected + + def reset! + @start_of_sequence = 0 + @end_of_sequence = nil + self + end + + private + + attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment + + def collection_size + raise NotImplementedError, "not implemented for #{self.class.name}" + end + + def end_of_sequence + @end_of_sequence ||= collection_size + end + + def fetch(x) + raise NotImplementedError, "not implemented for #{self.class.name}" + end + + def miller_rabin + @miller_rabin ||= Prime::MillerRabin.new + end + + def next_prime(x) + if x < 2 + 2 + elsif x < 3 + 3 + elsif x < 5 + 5 + else + x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2 )) + x += (x % 10 == 3 ? 4 : 2 ) until Prime.prime?(x, miller_rabin) + x + end + end + + def pop1 + result = _fetch(end_of_sequence - start_of_sequence - 1) + @end_of_sequence -= 1 + result + end + + def remaining_or(x) + [x, size].min + end + + def shuffle_index(index) + index ? (shuffle_head + collection_increment * index) % collection_size : nil + end + + def relative_index(index) + index = end_of_sequence + index if index < 0 + index += start_of_sequence + index >= 0 && index < end_of_sequence ? index : nil + end + + def shift1 + result = _fetch(0) + @start_of_sequence += 1 + result + end + + def _fetch(index) + index = shuffle_index(relative_index(index)) + index ? fetch(index) : nil + end + + def sample1(rng) + if @sample_position.nil? || @sample_position >= size + @sample_position = rng.call(size) + else + if @last_sample_size != size + @last_sample_size = size + @sample_increment = full_cycle_increment(size) + end + @sample_position = (@sample_position + @sample_increment) % size + end + _fetch(@sample_position) + end + + def full_cycle_increment(domain_size) + increment = next_prime(( 2 * domain_size / (1 + Math.sqrt(5)) ).to_i) + increment == domain_size ? next_prime(increment + 1) : increment + end + + def factorial(x) + x == 0 ? 1 : (1..x).reduce(:*) + end + + + def element_or_array(n = nil) + unless n.nil? + n = n.to_i + raise ArgumentError, 'negative array size' if n < 0 + end + unless empty? + n ? (0...remaining_or(n)).map { |x| yield(x) } : yield + else + n.nil? ? nil : [] + end + end + +end + +require 'huge_enumerable/huge_collection' +require 'huge_enumerable/huge_combination' +require 'huge_enumerable/huge_permutation' +require 'huge_enumerable/huge_product' + + +