module Wukong
  class Processor

    # A widget which filters input records according to some
    # criterion.
    #
    # This base class passes every record; subclasses narrow the
    # behavior by overriding #select?.
    class Filter < Processor

      # Process a `record` by yielding it only if it should be
      # selected by this filter.
      #
      # @param [Object] record an input record
      # @yield [record] yielded if this record should pass the filter
      # @yieldparam [Object] record
      # @see #select?
      # @see #reject?
      def process(record)
        yield(record) if select?(record)
      end

      # Should the given `record` be passed by this filter?
      #
      # @param [Object] record
      # @return [true, false]
      # @see #reject?
      def select?(record)
        true
      end

      # Should the given `record` be rejected by this filter?  Always
      # the logical negation of #select?.
      #
      # @param [Object] record
      # @return [true, false]
      # @see #select?
      def reject?(record)
        !select?(record)
      end

      register
    end

    # A widget which passes all records, i.e. - it acts just like
    # `cat`.
    #
    # @example Pass all records unmodified on the command line
    #
    #   $ cat input
    #   1
    #   2
    #   3
    #   $ cat input | wu-local identity
    #   1
    #   2
    #   3
    #
    # @example Pass all records unmodified in a dataflow
    #
    #   Wukong.dataflow(:uses_identity) do
    #     ... | identity | ...
    #   end
    #
    # @see Filter
    # @see Null
    class Identity < Filter
      register
    end

    # A widget which doesn't pass any records, i.e. - it acts just
    # like /dev/null.
    #
    # @example Filter all records on the command line
    #
    #   $ cat input
    #   1
    #   2
    #   3
    #   $ cat input | wu-local null
    #
    # @example Filter all records from a dataflow
    #
    #   Wukong.dataflow(:uses_null) do
    #     ... | null | ...
    #   end
    #
    # @see Filter
    # @see Identity
    class Null < Filter

      # Prevents any records from passing because it always returns
      # `false`.
      #
      # @param [Object] record
      # @return false
      def select?(record)
        false
      end

      register
    end

    # A widget which only passes records if they match a regular
    # expression.
    #
    # @example Passing records which match a given expression on the command-line
    #
    #   $ cat input
    #   apple
    #   banana
    #   cat
    #   $ cat input | wu-local regexp --match='^a'
    #   apple
    #
    # @example Passing records which match a given expression in a dataflow
    #
    #   Wukong.dataflow(:uses_regexp) do
    #     ... | regexp(match: /^a/) | ...
    #   end
    #
    # @see Filter
    # @see NotRegexpFilter
    class RegexpFilter < Filter

      # The regular expression to use to match records.
      field :match, Regexp

      # Selects a `record` only if its string form (`record.to_s`)
      # matches this widget's `match` field.  When no `match` is
      # configured every record is selected.
      #
      # @param [Object] record
      # @return [true, false]
      def select?(record)
        return true unless match

        # `=~` yields a match offset or nil; normalize it to the
        # documented boolean.
        !(match =~ record.to_s).nil?
      end

      register(:regexp)
    end

    # A widget which only passes records if they *don't* match a
    # regular expression.
    #
    # @example Passing records which don't match a given expression on the command-line
    #
    #   $ cat input
    #   apple
    #   banana
    #   cat
    #   $ cat input | wu-local not_regexp --match='^a'
    #   banana
    #   cat
    #
    # @example Passing records which don't match a given expression in a dataflow
    #
    #   Wukong.dataflow(:uses_not_regexp) do
    #     ... | not_regexp(match: /^a/) | ...
    #   end
    #
    # @see Filter
    # @see RegexpFilter
    class NotRegexpFilter < RegexpFilter

      # Select a `record` only if its string form (`record.to_s`)
      # doesn't match this widget's `match` field.  When no `match` is
      # configured every record is selected (this is not simply the
      # negation of RegexpFilter#select?).
      #
      # @param [Object] record
      # @return [true, false]
      def select?(record)
        return true unless match

        (match =~ record.to_s).nil?
      end

      register(:not_regexp)
    end

    # A widget which only lets a certain number of records through.
    #
    # @example Letting the first 3 records through on the command-line
    #
    #   $ cat input
    #   1
    #   2
    #   3
    #   4
    #   $ cat input | wu-local limit --max=3
    #   1
    #   2
    #   3
    #
    # @example Letting the first 3 records through in a dataflow
    #
    #   Wukong.dataflow(:uses_limit) do
    #     ... | limit(max: 3) | ...
    #   end
    #
    # @see Filter
    class Limit < Filter

      # The maximum number of records to let pass.  Defaults to
      # infinity, i.e. no limit.
      field :max, Integer, :default => Float::INFINITY

      # The current record count.
      attr_accessor :count

      # Initializes the record count to zero.
      def setup
        self.count = 0
      end

      # Select a record only if we're below the max count.  Increments
      # the count for this widget on every call, whether or not the
      # record is selected.
      #
      # @param [Object] record
      # @return [true, false]
      def select?(record)
        # Start from zero if #setup has not run (for instance when a
        # subclass overrides it without calling super) instead of
        # failing on a nil counter.
        @count ||= 0
        keep = @count < max
        @count += 1
        keep
      end

      register
    end

    # A widget which samples a certain fraction of input records.
    #
    # @example Sampling records on the command line (which records pass is random)
    #
    #   $ cat input
    #   1
    #   2
    #   3
    #   4
    #   $ cat input | wu-local sample --fraction=0.5
    #   1
    #   3
    #
    # @example Sampling records in a dataflow
    #
    #   Wukong.dataflow(:uses_sample) do
    #     ... | sample(fraction: 0.5) | ...
    #   end
    #
    # @see Filter
    # @see Limit
    class Sample < Filter

      # The fraction of records to let pass.  Should be between 0.0
      # and 1.0: it is compared against `rand`, which lies in
      # [0.0, 1.0), so values of 1.0 or more pass every record and
      # values of 0.0 or less pass none.
      field :fraction, Float, :default => 1.0

      # Selects a `record` randomly, with a probability given by the
      # `fraction` for this widget.
      #
      # @param [Object] record
      # @return [true, false]
      def select?(record)
        rand < fraction
      end

      register
    end

    # A widget useful for creating filters on the fly in a dataflow.
    #
    # When writing a filtering processor out as a class, just use the
    # DSL for creating processors:
    #
    # @example Creating a select filter the usual way
    #
    #   Wukong.processor(:my_filter, Wukong::Processor::Filter) do
    #     def select? record
    #       record.length > 3
    #     end
    #   end
    #
    # When in a dataflow, sometimes it's easier to create a processor
    # like this on the fly.
    #
    # @example Creating a select filter on the fly in a dataflow
    #
    #   Wukong.dataflow(:my_flow) do
    #     ... | select { |record| record.length > 3 } | ...
    #   end
    #
    # @see Filter
    # @see Reject
    class Select < Filter

      # Selects the given `record` by delegating to the
      # `perform_action` method, which will automatically be
      # populated by the block used to create this filter in the
      # dataflow DSL.
      #
      # @param [Object] record
      # @return [Object] whatever `perform_action` returns; #process
      #   only tests it for truthiness
      # @see Processor#perform_action
      def select?(record)
        perform_action(record)
      end

      register
    end

    # A widget useful for creating filters on the fly in a dataflow.
    # It is the inverse of Select: a record passes only when
    # `perform_action` returns a falsy value for it.
    #
    # @see Select
    class Reject < Filter

      # Rejects the given `record` by delegating to the
      # `perform_action` method: the record is selected only when
      # `perform_action` returns `nil` or `false`.
      #
      # @param [Object] record
      # @return [true, false]
      # @see Processor#perform_action
      def select?(record)
        !perform_action(record)
      end

      register
    end
  end
end