lib/alf.rb in alf-0.9.0 vs lib/alf.rb in alf-0.9.1

- old
+ new

@@ -1,7 +1,8 @@ require "enumerator" require "stringio" +require "set" require "alf/version" require "alf/loader" # # Classy data-manipulation dressed in a DSL (+ commandline) @@ -236,11 +237,12 @@ @ordering.find{|arg| arg.first == attr}.last end def compare(t1,t2) @ordering.each do |attr,order| - comp = (t1[attr] <=> t2[attr]) + x, y = t1[attr], t2[attr] + comp = x.respond_to?(:<=>) ? (x <=> y) : (x.to_s <=> y.to_s) comp *= -1 if order == :desc return comp unless comp == 0 end return 0 end @@ -277,119 +279,11 @@ # def self.lispy(env = Alf::Environment.default) Command::Main.new(env) end - # - # Implements a small LISP-like DSL on top of Alf. # - # The lispy dialect is the functional one used in .alf files and in compiled - # expressions as below: - # - # Alf.lispy.compile do - # (restrict :suppliers, lambda{ city == 'London' }) - # end - # - # The DSL this module provides is part of Alf's public API and won't be broken - # without a major version change. The module itself and its inclusion pre- - # conditions are not part of the DSL itself, thus not considered as part of - # the API, and may therefore evolve at any time. In other words, this module - # is not intended to be directly included by third-party classes. - # - module Lispy - - # The environment - attr_accessor :environment - - # - # Compiles a query expression given by a String or a block and returns - # the result (typically a tuple iterator) - # - def compile(expr = nil, &block) - expr.nil? ? instance_eval(&block) : instance_eval(expr) - end - - # Delegated to the environment - def dataset(name) - raise "Environment not set" unless @environment - @environment.dataset(name) - end - - # - # Compiles the subexpression given by the block in the context of - # additional temporary expressions given by definitions - # - def with(definitions) - # We branch with the definitions for compilation - self.environment = environment.branch(definitions) - - # this is to ensure that sub definitions can reuse other - # ones - definitions.each_value do |defn| - defn.environment = self.environment - end - - # compile now - op = compile(&Proc.new) - - # We now unbranch for next expression - self.environment = environment.unbranch - - op - end - - # - # Chains some elements as a new operator - # - def chain(*elements) - elements = elements.reverse - elements[1..-1].inject(elements.first) do |c, elm| - elm.pipe(c, environment) - elm - end - end - - [ :Autonum, :Clip, :Compact, :Defaults, :Sort ].each do |op_name| - meth_name = Tools.ruby_case(op_name).to_sym - define_method(meth_name) do |child, *args| - chain(Operator::NonRelational.const_get(op_name).new(*args), child) - end - end - - [:Project, - :Extend, - :Rename, - :Restrict, - :Nest, - :Unnest, - :Group, - :Ungroup, - :Summarize, - :Quota ].each do |op_name| - meth_name = Tools.ruby_case(op_name).to_sym - define_method(meth_name) do |child, *args| - chain(Operator::Relational.const_get(op_name).new(*args), child) - end - end - - def allbut(child, attributes) - chain(Operator::Relational::Project.new(attributes, true), child) - end - - [ :Join, - :Union, - :Intersect, - :Minus ].each do |op_name| - meth_name = Tools.ruby_case(op_name).to_sym - define_method(meth_name) do |left, right, *args| - chain(Operator::Relational.const_get(op_name).new(*args), [left, right]) - end - end - - end # module Lispy - - # # Encapsulates the interface with the outside world, providing base iterators # for named datasets, among others. # # An environment is typically obtained through the factory defined by this # class: @@ -572,28 +466,39 @@ # # @param [Object] input the iterator input, at discretion of the Iterator # implementation. # @param [Environment] environment an optional environment for resolving # named datasets if needed. + # @return [Object] self # def pipe(input, environment = nil) + self end undef :pipe - # # Coerces something to an iterator # - def self.coerce(arg, env) + def self.coerce(arg, environment = nil) case arg when Iterator, Array arg else - Reader.coerce(arg, env) + Reader.coerce(arg, environment) end end + # + # Converts this iterator to an in-memory Relation. + # + # @return [Relation] a relation instance, as the set of tuples + # that would be yield by this iterator. + # + def to_rel + Relation::coerce(self) + end + end # module Iterator # # Implements an Iterator at the interface with the outside world. # @@ -648,24 +553,29 @@ clazz.new(*args) end end # - # Returns a reader instance for a specific file whose path is given - # as argument. + # When filepath is a String, returns a reader instance for a specific file + # whose path is given as argument. Otherwise, delegate the call to + # <code>coerce(filepath)</code> # # @param [String] filepath path to a file for which extension is recognized # @param [Array] args optional additional arguments that must be passed at # reader's class new method. # @return [Reader] a reader instance # def self.reader(filepath, *args) - ext = File.extname(filepath) - if registered = @@readers.find{|r| r[1].include?(ext)} - registered[2].new(filepath, *args) + if filepath.is_a?(String) + ext = File.extname(filepath) + if registered = @@readers.find{|r| r[1].include?(ext)} + registered[2].new(filepath, *args) + else + raise "No registered reader for #{ext} (#{filepath})" + end else - raise "No registered reader for #{ext} (#{filepath})" + coerce(filepath) end end # # Coerces an argument to a reader, using an optional environment to convert @@ -713,10 +623,11 @@ # # (see Iterator#pipe) # def pipe(input, env = environment) @input = input + self end # # (see Iterator#each) # @@ -732,10 +643,18 @@ end protected # + # Returns the input file path, or nil if this Reader is bound to an IO + # directly. + # + def input_path + input.is_a?(String) ? input : nil + end + + # # Coerces the input object to an IO and yields the block with it. # # StringIO and IO input are yield directly while file paths are first # opened in read mode and then yield. # @@ -820,11 +739,11 @@ # class AlfFile < Reader # (see Reader#each) def each - op = Alf.lispy(environment).compile(input_text) + op = Alf.lispy(environment).compile(input_text, input_path) op.each(&Proc.new) end Reader.register(:alf, [".alf"], self) end # module AlfFile @@ -921,10 +840,11 @@ # This method mimics {Iterator#pipe} and have the same contract. # def pipe(input, env = environment) self.environment = env self.input = input + self end # # Executes the rendering, outputting the resulting tuples on the provided # output buffer. @@ -1031,21 +951,22 @@ # }} # # See '#{program_name} help COMMAND' for details about a specific command. # class Main < Quickl::Delegator(__FILE__, __LINE__) - include Command, Lispy + include Command # Environment instance to use to get base iterators attr_accessor :environment # Output renderer attr_accessor :renderer # Creates a command instance def initialize(env = Environment.default) @environment = env + extend(Lispy) end # Install options options do |opt| @execute = false @@ -1096,11 +1017,11 @@ end # 3) if there is a requester, then we do the job (assuming bin/alf) # with the renderer to use. Otherwise, we simply return built operator if operator && requester - chain(renderer, operator).execute($stdout) + renderer.pipe(operator, environment).execute($stdout) else operator end end @@ -1136,11 +1057,11 @@ end def execute(args) requester.renderer = @renderer args = [ $stdin ] if args.empty? - requester.chain(*args) + requester.send(:chain,*args) end end # class Show # @@ -1196,11 +1117,46 @@ # # Marker for all operators, relational and non-relational ones. # module Operator include Iterator, Tools + + # + # Yields non-relational then relational operators, in turn. + # + def self.each + Operator::NonRelational.each{|x| yield(x)} + Operator::Relational.each{|x| yield(x)} + end + # + # Encapsulates method that allows making operator introspection, that is, + # knowing operator cardinality and similar stuff. + # + module Introspection + + # + # Returns true if this operator is an unary operator, false otherwise + # + def unary? + ancestors.include?(Operator::Unary) + end + + # + # Returns true if this operator is a binary operator, false otherwise + # + def binary? + ancestors.include?(Operator::Binary) + end + + end # module Introspection + + # Ensures that the Introspection module is set on real operators + def self.included(mod) + mod.extend(Introspection) if mod.is_a?(Class) + end + # # Encapsulates method definitions that convert operators to Quickl # commands # module CommandMethods @@ -1236,11 +1192,11 @@ end self end def split_command_args(args) - operands, args = case i = args.index("--") + case (i = args.index("--")) when NilClass [args, []] when 0 [[ $stdin ], args[1..-1]] else @@ -1326,16 +1282,17 @@ # Sets the operator input # def pipe(input, env = environment) self.environment = env self.datasets = [ input ] + self end protected def command_line_operands(operands) - operands.first + operands.first || $stdin end # # Simply returns the first dataset # @@ -1364,10 +1321,11 @@ # Sets the operator input # def pipe(input, env = environment) self.environment = env self.datasets = input + self end protected def command_line_operands(operands) @@ -1459,10 +1417,11 @@ # Sets the operator input # def pipe(input, env = environment) self.environment = env self.datasets = input + self end protected # (see Operator#_each) @@ -1502,30 +1461,46 @@ # # Marker module and namespace for non relational operators # module Operator::NonRelational + # + # Yields the block with each operator module in turn + # + def self.each + constants.each do |c| + val = const_get(c) + yield(val) if val.ancestors.include?(Operator::NonRelational) + end + end + # - # Extend with an unique autonumber attribute + # Extend its operand with an unique autonumber attribute # # SYNOPSIS - # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME] # - # API & EXAMPLE + # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME] # - # # Autonumber suppliers (:autonum attribute name by default) - # (autonum :suppliers) + # DESCRIPTION # - # # You can specify the attribute name + # This non-relational operator guarantees uniqueness of output tuples by + # adding an attribute called 'ATTRNAME' whose value is an Integer. No + # guarantee is given about ordering of output tuples, nor to the fact + # that this autonumber is sequential. Only that all values are different. + # If the presence of duplicates was the only "non-relational" aspect of + # input tuples, the result may be considered a valid relation representation. + # + # IN RUBY + # + # (autonum OPERAND, ATTRNAME = :autonum) + # + # (autonum :suppliers) # (autonum :suppliers, :unique_id) # - # DESCRIPTION + # IN SHELL # - # This operator takes input tuples in any order they come and extends them - # with an autonumber attribute ATTRNAME. This allows converting non-relational - # tuple enumerators to relational ones by ensuring uniqueness of tuples in an - # arbitrary manner. + # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME] # # alf autonum suppliers # alf autonum suppliers -- unique_id # class Autonum < Factory::Operator(__FILE__, __LINE__) @@ -1843,10 +1818,19 @@ # Marker module and namespace for relational operators # module Operator::Relational # + # Yields the block with each operator module in turn + # + def self.each + constants.each do |c| + val = const_get(c) + yield(val) if val.ancestors.include?(Operator::Relational) + end + end + # Relational projection (clip + compact) # # SYNOPSIS # #{program_name} #{command_name} [OPERAND] -- ATTR1 ATTR2 ... # @@ -2307,39 +2291,39 @@ end end # class Union # - # Relational nesting (tuple-valued attributes) + # Relational wraping (tuple-valued attributes) # # SYNOPSIS # #{program_name} #{command_name} [OPERAND] -- ATTR1 ATTR2 ... NEWNAME # # API & EXAMPLE # - # (nest :suppliers, [:city, :status], :loc_and_status) + # (wrap :suppliers, [:city, :status], :loc_and_status) # # DESCRIPTION # - # This operator nests attributes ATTR1 to ATTRN as a new, tuple-based - # attribute whose name is NEWNAME. When used in shell, names of nested + # This operator wraps attributes ATTR1 to ATTRN as a new, tuple-based + # attribute whose name is NEWNAME. When used in shell, names of wrapped # attributes are taken from commandline arguments, expected the last one # which defines the new name to use: # - # alf nest suppliers -- city status loc_and_status + # alf wrap suppliers -- city status loc_and_status # - class Nest < Factory::Operator(__FILE__, __LINE__) + class Wrap < Factory::Operator(__FILE__, __LINE__) include Operator::Relational, Operator::Transform - # Array of nesting attributes + # Array of wraping attributes attr_accessor :attributes - # New name for the nested attribute + # New name for the wrapped attribute attr_accessor :as - # Builds a Nest operator instance - def initialize(attributes = [], as = :nested) + # Builds a Wrap operator instance + def initialize(attributes = [], as = :wrapped) @attributes = attributes @as = as end protected @@ -2356,40 +2340,40 @@ others = tuple_collect(tuple.keys - @attributes){|k| [k,tuple[k]] } others[as] = tuple_collect(attributes){|k| [k, tuple[k]] } others end - end # class Nest + end # class Wrap # - # Relational un-nesting (inverse of nest) + # Relational un-wraping (inverse of wrap) # # SYNOPSIS # #{program_name} #{command_name} [OPERAND] -- ATTR # # API & EXAMPLE # - # # Assuming nested = (nest :suppliers, [:city, :status], :loc_and_status) - # (unnest nested, :loc_and_status) + # # Assuming wrapped = (wrap :suppliers, [:city, :status], :loc_and_status) + # (unwrap wrapped, :loc_and_status) # # DESCRIPTION # - # This operator unnests the tuple-valued attribute named ATTR so as to + # This operator unwraps the tuple-valued attribute named ATTR so as to # flatten its pairs with 'upstream' tuple. The latter should be such so that # no name collision occurs. When used in shell, the name of the attribute to - # unnest is taken as the first commandline argument: + # unwrap is taken as the first commandline argument: # - # alf unnest nest -- loc_and_status + # alf unwrap wrap -- loc_and_status # - class Unnest < Factory::Operator(__FILE__, __LINE__) + class Unwrap < Factory::Operator(__FILE__, __LINE__) include Operator::Relational, Operator::Transform - # Name of the attribute to unnest + # Name of the attribute to unwrap attr_accessor :attribute # Builds a Rename operator instance - def initialize(attribute = :nested) + def initialize(attribute = :wrapped) @attribute = attribute end protected @@ -2400,15 +2384,15 @@ end # (see Operator::Transform#_tuple2tuple) def _tuple2tuple(tuple) tuple = tuple.dup - nested = tuple.delete(@attribute) || {} - tuple.merge(nested) + wrapped = tuple.delete(@attribute) || {} + tuple.merge(wrapped) end - end # class Unnest + end # class Unwrap # # Relational grouping (relation-valued attributes) # # SYNOPSIS @@ -2462,21 +2446,21 @@ end # See Operator#_prepare def _prepare pkey = ProjectionKey.new(attributes, !allbut) - @index = Hash.new{|h,k| h[k] = []} + @index = Hash.new{|h,k| h[k] = Set.new} each_input_tuple do |tuple| key, rest = pkey.split(tuple) @index[key] << rest end end # See Operator#_each def _each @index.each_pair do |k,v| - yield(k.merge(@as => v)) + yield(k.merge(@as => Relation.coerce(v))) end end end # class Group @@ -2535,20 +2519,24 @@ # # Relational summarization (group-by + aggregate ops) # # SYNOPSIS - # #{program_name} #{command_name} [OPERAND] --by=KEY1,KEY2... -- AGG1 EXPR1... + # #{program_name} #{command_name} [OPERAND] [--allbut] --by=KEY1,KEY2... -- AGG1 EXPR1... # # OPTIONS # #{summarized_options} # # API & EXAMPLE # # (summarize :supplies, [:sid], # :total_qty => Aggregator.sum(:qty)) # + # # Or, to specify an allbut projection + # (summarize :supplies, [:qty, :pid], + # :total_qty => Aggregator.sum(:qty), true) + # # DESCRIPTION # # This operator summarizes input tuples on the projection on KEY1,KEY2,... # attributes and applies aggregate operators on sets of matching tuples. # Introduced names AGG should be disjoint from KEY attributes. @@ -2556,30 +2544,38 @@ # When used in shell, the aggregations are taken from commandline arguments # AGG and EXPR, where AGG is the name of a new attribute and EXPR is an # aggregation expression evaluated on Aggregator: # # alf summarize supplies --by=sid -- total_qty "sum(:qty)" + # alf summarize supplies --allbut --by=pid,qty -- total_qty "sum(:qty)" # class Summarize < Factory::Operator(__FILE__, __LINE__) include Operator::Relational, Operator::Shortcut, Operator::Unary # By attributes attr_accessor :by + # Allbut on by? + attr_accessor :allbut + # Aggregations as a AGG => Aggregator(EXPR) hash attr_accessor :aggregators - def initialize(by = [], aggregators = {}) + def initialize(by = [], aggregators = {}, allbut = false) @by = by + @allbut = allbut @aggregators = aggregators end # Installs the options options do |opt| opt.on('--by=x,y,z', 'Specify by attributes', Array) do |args| @by = args.collect{|a| a.to_sym} end + opt.on('--allbut', 'Make an allbut projection/summarization') do + @allbut = true + end end # Summarizes according to a complete order class SortBased include Alf::Operator::Cesure @@ -2611,11 +2607,46 @@ end receiver.call key.merge(@aggs) end end # class SortBased + + # Summarizes in-memory with a hash + class HashBased + include Operator::Relational, Operator::Unary + + attr_reader :by_key + attr_reader :aggregators + def initialize(by_key, aggregators) + @by_key, @aggregators = by_key, aggregators + end + + protected + + def _each + index = Hash.new do |h,k| + h[k] = tuple_collect(@aggregators) do |a,agg| + [a, agg.least] + end + end + each_input_tuple do |tuple| + key, rest = by_key.split(tuple) + index[key] = tuple_collect(@aggregators) do |a,agg| + [a, agg.happens(index[key][a], tuple)] + end + end + index.each_pair do |key,aggs| + aggs = tuple_collect(@aggregators) do |a,agg| + [a, agg.finalize(aggs[a])] + end + yield key.merge(aggs) + end + end + + end + protected # (see Operator::CommandMethods#set_args) def set_args(args) @aggregators = tuple_collect(args.each_slice(2)) do |a,expr| @@ -2623,14 +2654,20 @@ end self end def longexpr - by_key = Tools::ProjectionKey.new(@by, false) - chain SortBased.new(by_key, @aggregators), - Operator::NonRelational::Sort.new(by_key.to_ordering_key), - datasets + if @allbut + by_key = Tools::ProjectionKey.new(@by, @allbut) + chain HashBased.new(by_key, @aggregators), + datasets + else + by_key = Tools::ProjectionKey.new(@by, @allbut) + chain SortBased.new(by_key, @aggregators), + Operator::NonRelational::Sort.new(by_key.to_ordering_key), + datasets + end end end # class Summarize # @@ -2906,16 +2943,16 @@ def initialize(*attrs) super(nil, {}){ Tools.tuple_collect(attrs){|k| [k, self.send(k)] } } end - def least(); []; end + def least(); Set.new; end def _happens(memo, val) memo << val end def finalize(memo) - memo.uniq + Relation.coerce memo end end # # Defines a COLLECT aggregation operator @@ -2942,11 +2979,10 @@ def finalize(memo) options[:before].to_s + memo + options[:after].to_s end end - Lispy::Agg = Aggregator end # class Aggregator # # Base class for implementing buffers. # @@ -2979,6 +3015,121 @@ end # class Buffer::Sorted end # class Buffer + # + # Implements a small LISP-like DSL on top of Alf. + # + # The lispy dialect is the functional one used in .alf files and in compiled + # expressions as below: + # + # Alf.lispy.compile do + # (restrict :suppliers, lambda{ city == 'London' }) + # end + # + # The DSL this module provides is part of Alf's public API and won't be broken + # without a major version change. The module itself and its inclusion pre- + # conditions are not part of the DSL itself, thus not considered as part of + # the API, and may therefore evolve at any time. In other words, this module + # is not intended to be directly included by third-party classes. + # + module Lispy + + alias :ruby_extend :extend + + # The environment + attr_accessor :environment + + # + # Compiles a query expression given by a String or a block and returns + # the result (typically a tuple iterator) + # + # Example + # + # # with a string + # op = compile "(restrict :suppliers, lambda{ city == 'London' })" + # + # # or with a block + # op = compile { + # (restrict :suppliers, lambda{ city == 'London' }) + # } + # + # @param [String] expr a Lispy expression to compile + # @return [Iterator] the iterator resulting from compilation + # + def compile(expr = nil, path = nil, &block) + if expr.nil? + instance_eval(&block) + else + (path ? Kernel.eval(expr, binding, path) : Kernel.eval(expr, binding)) + end + end + + # + # Evaluates a query expression given by a String or a block and returns + # the result as an in-memory relation (Alf::Relation) + # + # Example: + # + # # with a string + # rel = evaluate "(restrict :suppliers, lambda{ city == 'London' })" + # + # # or with a block + # rel = evaluate { + # (restrict :suppliers, lambda{ city == 'London' }) + # } + # + def evaluate(expr = nil, path = nil, &block) + compile(expr, path, &block).to_rel + end + + # + # Delegated to the current environment + # + # This method returns the dataset associated to a given name. The result + # may depend on the current environment, but is generally an Iterator, + # often a Reader instance. + # + # @param [Symbol] name name of the dataset to retrieve + # @return [Iterator] the dataset as an iterator + # @see Environment#dataset + # + def dataset(name) + raise "Environment not set" unless @environment + @environment.dataset(name) + end + + # Functional equivalent to Alf::Relation[...] + def relation(*tuples) + Relation.coerce(tuples) + end + + # + # Install the DSL through iteration over defined operators + # + Operator::each do |op_class| + meth_name = Tools.ruby_case(Tools.class_name(op_class)).to_sym + if op_class.unary? + define_method(meth_name) do |child, *args| + child = Iterator.coerce(child, environment) + op_class.new(*args).pipe(child, environment) + end + elsif op_class.binary? + define_method(meth_name) do |left, right, *args| + operands = [left, right].collect{|x| Iterator.coerce(x, environment)} + op_class.new(*args).pipe(operands, environment) + end + else + raise "Unexpected operator #{op_class}" + end + end # Operators::each + + def allbut(child, attributes) + (project child, attributes, true) + end + + Agg = Alf::Aggregator + end # module Lispy + end # module Alf +require "alf/relation"