lib/alf.rb in alf-0.9.0 vs lib/alf.rb in alf-0.9.1
- old
+ new
@@ -1,7 +1,8 @@
require "enumerator"
require "stringio"
+require "set"
require "alf/version"
require "alf/loader"
#
# Classy data-manipulation dressed in a DSL (+ commandline)
@@ -236,11 +237,12 @@
@ordering.find{|arg| arg.first == attr}.last
end
def compare(t1,t2)
@ordering.each do |attr,order|
- comp = (t1[attr] <=> t2[attr])
+ x, y = t1[attr], t2[attr]
+ comp = x.respond_to?(:<=>) ? (x <=> y) : (x.to_s <=> y.to_s)
comp *= -1 if order == :desc
return comp unless comp == 0
end
return 0
end
@@ -277,119 +279,11 @@
#
def self.lispy(env = Alf::Environment.default)
Command::Main.new(env)
end
- #
- # Implements a small LISP-like DSL on top of Alf.
#
- # The lispy dialect is the functional one used in .alf files and in compiled
- # expressions as below:
- #
- # Alf.lispy.compile do
- # (restrict :suppliers, lambda{ city == 'London' })
- # end
- #
- # The DSL this module provides is part of Alf's public API and won't be broken
- # without a major version change. The module itself and its inclusion pre-
- # conditions are not part of the DSL itself, thus not considered as part of
- # the API, and may therefore evolve at any time. In other words, this module
- # is not intended to be directly included by third-party classes.
- #
- module Lispy
-
- # The environment
- attr_accessor :environment
-
- #
- # Compiles a query expression given by a String or a block and returns
- # the result (typically a tuple iterator)
- #
- def compile(expr = nil, &block)
- expr.nil? ? instance_eval(&block) : instance_eval(expr)
- end
-
- # Delegated to the environment
- def dataset(name)
- raise "Environment not set" unless @environment
- @environment.dataset(name)
- end
-
- #
- # Compiles the subexpression given by the block in the context of
- # additional temporary expressions given by definitions
- #
- def with(definitions)
- # We branch with the definitions for compilation
- self.environment = environment.branch(definitions)
-
- # this is to ensure that sub definitions can reuse other
- # ones
- definitions.each_value do |defn|
- defn.environment = self.environment
- end
-
- # compile now
- op = compile(&Proc.new)
-
- # We now unbranch for next expression
- self.environment = environment.unbranch
-
- op
- end
-
- #
- # Chains some elements as a new operator
- #
- def chain(*elements)
- elements = elements.reverse
- elements[1..-1].inject(elements.first) do |c, elm|
- elm.pipe(c, environment)
- elm
- end
- end
-
- [ :Autonum, :Clip, :Compact, :Defaults, :Sort ].each do |op_name|
- meth_name = Tools.ruby_case(op_name).to_sym
- define_method(meth_name) do |child, *args|
- chain(Operator::NonRelational.const_get(op_name).new(*args), child)
- end
- end
-
- [:Project,
- :Extend,
- :Rename,
- :Restrict,
- :Nest,
- :Unnest,
- :Group,
- :Ungroup,
- :Summarize,
- :Quota ].each do |op_name|
- meth_name = Tools.ruby_case(op_name).to_sym
- define_method(meth_name) do |child, *args|
- chain(Operator::Relational.const_get(op_name).new(*args), child)
- end
- end
-
- def allbut(child, attributes)
- chain(Operator::Relational::Project.new(attributes, true), child)
- end
-
- [ :Join,
- :Union,
- :Intersect,
- :Minus ].each do |op_name|
- meth_name = Tools.ruby_case(op_name).to_sym
- define_method(meth_name) do |left, right, *args|
- chain(Operator::Relational.const_get(op_name).new(*args), [left, right])
- end
- end
-
- end # module Lispy
-
- #
# Encapsulates the interface with the outside world, providing base iterators
# for named datasets, among others.
#
# An environment is typically obtained through the factory defined by this
# class:
@@ -572,28 +466,39 @@
#
# @param [Object] input the iterator input, at discretion of the Iterator
# implementation.
# @param [Environment] environment an optional environment for resolving
# named datasets if needed.
+ # @return [Object] self
#
def pipe(input, environment = nil)
+ self
end
undef :pipe
-
#
# Coerces something to an iterator
#
- def self.coerce(arg, env)
+ def self.coerce(arg, environment = nil)
case arg
when Iterator, Array
arg
else
- Reader.coerce(arg, env)
+ Reader.coerce(arg, environment)
end
end
+ #
+ # Converts this iterator to an in-memory Relation.
+ #
+ # @return [Relation] a relation instance, as the set of tuples
+ # that would be yielded by this iterator.
+ #
+ def to_rel
+ Relation::coerce(self)
+ end
+
end # module Iterator
#
# Implements an Iterator at the interface with the outside world.
#
@@ -648,24 +553,29 @@
clazz.new(*args)
end
end
#
- # Returns a reader instance for a specific file whose path is given
- # as argument.
+ # When filepath is a String, returns a reader instance for a specific file
+ # whose path is given as argument. Otherwise, delegates the call to
+ # <code>coerce(filepath)</code>
#
# @param [String] filepath path to a file for which extension is recognized
# @param [Array] args optional additional arguments that must be passed at
# reader's class new method.
# @return [Reader] a reader instance
#
def self.reader(filepath, *args)
- ext = File.extname(filepath)
- if registered = @@readers.find{|r| r[1].include?(ext)}
- registered[2].new(filepath, *args)
+ if filepath.is_a?(String)
+ ext = File.extname(filepath)
+ if registered = @@readers.find{|r| r[1].include?(ext)}
+ registered[2].new(filepath, *args)
+ else
+ raise "No registered reader for #{ext} (#{filepath})"
+ end
else
- raise "No registered reader for #{ext} (#{filepath})"
+ coerce(filepath)
end
end
#
# Coerces an argument to a reader, using an optional environment to convert
@@ -713,10 +623,11 @@
#
# (see Iterator#pipe)
#
def pipe(input, env = environment)
@input = input
+ self
end
#
# (see Iterator#each)
#
@@ -732,10 +643,18 @@
end
protected
#
+ # Returns the input file path, or nil if this Reader is bound to an IO
+ # directly.
+ #
+ def input_path
+ input.is_a?(String) ? input : nil
+ end
+
+ #
# Coerces the input object to an IO and yields the block with it.
#
# StringIO and IO input are yield directly while file paths are first
# opened in read mode and then yield.
#
@@ -820,11 +739,11 @@
#
class AlfFile < Reader
# (see Reader#each)
def each
- op = Alf.lispy(environment).compile(input_text)
+ op = Alf.lispy(environment).compile(input_text, input_path)
op.each(&Proc.new)
end
Reader.register(:alf, [".alf"], self)
end # module AlfFile
@@ -921,10 +840,11 @@
# This method mimics {Iterator#pipe} and have the same contract.
#
def pipe(input, env = environment)
self.environment = env
self.input = input
+ self
end
#
# Executes the rendering, outputting the resulting tuples on the provided
# output buffer.
@@ -1031,21 +951,22 @@
# }}
#
# See '#{program_name} help COMMAND' for details about a specific command.
#
class Main < Quickl::Delegator(__FILE__, __LINE__)
- include Command, Lispy
+ include Command
# Environment instance to use to get base iterators
attr_accessor :environment
# Output renderer
attr_accessor :renderer
# Creates a command instance
def initialize(env = Environment.default)
@environment = env
+ extend(Lispy)
end
# Install options
options do |opt|
@execute = false
@@ -1096,11 +1017,11 @@
end
# 3) if there is a requester, then we do the job (assuming bin/alf)
# with the renderer to use. Otherwise, we simply return built operator
if operator && requester
- chain(renderer, operator).execute($stdout)
+ renderer.pipe(operator, environment).execute($stdout)
else
operator
end
end
@@ -1136,11 +1057,11 @@
end
def execute(args)
requester.renderer = @renderer
args = [ $stdin ] if args.empty?
- requester.chain(*args)
+ requester.send(:chain,*args)
end
end # class Show
#
@@ -1196,11 +1117,46 @@
#
# Marker for all operators, relational and non-relational ones.
#
module Operator
include Iterator, Tools
+
+ #
+ # Yields non-relational then relational operators, in turn.
+ #
+ def self.each
+ Operator::NonRelational.each{|x| yield(x)}
+ Operator::Relational.each{|x| yield(x)}
+ end
+ #
+ # Encapsulates methods that allow operator introspection, that is,
+ # knowing operator cardinality and similar stuff.
+ #
+ module Introspection
+
+ #
+ # Returns true if this operator is a unary operator, false otherwise
+ #
+ def unary?
+ ancestors.include?(Operator::Unary)
+ end
+
+ #
+ # Returns true if this operator is a binary operator, false otherwise
+ #
+ def binary?
+ ancestors.include?(Operator::Binary)
+ end
+
+ end # module Introspection
+
+ # Ensures that the Introspection module is set on real operators
+ def self.included(mod)
+ mod.extend(Introspection) if mod.is_a?(Class)
+ end
+
#
# Encapsulates method definitions that convert operators to Quickl
# commands
#
module CommandMethods
@@ -1236,11 +1192,11 @@
end
self
end
def split_command_args(args)
- operands, args = case i = args.index("--")
+ case (i = args.index("--"))
when NilClass
[args, []]
when 0
[[ $stdin ], args[1..-1]]
else
@@ -1326,16 +1282,17 @@
# Sets the operator input
#
def pipe(input, env = environment)
self.environment = env
self.datasets = [ input ]
+ self
end
protected
def command_line_operands(operands)
- operands.first
+ operands.first || $stdin
end
#
# Simply returns the first dataset
#
@@ -1364,10 +1321,11 @@
# Sets the operator input
#
def pipe(input, env = environment)
self.environment = env
self.datasets = input
+ self
end
protected
def command_line_operands(operands)
@@ -1459,10 +1417,11 @@
# Sets the operator input
#
def pipe(input, env = environment)
self.environment = env
self.datasets = input
+ self
end
protected
# (see Operator#_each)
@@ -1502,30 +1461,46 @@
#
# Marker module and namespace for non relational operators
#
module Operator::NonRelational
+ #
+ # Yields the block with each operator module in turn
+ #
+ def self.each
+ constants.each do |c|
+ val = const_get(c)
+ yield(val) if val.ancestors.include?(Operator::NonRelational)
+ end
+ end
+
#
- # Extend with an unique autonumber attribute
+ # Extends its operand with a unique autonumber attribute
#
# SYNOPSIS
- # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME]
#
- # API & EXAMPLE
+ # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME]
#
- # # Autonumber suppliers (:autonum attribute name by default)
- # (autonum :suppliers)
+ # DESCRIPTION
#
- # # You can specify the attribute name
+ # This non-relational operator guarantees uniqueness of output tuples by
+ # adding an attribute called 'ATTRNAME' whose value is an Integer. No
+ # guarantee is given about the ordering of output tuples, nor that the
+ # autonumbers are sequential; only that all values are different.
+ # If the presence of duplicates was the only "non-relational" aspect of
+ # input tuples, the result may be considered a valid relation representation.
+ #
+ # IN RUBY
+ #
+ # (autonum OPERAND, ATTRNAME = :autonum)
+ #
+ # (autonum :suppliers)
# (autonum :suppliers, :unique_id)
#
- # DESCRIPTION
+ # IN SHELL
#
- # This operator takes input tuples in any order they come and extends them
- # with an autonumber attribute ATTRNAME. This allows converting non-relational
- # tuple enumerators to relational ones by ensuring uniqueness of tuples in an
- # arbitrary manner.
+ # #{program_name} #{command_name} [OPERAND] -- [ATTRNAME]
#
# alf autonum suppliers
# alf autonum suppliers -- unique_id
#
class Autonum < Factory::Operator(__FILE__, __LINE__)
@@ -1843,10 +1818,19 @@
# Marker module and namespace for relational operators
#
module Operator::Relational
#
+ # Yields the block with each operator module in turn
+ #
+ def self.each
+ constants.each do |c|
+ val = const_get(c)
+ yield(val) if val.ancestors.include?(Operator::Relational)
+ end
+ end
+
# Relational projection (clip + compact)
#
# SYNOPSIS
# #{program_name} #{command_name} [OPERAND] -- ATTR1 ATTR2 ...
#
@@ -2307,39 +2291,39 @@
end
end # class Union
#
- # Relational nesting (tuple-valued attributes)
+ # Relational wrapping (tuple-valued attributes)
#
# SYNOPSIS
# #{program_name} #{command_name} [OPERAND] -- ATTR1 ATTR2 ... NEWNAME
#
# API & EXAMPLE
#
- # (nest :suppliers, [:city, :status], :loc_and_status)
+ # (wrap :suppliers, [:city, :status], :loc_and_status)
#
# DESCRIPTION
#
- # This operator nests attributes ATTR1 to ATTRN as a new, tuple-based
- # attribute whose name is NEWNAME. When used in shell, names of nested
+ # This operator wraps attributes ATTR1 to ATTRN as a new, tuple-based
+ # attribute whose name is NEWNAME. When used in shell, names of wrapped
# attributes are taken from commandline arguments, expected the last one
# which defines the new name to use:
#
- # alf nest suppliers -- city status loc_and_status
+ # alf wrap suppliers -- city status loc_and_status
#
- class Nest < Factory::Operator(__FILE__, __LINE__)
+ class Wrap < Factory::Operator(__FILE__, __LINE__)
include Operator::Relational, Operator::Transform
- # Array of nesting attributes
+ # Array of wrapping attributes
attr_accessor :attributes
- # New name for the nested attribute
+ # New name for the wrapped attribute
attr_accessor :as
- # Builds a Nest operator instance
- def initialize(attributes = [], as = :nested)
+ # Builds a Wrap operator instance
+ def initialize(attributes = [], as = :wrapped)
@attributes = attributes
@as = as
end
protected
@@ -2356,40 +2340,40 @@
others = tuple_collect(tuple.keys - @attributes){|k| [k,tuple[k]] }
others[as] = tuple_collect(attributes){|k| [k, tuple[k]] }
others
end
- end # class Nest
+ end # class Wrap
#
- # Relational un-nesting (inverse of nest)
+ # Relational un-wrapping (inverse of wrap)
#
# SYNOPSIS
# #{program_name} #{command_name} [OPERAND] -- ATTR
#
# API & EXAMPLE
#
- # # Assuming nested = (nest :suppliers, [:city, :status], :loc_and_status)
- # (unnest nested, :loc_and_status)
+ # # Assuming wrapped = (wrap :suppliers, [:city, :status], :loc_and_status)
+ # (unwrap wrapped, :loc_and_status)
#
# DESCRIPTION
#
- # This operator unnests the tuple-valued attribute named ATTR so as to
+ # This operator unwraps the tuple-valued attribute named ATTR so as to
# flatten its pairs with 'upstream' tuple. The latter should be such so that
# no name collision occurs. When used in shell, the name of the attribute to
- # unnest is taken as the first commandline argument:
+ # unwrap is taken as the first commandline argument:
#
- # alf unnest nest -- loc_and_status
+ # alf unwrap wrap -- loc_and_status
#
- class Unnest < Factory::Operator(__FILE__, __LINE__)
+ class Unwrap < Factory::Operator(__FILE__, __LINE__)
include Operator::Relational, Operator::Transform
- # Name of the attribute to unnest
+ # Name of the attribute to unwrap
attr_accessor :attribute
# Builds a Rename operator instance
- def initialize(attribute = :nested)
+ def initialize(attribute = :wrapped)
@attribute = attribute
end
protected
@@ -2400,15 +2384,15 @@
end
# (see Operator::Transform#_tuple2tuple)
def _tuple2tuple(tuple)
tuple = tuple.dup
- nested = tuple.delete(@attribute) || {}
- tuple.merge(nested)
+ wrapped = tuple.delete(@attribute) || {}
+ tuple.merge(wrapped)
end
- end # class Unnest
+ end # class Unwrap
#
# Relational grouping (relation-valued attributes)
#
# SYNOPSIS
@@ -2462,21 +2446,21 @@
end
# See Operator#_prepare
def _prepare
pkey = ProjectionKey.new(attributes, !allbut)
- @index = Hash.new{|h,k| h[k] = []}
+ @index = Hash.new{|h,k| h[k] = Set.new}
each_input_tuple do |tuple|
key, rest = pkey.split(tuple)
@index[key] << rest
end
end
# See Operator#_each
def _each
@index.each_pair do |k,v|
- yield(k.merge(@as => v))
+ yield(k.merge(@as => Relation.coerce(v)))
end
end
end # class Group
@@ -2535,20 +2519,24 @@
#
# Relational summarization (group-by + aggregate ops)
#
# SYNOPSIS
- # #{program_name} #{command_name} [OPERAND] --by=KEY1,KEY2... -- AGG1 EXPR1...
+ # #{program_name} #{command_name} [OPERAND] [--allbut] --by=KEY1,KEY2... -- AGG1 EXPR1...
#
# OPTIONS
# #{summarized_options}
#
# API & EXAMPLE
#
# (summarize :supplies, [:sid],
# :total_qty => Aggregator.sum(:qty))
#
+ # # Or, to specify an allbut projection
+ # (summarize :supplies, [:qty, :pid],
+ # {:total_qty => Aggregator.sum(:qty)}, true)
+ #
# DESCRIPTION
#
# This operator summarizes input tuples on the projection on KEY1,KEY2,...
# attributes and applies aggregate operators on sets of matching tuples.
# Introduced names AGG should be disjoint from KEY attributes.
@@ -2556,30 +2544,38 @@
# When used in shell, the aggregations are taken from commandline arguments
# AGG and EXPR, where AGG is the name of a new attribute and EXPR is an
# aggregation expression evaluated on Aggregator:
#
# alf summarize supplies --by=sid -- total_qty "sum(:qty)"
+ # alf summarize supplies --allbut --by=pid,qty -- total_qty "sum(:qty)"
#
class Summarize < Factory::Operator(__FILE__, __LINE__)
include Operator::Relational, Operator::Shortcut, Operator::Unary
# By attributes
attr_accessor :by
+ # When true, summarizes by all attributes but those listed in +by+
+ attr_accessor :allbut
+
# Aggregations as a AGG => Aggregator(EXPR) hash
attr_accessor :aggregators
- def initialize(by = [], aggregators = {})
+ def initialize(by = [], aggregators = {}, allbut = false)
@by = by
+ @allbut = allbut
@aggregators = aggregators
end
# Installs the options
options do |opt|
opt.on('--by=x,y,z', 'Specify by attributes', Array) do |args|
@by = args.collect{|a| a.to_sym}
end
+ opt.on('--allbut', 'Make an allbut projection/summarization') do
+ @allbut = true
+ end
end
# Summarizes according to a complete order
class SortBased
include Alf::Operator::Cesure
@@ -2611,11 +2607,46 @@
end
receiver.call key.merge(@aggs)
end
end # class SortBased
+
+ # Summarizes in-memory with a hash
+ class HashBased
+ include Operator::Relational, Operator::Unary
+
+ attr_reader :by_key
+ attr_reader :aggregators
+ def initialize(by_key, aggregators)
+ @by_key, @aggregators = by_key, aggregators
+ end
+
+ protected
+
+ def _each
+ index = Hash.new do |h,k|
+ h[k] = tuple_collect(@aggregators) do |a,agg|
+ [a, agg.least]
+ end
+ end
+ each_input_tuple do |tuple|
+ key, rest = by_key.split(tuple)
+ index[key] = tuple_collect(@aggregators) do |a,agg|
+ [a, agg.happens(index[key][a], tuple)]
+ end
+ end
+ index.each_pair do |key,aggs|
+ aggs = tuple_collect(@aggregators) do |a,agg|
+ [a, agg.finalize(aggs[a])]
+ end
+ yield key.merge(aggs)
+ end
+ end
+
+ end
+
protected
# (see Operator::CommandMethods#set_args)
def set_args(args)
@aggregators = tuple_collect(args.each_slice(2)) do |a,expr|
@@ -2623,14 +2654,20 @@
end
self
end
def longexpr
- by_key = Tools::ProjectionKey.new(@by, false)
- chain SortBased.new(by_key, @aggregators),
- Operator::NonRelational::Sort.new(by_key.to_ordering_key),
- datasets
+ if @allbut
+ by_key = Tools::ProjectionKey.new(@by, @allbut)
+ chain HashBased.new(by_key, @aggregators),
+ datasets
+ else
+ by_key = Tools::ProjectionKey.new(@by, @allbut)
+ chain SortBased.new(by_key, @aggregators),
+ Operator::NonRelational::Sort.new(by_key.to_ordering_key),
+ datasets
+ end
end
end # class Summarize
#
@@ -2906,16 +2943,16 @@
def initialize(*attrs)
super(nil, {}){
Tools.tuple_collect(attrs){|k| [k, self.send(k)] }
}
end
- def least(); []; end
+ def least(); Set.new; end
def _happens(memo, val)
memo << val
end
def finalize(memo)
- memo.uniq
+ Relation.coerce memo
end
end
#
# Defines a COLLECT aggregation operator
@@ -2942,11 +2979,10 @@
def finalize(memo)
options[:before].to_s + memo + options[:after].to_s
end
end
- Lispy::Agg = Aggregator
end # class Aggregator
#
# Base class for implementing buffers.
#
@@ -2979,6 +3015,121 @@
end # class Buffer::Sorted
end # class Buffer
+ #
+ # Implements a small LISP-like DSL on top of Alf.
+ #
+ # The lispy dialect is the functional one used in .alf files and in compiled
+ # expressions as below:
+ #
+ # Alf.lispy.compile do
+ # (restrict :suppliers, lambda{ city == 'London' })
+ # end
+ #
+ # The DSL this module provides is part of Alf's public API and won't be broken
+ # without a major version change. The module itself and its inclusion pre-
+ # conditions are not part of the DSL itself, thus not considered as part of
+ # the API, and may therefore evolve at any time. In other words, this module
+ # is not intended to be directly included by third-party classes.
+ #
+ module Lispy
+
+ alias :ruby_extend :extend
+
+ # The environment
+ attr_accessor :environment
+
+ #
+ # Compiles a query expression given by a String or a block and returns
+ # the result (typically a tuple iterator)
+ #
+ # Example
+ #
+ # # with a string
+ # op = compile "(restrict :suppliers, lambda{ city == 'London' })"
+ #
+ # # or with a block
+ # op = compile {
+ # (restrict :suppliers, lambda{ city == 'London' })
+ # }
+ #
+ # @param [String] expr a Lispy expression to compile
+ # @return [Iterator] the iterator resulting from compilation
+ #
+ def compile(expr = nil, path = nil, &block)
+ if expr.nil?
+ instance_eval(&block)
+ else
+ (path ? Kernel.eval(expr, binding, path) : Kernel.eval(expr, binding))
+ end
+ end
+
+ #
+ # Evaluates a query expression given by a String or a block and returns
+ # the result as an in-memory relation (Alf::Relation)
+ #
+ # Example:
+ #
+ # # with a string
+ # rel = evaluate "(restrict :suppliers, lambda{ city == 'London' })"
+ #
+ # # or with a block
+ # rel = evaluate {
+ # (restrict :suppliers, lambda{ city == 'London' })
+ # }
+ #
+ def evaluate(expr = nil, path = nil, &block)
+ compile(expr, path, &block).to_rel
+ end
+
+ #
+ # Delegated to the current environment
+ #
+ # This method returns the dataset associated to a given name. The result
+ # may depend on the current environment, but is generally an Iterator,
+ # often a Reader instance.
+ #
+ # @param [Symbol] name name of the dataset to retrieve
+ # @return [Iterator] the dataset as an iterator
+ # @see Environment#dataset
+ #
+ def dataset(name)
+ raise "Environment not set" unless @environment
+ @environment.dataset(name)
+ end
+
+ # Functional equivalent to Alf::Relation[...]
+ def relation(*tuples)
+ Relation.coerce(tuples)
+ end
+
+ #
+ # Install the DSL through iteration over defined operators
+ #
+ Operator::each do |op_class|
+ meth_name = Tools.ruby_case(Tools.class_name(op_class)).to_sym
+ if op_class.unary?
+ define_method(meth_name) do |child, *args|
+ child = Iterator.coerce(child, environment)
+ op_class.new(*args).pipe(child, environment)
+ end
+ elsif op_class.binary?
+ define_method(meth_name) do |left, right, *args|
+ operands = [left, right].collect{|x| Iterator.coerce(x, environment)}
+ op_class.new(*args).pipe(operands, environment)
+ end
+ else
+ raise "Unexpected operator #{op_class}"
+ end
+ end # Operators::each
+
+ def allbut(child, attributes)
+ (project child, attributes, true)
+ end
+
+ Agg = Alf::Aggregator
+ end # module Lispy
+
end # module Alf
+require "alf/relation"