lib/wukong/schema.rb in wukong-0.1.1 vs lib/wukong/schema.rb in wukong-0.1.4
- old
+ new
@@ -1,37 +1,220 @@
+require 'extlib/inflection'
+require 'wukong'
+
+
+#
+# Basic types: SQL conversion
+#
# Class-level #to_sql: the SQL column type used for a schema member that is
# declared with that class.
class << Integer ; def to_sql() 'INT' end ; end
# Bignum no longer exists on current Rubies, and for several releases before
# its removal it was only an alias of Integer. Register BIGINT only when
# Bignum is a distinct class, so that loading this file neither raises
# NameError nor overwrites Integer's 'INT'.
if defined?(Bignum) && !Bignum.equal?(Integer)
  class << Bignum ; def to_sql() 'BIGINT' end ; end
end
class << String ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
class << Symbol ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
# The richer types are optional: each mapping is installed only when its class
# has already been loaded. These must define #to_sql (not #to_pig) -- the
# values are SQL column types.
class << BigDecimal ; def to_sql() 'DECIMAL' end ; end if defined?(BigDecimal)
class << EpochTime ; def to_sql() 'INT' end ; end if defined?(EpochTime)
class << FilePath ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
class << Flag ; def to_sql() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
class << IPAddress ; def to_sql() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
class << URI ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
class << Csv ; def to_sql() 'TEXT' end ; end if defined?(Csv)
class << Yaml ; def to_sql() 'TEXT' end ; end if defined?(Yaml)
class << Json ; def to_sql() 'TEXT' end ; end if defined?(Json)
class << Regex ; def to_sql() 'TEXT' end ; end if defined?(Regex)
# Instance-level #to_sql: a String given as a member type is used verbatim,
# a Symbol is upcased (:datetime => 'DATETIME').
class String ; def to_sql() self ; end ; end
class Symbol ; def to_sql() self.to_s.upcase ; end ; end
+
+#
+# Basic types: Pig conversion
+#
# Class-level #to_pig: the Pig type used for a schema member that is declared
# with that class.
class << Integer ; def to_pig() 'int' end ; end
# Bignum no longer exists on current Rubies, and for several releases before
# its removal it was only an alias of Integer. Register 'long' only when
# Bignum is a distinct class, so that loading this file neither raises
# NameError nor overwrites Integer's 'int'.
if defined?(Bignum) && !Bignum.equal?(Integer)
  class << Bignum ; def to_pig() 'long' end ; end
end
class << Float ; def to_pig() 'float' end ; end
class << Symbol ; def to_pig() 'chararray' end ; end
class << Date ; def to_pig() 'long' end ; end
class << Time ; def to_pig() 'long' end ; end
class << DateTime ; def to_pig() 'long' end ; end
class << String ; def to_pig() 'chararray' end ; end
# Optional types: each mapping is installed only when its class is loaded.
class << Text ; def to_pig() 'chararray' end ; end if defined?(Text)
class << Blob ; def to_pig() 'bytearray' end ; end if defined?(Blob)
class << Boolean ; def to_pig() 'bytearray' end ; end if defined?(Boolean)
# Instance-level #to_pig: a String or Symbol given as a member type is taken
# as the literal Pig type name.
class String ; def to_pig() self.to_s ; end ; end
class Symbol ; def to_pig() self.to_s ; end ; end

class << BigDecimal ; def to_pig() 'long' end ; end if defined?(BigDecimal)
# Pig's 32-bit integer type is spelled 'int'; 'integer' is not a Pig type.
class << EpochTime ; def to_pig() 'int' end ; end if defined?(EpochTime)
class << FilePath ; def to_pig() 'chararray' end ; end if defined?(FilePath)
class << Flag ; def to_pig() 'chararray' end ; end if defined?(Flag)
class << IPAddress ; def to_pig() 'chararray' end ; end if defined?(IPAddress)
class << URI ; def to_pig() 'chararray' end ; end if defined?(URI)
class << Csv ; def to_pig() 'chararray' end ; end if defined?(Csv)
class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml)
class << Json ; def to_pig() 'chararray' end ; end if defined?(Json)
class << Regex ; def to_pig() 'chararray' end ; end if defined?(Regex)
+
module Wukong
#
- # Export model's structure for other data frameworks:
- # SQL and Pig
+ # Export model's structure for loading and manipulating in other frameworks,
+ # such as SQL and Pig
#
+ # Your class should support the #resource_name and #mtypes methods
+ # An easy way to do this is by being a TypedStruct.
+ #
+ # You can use this to do silly stunts like
+ #
+ # % ruby -rubygems -r'wukong/schema' -e 'require "/path/to/user_model.rb" ; puts User.pig_load ; '
+ #
+ # If you include the classes from Wukong::Datatypes::MoreTypes, you can draw
+ # on a richer set of type definitions
+ #
+ # require 'wukong/datatypes/more_types'
+ # include Wukong::Datatypes::MoreTypes
+ # require 'wukong/schema'
+ #
+ # (if you're using Wukong to bulk-process Datamapper records, these should
+ # fall right in line as well -- make sure *not* to include
+ # Wukong::Datatypes::MoreTypes, and to require 'dm-more' before 'wukong/schema')
+ #
module Schema
- def to_sql
- end
+ module ClassMethods
#
# Table name for this class: the resource name, stringified and pluralized.
#
def table_name
  singular = resource_name.to_s
  singular.pluralize
end
- # Export schema as Pig
- def to_pig
- members.zip(mtypes).map do |member, type|
- member.to_s + ': ' + type.to_pig
- end.join(', ')
- end
+ # ===========================================================================
+ #
+ # Pig
+ #
- def pig_klass
- self.to_s.gsub(/.*::/, '')
- end
# Export schema as Pig: a comma-separated list of "member: pigtype" pairs,
# in #members order.
#
# Won't correctly handle complex types (struct having struct as member, eg)
#
def to_pig
  pairs = members.zip(mtypes)
  fields = pairs.map { |name, type| name.to_s + ': ' + type.to_pig }
  fields.join(', ')
end
- def pig_load filename=nil
- cmd = [
- "%-23s" % pig_klass,
- "= LOAD", filename || pig_klass.underscore.pluralize,
- "AS ( rsrc:chararray,", self.to_pig, ')',
- ].join(" ")
#
# A pig snippet to load a tsv file containing
# serialized instances of this class.
#
# Assumes the first column is the resource name (you can, and probably
# should, follow with an immediate GENERATE to ditch that field.)
#
# filename defaults to "<table_name>.tsv". It is emitted as a single-quoted
# Pig string; a filename that already carries its own single quotes is used
# as given.
#
# Returns the LOAD statement as a String.
#
def pig_load filename=nil
  filename ||= table_name + '.tsv'
  # Pig's LOAD takes its location as a single-quoted string literal.
  location = filename.to_s
  location = "'#{location}'" unless location =~ /\A'.*'\z/m
  [
    "%-23s" % resource_name,
    "= LOAD", location,
    "AS ( rsrc:chararray,", to_pig, ')',
  ].join(" ")
end
+
+ # ===========================================================================
+ #
+ # SQL
+
#
# Schema definition for use in a CREATE TABLE statement: one
# "`member` <tab> TYPE" line per member, joined by ",\n".
#
# A member type that responds to #to_sql supplies its own column type;
# anything else falls back to its upcased name.
#
def to_sql
  column_lines = members.zip(mtypes).map do |name, type|
    sql_type =
      if type.respond_to?(:to_sql)
        type.to_sql
      else
        type.to_s.upcase
      end
    " %-21s\t%s" % ["`#{name}`", sql_type]
  end
  column_lines.join(",\n")
end
+
#
# List off member names, backtick-quoted and comma-separated, to be stuffed
# into a SELECT or a LOAD DATA
#
def sql_members
  quoted = members.map { |name| '`' + name.to_s + '`' }
  quoted.join(", ")
end
+
#
# Creates a table for the wukong class.
#
# * primary_key gives the name of one column to be set as the primary key
#
# * if drop_first is given, a "DROP TABLE IF EXISTS" statement will
#   precede the snippet.
#
# * table_options sets the table parameters. Useful table_options for a
#   read-only database in MySQL:
#     ENGINE=MyISAM PACK_KEYS=0
#
# Returns the statement(s) as one newline-joined String.
#
def sql_create_table primary_key=nil, drop_first=nil, table_options=''
  table = table_name
  lines = []
  lines << "DROP TABLE IF EXISTS `#{table}`; " if drop_first
  lines << "CREATE TABLE `#{table}` ( "
  if primary_key
    # the column list needs a trailing comma before the key clause
    lines << "#{to_sql},"
    lines << " PRIMARY KEY \t(`#{primary_key}`)"
  else
    lines << to_sql
  end
  lines << " ) #{table_options} ;"
  lines.join("\n")
end
+
#
# A mysql snippet to bulk load the tab-separated-values file emitted by a
# Wukong script.
#
# Let's say your class is ClickLog; its resource_name is "click_log"
# and thus its table_name is 'click_logs'. sql_load_mysql will:
#
# * disable indexing on the table
# * import the file, replacing any existing rows. (Replacement is governed
#   by primary key and unique index constraints -- see the mysql docs).
# * re-enable indexing on that table
# * show the number of rows now in the table
#
# The load portion will
#
# * Load into a table named click_logs (the same table_name that
#   sql_create_table creates)
# * from a file named click_logs.tsv
# * taking only lines that start with the resource name, 'click_log'
# * and all remaining fields in their #members order
# * assuming strings are wukong_encode'd and so shouldn't be escaped or enclosed.
#
# Why the "LINES STARTING BY" part? For map/reduce outputs that have many
# different objects jumbled together, you can just dump in the whole file,
# landing each object in its correct table.
#
# Returns the statements as one newline-joined String.
#
def sql_load_mysql
  table = table_name
  str = []
  # disable indexing during bulk load
  str << %Q{ALTER TABLE `#{table}` DISABLE KEYS; }
  # Bulk load the tab-separated-values file.
  str << %Q{LOAD DATA LOCAL INFILE '#{table}.tsv'}
  str << %Q{ REPLACE INTO TABLE `#{table}` }
  str << %Q{ COLUMNS }
  str << %Q{ TERMINATED BY '\\t' }
  str << %Q{ OPTIONALLY ENCLOSED BY '' }
  str << %Q{ ESCAPED BY '' }
  # rows are tagged with the (singular) resource name, not the table name
  str << %Q{ LINES STARTING BY '#{resource_name}' }
  # @dummy receives the leading field left over once the prefix is consumed
  str << %Q{ ( @dummy,\n }
  str << ' ' + sql_members
  str << %Q{\n ); }
  # Re-enable indexing
  str << %Q{ALTER TABLE `#{table}` ENABLE KEYS ; }
  # Show it loaded correctly
  str << %Q{SELECT '#{resource_name}', NOW(), COUNT(*) FROM `#{table}`; }
  str.join("\n")
end
+
end
# Standard stanza: when this module is included, graft ClassMethods onto the
# including class itself so the schema methods are callable on the class.
def self.included base
  base.extend(ClassMethods)
end
end
end
-class << Integer ; def to_pig() 'int' end ; end
-class << Bignum ; def to_pig() 'long' end ; end
-class << Float ; def to_pig() 'float' end ; end
-class << String ; def to_pig() 'chararray' end ; end
-class << Symbol ; def to_pig() self end ; end
-class << Date ; def to_pig() 'long' end ; end
#
# TypedStructs are class-schematizeable: mix Wukong::Schema into Struct, whose
# included hook then extends Struct with the schema class methods.
#
Struct.send(:include, Wukong::Schema)