lib/wukong/schema.rb in wukong-0.1.1 vs lib/wukong/schema.rb in wukong-0.1.4

- old
+ new

@@ -1,37 +1,220 @@ +require 'extlib/inflection' +require 'wukong' + + +# +# Basic types: SQL conversion +# +class << Integer ; def to_sql() 'INT' end ; end +class << Bignum ; def to_sql() 'BIGINT' end ; end +class << String ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end +class << Symbol ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end +class << BigDecimal ; def to_pig() 'DECIMAL' end ; end if defined?(BigDecimal) +class << EpochTime ; def to_pig() 'INT' end ; end if defined?(EpochTime) +class << FilePath ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath) +class << Flag ; def to_pig() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag) +class << IPAddress ; def to_pig() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress) +class << URI ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI) +class << Csv ; def to_pig() 'TEXT' end ; end if defined?(Csv) +class << Yaml ; def to_pig() 'TEXT' end ; end if defined?(Yaml) +class << Json ; def to_pig() 'TEXT' end ; end if defined?(Json) +class << Regex ; def to_pig() 'TEXT' end ; end if defined?(Regex) +class String ; def to_sql() self ; end ; end +class Symbol ; def to_sql() self.to_s.upcase ; end ; end + +# +# Basic types: Pig conversion +# +class << Integer ; def to_pig() 'int' end ; end +class << Bignum ; def to_pig() 'long' end ; end +class << Float ; def to_pig() 'float' end ; end +class << Symbol ; def to_pig() 'chararray' end ; end +class << Date ; def to_pig() 'long' end ; end +class << Time ; def to_pig() 'long' end ; end +class << DateTime ; def to_pig() 'long' end ; end +class << String ; def to_pig() 'chararray' end ; end +class << Text ; def to_pig() 'chararray' end ; end if defined?(Text) +class << Blob ; def to_pig() 'bytearray' end ; end if defined?(Blob) +class << Boolean ; def to_pig() 'bytearray' end ; end if defined?(Boolean) +class String ; def to_pig() self.to_s ; end ; end +class Symbol ; def to_pig() self.to_s ; end ; end + +class << BigDecimal ; def to_pig() 'long' end ; end if defined?(BigDecimal) +class << EpochTime ; def to_pig() 'integer' end ; end if defined?(EpochTime) +class << FilePath ; def to_pig() 'chararray' end ; end if defined?(FilePath) +class << Flag ; def to_pig() 'chararray' end ; end if defined?(Flag) +class << IPAddress ; def to_pig() 'chararray' end ; end if defined?(IPAddress) +class << URI ; def to_pig() 'chararray' end ; end if defined?(URI) +class << Csv ; def to_pig() 'chararray' end ; end if defined?(Csv) +class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml) +class << Json ; def to_pig() 'chararray' end ; end if defined?(Json) +class << Regex ; def to_pig() 'chararray' end ; end if defined?(Regex) + module Wukong # - # Export model's structure for other data frameworks: - # SQL and Pig + # Export model's structure for loading and manipulating in other frameworks, + # such as SQL and Pig # + # Your class should support the #resource_name and #mtypes methods + # An easy way to do this is by being a TypedStruct. + # + # You can use this to do silly stunts like + # + # % ruby -rubygems -r'wukong/schema' -e 'require "/path/to/user_model.rb" ; puts User.pig_load ; ' + # + # If you include the classes from Wukong::Datatypes::MoreTypes, you can draw + # on a richer set of type definitions + # + # require 'wukong/datatypes/more_types' + # include Wukong::Datatypes::MoreTypes + # require 'wukong/schema' + # + # (if you're using Wukong to bulk-process Datamapper records, these should + # fall right in line as well -- make sure *not* to include + # Wukong::Datatypes::MoreTypes, and to require 'dm-more' before 'wukong/schema') + # module Schema - def to_sql - end + module ClassMethods + # + # Table name for this class + # + def table_name + resource_name.to_s.pluralize + end - # Export schema as Pig - def to_pig - members.zip(mtypes).map do |member, type| - member.to_s + ': ' + type.to_pig - end.join(', ') - end + # =========================================================================== + # + # Pig + # - def pig_klass - self.to_s.gsub(/.*::/, '') - end + # Export schema as Pig + # + # Won't correctly handle complex types (struct having struct as member, eg) + # + def to_pig + members.zip(mtypes).map do |member, type| + member.to_s + ': ' + type.to_pig + end.join(', ') + end - def pig_load filename=nil - cmd = [ - "%-23s" % pig_klass, - "= LOAD", filename || pig_klass.underscore.pluralize, - "AS ( rsrc:chararray,", self.to_pig, ')', - ].join(" ") + # + # A pig snippet to load a tsv file containing + # serialized instances of this class. + # + # Assumes the first column is the resource name (you can, and probably + # should, follow with an immediate GENERATE to ditch that field.) + # + def pig_load filename=nil + filename ||= table_name+'.tsv' + cmd = [ + "%-23s" % resource_name, + "= LOAD", filename, + "AS ( rsrc:chararray,", self.to_pig, ')', + ].join(" ") + end + + # =========================================================================== + # + # SQL + + # + # Schema definition for use in a CREATE TABLE statement + # + def to_sql + sql_str = [] + members.zip(mtypes).each do |attr, type| + type_str = type.respond_to?(:to_sql) ? type.to_sql : type.to_s.upcase + sql_str << " %-21s\t%s" %["`#{attr}`", type_str] + end + sql_str.join(",\n") + end + + # + # List off member names, to be stuffed into a SELECT or a LOAD DATA + # + def sql_members + members.map{|attr| "`#{attr}`" }.join(", ") + end + + # + # Creates a table for the wukong class. + # + # * primary_key gives the name of one column to be set as the primary key + # + # * if drop_first is given, a "DROP TABLE IF EXISTS" statement will + # precede the snippet. + # + # * table_options sets the table parameters. Useful table_options for a + # read-only database in MySQL: + # ENGINE=MyISAM PACK_KEYS=0 + # + def sql_create_table primary_key=nil, drop_first=nil, table_options='' + str = [] + str << %Q{DROP TABLE IF EXISTS `#{self.table_name}`; } if drop_first + str << %Q{CREATE TABLE `#{self.table_name}` ( } + str << self.to_sql + if primary_key then str.last << ',' ; str << %Q{ PRIMARY KEY \t(`#{primary_key}`)} ; end + str << %Q{ ) #{table_options} ;} + str.join("\n") + end + + # + # A mysql snippet to bulk load the tab-separated-values file emitted by a + # Wukong script. + # + # Let's say your class is ClickLog; its resource_name is "click_log" + # and thus its table_name is 'click_logs'. sql_load_mysql will: + # + # * disable indexing on the table + # * import the file, replacing any existing rows. (Replacement is governed + # by primary key and unique index constraints -- see the mysql docs). + # * re-enable indexing on that table + # * show the number of + # + # The load portion will + # + # * Load into a table named click_logs + # * from a file named click_logs.tsv + # * where all rows have the string 'click_logs' in their first column + # * and all remaining fields in their #members order + # * assuming strings are wukong_encode'd and so shouldn't be escaped or enclosed. + # + # Why the "LINES STARTING BY" part? For map/reduce outputs that have many + # different objects jumbled together, you can just dump in the whole file, + # landing each object in its correct table. + # + def sql_load_mysql + str = [] + # disable indexing during bulk load + str << %Q{ALTER TABLE `#{self.resource_name}` DISABLE KEYS; } + # Bulk load the tab-separated-values file. + str << %Q{LOAD DATA LOCAL INFILE '#{self.resource_name}.tsv'} + str << %Q{ REPLACE INTO TABLE `#{self.resource_name}` } + str << %Q{ COLUMNS } + str << %Q{ TERMINATED BY '\\t' } + str << %Q{ OPTIONALLY ENCLOSED BY '' } + str << %Q{ ESCAPED BY '' } + str << %Q{ LINES STARTING BY '#{self.resource_name}' } + str << %Q{ ( @dummy,\n } + str << ' '+self.sql_members + str << %Q{\n ); } + # Re-enable indexing + str << %Q{ALTER TABLE `#{self.resource_name}` ENABLE KEYS ; } + # Show it loaded correctly + str << %Q{SELECT '#{self.resource_name}', NOW(), COUNT(*) FROM `#{self.resource_name}`; } + str.join("\n") + end + end + # standard stanza for making methods appear on the class itself on include + def self.included base + base.class_eval{ extend ClassMethods } + end end end -class << Integer ; def to_pig() 'int' end ; end -class << Bignum ; def to_pig() 'long' end ; end -class << Float ; def to_pig() 'float' end ; end -class << String ; def to_pig() 'chararray' end ; end -class << Symbol ; def to_pig() self end ; end -class << Date ; def to_pig() 'long' end ; end +# +# TypedStructs are class-schematizeable +# +Struct.class_eval do include(Wukong::Schema) ; end