require 'date'
require 'extlib/inflection'
require 'wukong'

#
# Basic types: SQL conversion
#
# The class-level #to_sql of a type gives the column type emitted by
# Wukong::Schema::ClassMethods#to_sql for members declared with that type.
# Types without a #to_sql fall back to their upcased class name there.
#
# Bignum was folded into Integer in Ruby 2.4 and removed in Ruby 3.2, so it is
# patched only where it is still a distinct class. Patching it unconditionally
# would either raise NameError or silently turn every Integer into a BIGINT.
#
class << Integer    ; def to_sql() 'INT'                              end ; end
class << Bignum     ; def to_sql() 'BIGINT'                           end ; end if defined?(Bignum) && !Bignum.equal?(Integer)
class << String     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
class << Symbol     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
class << BigDecimal ; def to_sql() 'DECIMAL'                          end ; end if defined?(BigDecimal)
class << EpochTime  ; def to_sql() 'INT'                              end ; end if defined?(EpochTime)
class << FilePath   ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
class << Flag       ; def to_sql() 'CHAR(1) CHARACTER SET ASCII'      end ; end if defined?(Flag)
class << IPAddress  ; def to_sql() 'CHAR(15) CHARACTER SET ASCII'     end ; end if defined?(IPAddress)
class << URI        ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
class << Csv        ; def to_sql() 'TEXT'                             end ; end if defined?(Csv)
class << Yaml       ; def to_sql() 'TEXT'                             end ; end if defined?(Yaml)
class << Json       ; def to_sql() 'TEXT'                             end ; end if defined?(Json)
class << Regex      ; def to_sql() 'TEXT'                             end ; end if defined?(Regex)
# Instances used directly as a type spec: a String passes through verbatim,
# a Symbol is upcased (:text => "TEXT").
class String ; def to_sql() self ; end ; end
class Symbol ; def to_sql() self.to_s.upcase ; end ; end

#
# Basic types: Pig conversion
#
# The class-level #to_pig of a type gives the Pig type name emitted by
# Wukong::Schema::ClassMethods#to_pig for members declared with that type.
#
class << Integer    ; def to_pig() 'int'       end ; end
class << Bignum     ; def to_pig() 'long'      end ; end if defined?(Bignum) && !Bignum.equal?(Integer)
class << Float      ; def to_pig() 'float'     end ; end
class << Symbol     ; def to_pig() 'chararray' end ; end
class << Date       ; def to_pig() 'long'      end ; end
class << Time       ; def to_pig() 'long'      end ; end
class << DateTime   ; def to_pig() 'long'      end ; end
class << String     ; def to_pig() 'chararray' end ; end
class << Text       ; def to_pig() 'chararray' end ; end if defined?(Text)
class << Blob       ; def to_pig() 'bytearray' end ; end if defined?(Blob)
class << Boolean    ; def to_pig() 'bytearray' end ; end if defined?(Boolean)
# Instances used directly as a type spec are emitted as their string form.
class String ; def to_pig() self.to_s ; end ; end
class Symbol ; def to_pig() self.to_s ; end ; end
class << BigDecimal ; def to_pig() 'long'      end ; end if defined?(BigDecimal)
# 'int' (not 'integer') is the Pig type name, matching Integer above.
class << EpochTime  ; def to_pig() 'int'       end ; end if defined?(EpochTime)
class << FilePath   ; def to_pig() 'chararray' end ; end if defined?(FilePath)
class << Flag       ; def to_pig() 'chararray' end ; end if defined?(Flag)
class << IPAddress  ; def to_pig() 'chararray' end ; end if defined?(IPAddress)
class << URI        ; def to_pig() 'chararray' end ; end if defined?(URI)
class << Csv        ; def to_pig() 'chararray' end ; end if defined?(Csv)
class << Yaml       ; def to_pig() 'chararray' end ; end if defined?(Yaml)
class << Json       ; def to_pig() 'chararray' end ; end if defined?(Json)
class << Regex      ; def to_pig() 'chararray' end ; end if defined?(Regex)

module Wukong
  #
  # Export model's structure for loading and manipulating in other frameworks,
  # such as SQL and Pig
  #
  # Your class should support the #resource_name, #members and #mtypes methods.
  # An easy way to do this is by being a TypedStruct.
  #
  # You can use this to do silly stunts like
  #
  #   % ruby -rubygems -r'wukong/schema' -e 'require "/path/to/user_model.rb" ; puts User.pig_load ; '
  #
  # If you include the classes from Wukong::Datatypes::MoreTypes, you can draw
  # on a richer set of type definitions
  #
  #   require 'wukong/datatypes/more_types'
  #   include Wukong::Datatypes::MoreTypes
  #   require 'wukong/schema'
  #
  # (if you're using Wukong to bulk-process Datamapper records, these should
  # fall right in line as well -- make sure *not* to include
  # Wukong::Datatypes::MoreTypes, and to require 'dm-more' before 'wukong/schema')
  #
  module Schema
    module ClassMethods

      #
      # Table name for this class: the pluralized resource name.
      #
      def table_name
        resource_name.to_s.pluralize
      end

      # ===========================================================================
      #
      # Pig
      #

      #
      # Export schema as Pig: a comma-separated list of "member: type" pairs.
      #
      # Won't correctly handle complex types (struct having struct as member, eg)
      #
      def to_pig
        members.zip(mtypes).map do |member, type|
          member.to_s + ': ' + type.to_pig
        end.join(', ')
      end

      #
      # A pig snippet to load a tsv file containing
      # serialized instances of this class.
      #
      # Assumes the first column is the resource name (you can, and probably
      # should, follow with an immediate GENERATE to ditch that field.)
      #
      # +filename+ defaults to "<table_name>.tsv". It is inserted into the
      # statement exactly as given (no quoting is added here).
      #
      def pig_load(filename = nil)
        filename ||= table_name+'.tsv'
        [
          "%-23s" % resource_name,
          "= LOAD", filename,
          "AS ( rsrc:chararray,", self.to_pig, ')',
        ].join(" ")
      end

      # ===========================================================================
      #
      # SQL
      #

      #
      # Schema definition for use in a CREATE TABLE statement: one
      # "`member` TYPE" line per member, separated by ",\n".
      #
      # A type that does not respond to #to_sql is rendered as its upcased
      # string form.
      #
      def to_sql
        members.zip(mtypes).map do |attr, type|
          type_str = type.respond_to?(:to_sql) ? type.to_sql : type.to_s.upcase
          " %-21s\t%s" % ["`#{attr}`", type_str]
        end.join(",\n")
      end

      #
      # List off member names, to be stuffed into a SELECT or a LOAD DATA
      #
      def sql_members
        members.map{|attr| "`#{attr}`" }.join(", ")
      end

      #
      # Creates a table for the wukong class.
      #
      # * primary_key gives the name of one column to be set as the primary key
      #
      # * if drop_first is given, a "DROP TABLE IF EXISTS" statement will
      #   precede the snippet.
      #
      # * table_options sets the table parameters. Useful table_options for a
      #   read-only database in MySQL:
      #     ENGINE=MyISAM PACK_KEYS=0
      #
      # Returns the statement(s) as a single newline-joined string.
      #
      def sql_create_table(primary_key = nil, drop_first = nil, table_options = '')
        columns = self.to_sql
        str = []
        str << %Q{DROP TABLE IF EXISTS `#{self.table_name}`; } if drop_first
        str << %Q{CREATE TABLE `#{self.table_name}` ( }
        if primary_key
          # the column list needs a trailing comma before the key clause
          str << "#{columns},"
          str << %Q{ PRIMARY KEY \t(`#{primary_key}`)}
        else
          str << columns
        end
        str << %Q{ ) #{table_options} ;}
        str.join("\n")
      end

      #
      # A mysql snippet to bulk load the tab-separated-values file emitted by a
      # Wukong script.
      #
      # Let's say your class is ClickLog; its resource_name is "click_log"
      # and thus its table_name is 'click_logs'. sql_load_mysql will:
      #
      # * disable indexing on the table
      # * import the file, replacing any existing rows. (Replacement is governed
      #   by primary key and unique index constraints -- see the mysql docs).
      # * re-enable indexing on that table
      # * show the number of rows now in the table
      #
      # The load portion will
      #
      # * Load into a table named click_logs (the same table that
      #   sql_create_table creates)
      # * from a file named click_logs.tsv
      # * where all rows have the string 'click_log' (the resource name) in
      #   their first column
      # * and all remaining fields in their #members order
      # * assuming strings are wukong_encode'd and so shouldn't be escaped or enclosed.
      #
      # Why the "LINES STARTING BY" part? For map/reduce outputs that have many
      # different objects jumbled together, you can just dump in the whole file,
      # landing each object in its correct table.
      #
      def sql_load_mysql
        str = []
        # disable indexing during bulk load
        str << %Q{ALTER TABLE `#{self.table_name}` DISABLE KEYS; }
        # Bulk load the tab-separated-values file.
        str << %Q{LOAD DATA LOCAL INFILE '#{self.table_name}.tsv'}
        str << %Q{ REPLACE INTO TABLE `#{self.table_name}` }
        str << %Q{ COLUMNS }
        str << %Q{ TERMINATED BY '\\t' }
        str << %Q{ OPTIONALLY ENCLOSED BY '' }
        str << %Q{ ESCAPED BY '' }
        # rows are tagged with the resource name, not the table name
        str << %Q{ LINES STARTING BY '#{self.resource_name}' }
        str << %Q{ ( @dummy,\n }
        str << ' '+self.sql_members
        str << %Q{\n ); }
        # Re-enable indexing
        str << %Q{ALTER TABLE `#{self.table_name}` ENABLE KEYS ; }
        # Show it loaded correctly
        str << %Q{SELECT '#{self.resource_name}', NOW(), COUNT(*) FROM `#{self.table_name}`; }
        str.join("\n")
      end
    end

    # standard stanza for making methods appear on the class itself on include
    def self.included(base)
      base.extend(ClassMethods)
    end
  end
end

#
# TypedStructs are class-schematizeable
#
Struct.class_eval do include(Wukong::Schema) ; end