lib/data_frame.rb in davidrichards-data_frame-0.0.15 vs lib/data_frame.rb in davidrichards-data_frame-0.0.17

- old
+ new

@@ -17,307 +17,8 @@ $:.unshift(File.dirname(__FILE__)) require 'data_frame/callback_array' require 'data_frame/transposable_array' - -# This allows me to have named columns and optionally named rows in a -# data frame, to work calculations (usually on the columns), to -# transpose the matrix and store the transposed matrix until the object -# is tainted. -class DataFrame - - class << self - - # This is the neatest part of this neat gem. - # DataFrame.from_csv can be called in a lot of ways: - # DataFrame.from_csv(csv_contents) - # DataFrame.from_csv(filename) - # DataFrame.from_csv(url) - # If you need to define converters for FasterCSV, do it before calling - # this method: - # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'} - # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special) - # This returns bar where 'foo' was found and 'foo' everywhere else. - def from_csv(obj, opts={}) - labels, table = infer_csv_contents(obj, opts) - return nil unless labels and table - df = new(*labels) - df.import(table) - df - end - - protected - def infer_csv_contents(obj, opts={}) - contents = File.read(obj) if File.exist?(obj) - begin - open(obj) {|f| contents = f.read} unless contents - rescue - nil - end - contents ||= obj if obj.is_a?(String) - return nil unless contents - table = FCSV.parse(contents, default_csv_opts.merge(opts)) - labels = table.shift - while table.last.empty? - table.pop - end - [labels, table] - end - - def default_csv_opts; {:converters => :all}; end - end - - # Loads a batch of rows. Expects an array of arrays, else you don't - # know what you have. - def import(rows) - rows.each do |row| - self.add_item(row) - end - end - - def inspect - "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}" - end - - # The labels of the data items - attr_reader :labels - alias :variables :labels - - # The items stored in the frame - attr_reader :items - - def initialize(*labels) - @labels = labels.map {|e| e.to_underscore_sym } - @items = TransposableArray.new - end - - def add_item(item) - self.items << item - end - alias :add :add_item - - def row_labels - @row_labels ||= [] - end - - def row_labels=(ary) - raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array) - @row_labels = ary - end - - def render_column(sym) - i = @labels.index(sym) - return nil unless i - @items.transpose[i] - end - - # The rows as an array of arrays, an alias for items. - alias :rows :items - - # The columns as a Dictionary or Hash - # This is cached, call columns(true) to reset the cache. - def columns(reset=false) - @columns = nil if reset - return @columns if @columns - - container = defined?(Dictionary) ? Dictionary.new : Hash.new - i = 0 - - @columns = @items.transpose.inject(container) do |cont, col| - cont[@labels[i]] = col - i += 1 - cont - end - end - alias :to_hash :columns - alias :to_dictionary :columns - - def render_row(sym) - i = self.row_labels.index(sym) - return nil unless i - @items[i] - end - - def method_missing(sym, *args, &block) - if self.labels.include?(sym) - render_column(sym) - elsif self.row_labels.include?(sym) - render_row(sym) - elsif @items.respond_to?(sym) - @items.send(sym, *args, &block) - else - super - end - end - - def drop!(*labels) - labels.each do |label| - drop_one!(label) - end - self - end - - def drop_one!(label) - i = self.labels.index(label) - return nil unless i - self.items.each do |item| - item.delete_at(i) - end - self.labels.delete_at(i) - self - end - protected :drop_one! - - def replace!(column, values=nil, &block) - column = validate_column(column) - if not values - values = self.send(column) - values.map! {|e| block.call(e)} - end - replace_column(column, values) - self - end - - def replace_column(column, values) - column = validate_column(column) - index = self.labels.index(column) - list = [] - self.items.each_with_index do |item, i| - consolidated = item - consolidated[index] = values[i] - list << consolidated - end - @items = list.dup - end - protected :replace_column - - def validate_column(column) - column = column.to_sym - raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column) - column - end - protected :validate_column - - # Takes a block to evaluate on each row. The row can be converted into - # an OpenStruct or a Hash for easier filter methods. Note, don't try this - # with a hash or open struct unless you have facets available. - def filter!(as=Array, &block) - as = infer_class(as) - items = [] - self.items.each do |row| - value = block.call(cast_row(row, as)) - items << row if value - end - @items = items.dup - self - end - - def filter(as=Array, &block) - new_data_frame = self.clone - new_data_frame.filter!(as, &block) - end - - def infer_class(obj) - obj = obj.to_s.classify.constantize if obj.is_a?(Symbol) - obj = obj.classify.constantize if obj.is_a?(String) - obj - end - protected :infer_class - - def cast_row(row, as) - if as == Hash - obj = {} - self.labels.each_with_index do |label, i| - obj[label] = row[i] - end - obj - elsif as == OpenStruct - obj = OpenStruct.new - self.labels.each_with_index do |label, i| - obj.table[label] = row[i] - end - obj - elsif as == Array - row - else - as.new(*row) - end - end - protected :cast_row - - # Creates a new data frame, only with the specified columns. - def subset_from_columns(*cols) - new_labels = self.labels.inject([]) do |list, label| - list << label if cols.include?(label) - list - end - new_data_frame = DataFrame.new(*self.labels) - new_data_frame.import(self.items) - self.labels.each do |label| - new_data_frame.drop!(label) unless new_labels.include?(label) - end - new_data_frame - end - - # A weird name. This creates a column for every category in a column - # and marks each row by its value - def j_binary_ize!(*columns) - # Allows to mix a hash with the columns. - options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)} - columns.delete_if {|e| e.is_a?(Hash)} - - # Generates new columns - columns.each do |col| - values = render_column(col.to_underscore_sym) - values.categories.each do |category| - full_name = (col.to_s + "_" + category.to_s).to_sym - if options[:allow_overlap] - category_map = values.inject([]) do |list, e| - list << values.all_categories(e) - end - self.append!(full_name, category_map.map{|e| e.include?(category)}) - else - self.append!(full_name, values.category_map.map{|e| e == category}) - end - end - end - end - - # Adds a unique column to the table - def append!(column_name, value=nil) - raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name) - self.labels << column_name.to_underscore_sym - if value.is_a?(Array) - self.items.each_with_index do |item, i| - item << value[i] - end - else - self.items.each do |item| - item << value - end - end - # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed. - self.items.taint - end - - def filter_by_category(hash) - new_data_frame = self.dup - hash.each do |key, value| - key = key.to_underscore_sym - next unless self.labels.include?(key) - value = [value] unless value.is_a?(Array) or value.is_a?(Range) - new_data_frame.filter!(:hash) {|row| value.include?(row[key])} - end - new_data_frame - end - - def filter_by_category!(hash) - hash.each do |key, value| - key = key.to_underscore_sym - next unless self.labels.include?(key) - value = [value] unless value.is_a?(Array) or value.is_a?(Range) - self.filter!(:hash) {|row| value.include?(row[key])} - end - end - -end \ No newline at end of file +require 'data_frame/parameter_capture' +require 'data_frame/data_frame' +require 'data_frame/model'