# frozen_string_literal: true require 'stringio' module RedAmber # mix-ins for the class DataFrame module DataFrameDisplayable INDEX_KEY = :index_key_for_format_table def to_s(width: 80) return '' if empty? format_table(width: width) end # Show statistical summary by a new DatFrame. # Make stats for numeric columns only. # NaNs are ignored. # Counts also show non-NaN counts. # # @return [DataFrame] a new dataframe. def summary num_keys = keys.select { |key| self[key].numeric? } DataFrame.new( variables: num_keys, count: num_keys.map { |k| self[k].count }, mean: num_keys.map { |k| self[k].mean }, std: num_keys.map { |k| self[k].std }, min: num_keys.map { |k| self[k].min }, '25%': num_keys.map { |k| self[k].quantile(0.25) }, median: num_keys.map { |k| self[k].median }, '75%': num_keys.map { |k| self[k].quantile(0.75) }, max: num_keys.map { |k| self[k].max } ) end alias_method :describe, :summary def inspect mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') case mode.upcase when 'TDR' "#<#{shape_str(with_id: true)}>\n#{dataframe_info(3)}" when 'MINIMUM' shape_str else "#<#{shape_str(with_id: true)}>\n#{self}" end end # - limit: max num of Vectors to show # - tally: max level to use tally mode # - elements: max element to show values in each vector def tdr(limit = 10, tally: 5, elements: 5) puts tdr_str(limit, tally: tally, elements: elements) end def tdr_str(limit = 10, tally: 5, elements: 5) "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}" end def to_iruby require 'iruby' return ['text/plain', '(empty DataFrame)'] if empty? mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') case mode.upcase when 'PLAIN' ['text/plain', inspect] when 'MINIMUM' ['text/plain', shape_str] when 'TDR' size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str] else # 'TABLE' ['text/html', html_table] end end private # ===== def shape_str(with_id: false) shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}" id = with_id ? format(', 0x%016x', object_id) : '' "#{self.class} : #{shape_info}#{id}" end def dataframe_info(limit, tally_level: 5, max_element: 5) return '' if empty? limit = n_keys if [:all, -1].include? limit tallys = vectors.map(&:tally) levels = tallys.map(&:size) type_groups = @table.columns.map { |column| type_group(column.data_type) } quoted_keys = keys.map(&:inspect) headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' } header_format = make_header_format(levels, headers, quoted_keys) sio = StringIO.new # output string buffer sio.puts "Vector#{pl(n_keys)} : #{var_type_count(type_groups).join(', ')}" sio.printf header_format, *headers.values vectors.each.with_index do |vector, i| if i >= limit sio << " ... #{n_keys - i} more Vector#{pl(n_keys - i)} ...\n" break end key = quoted_keys[i] type = types[i] type_group = type_groups[i] data_tally = tallys[i] a = case type_group when :numeric, :string, :boolean if data_tally.size <= tally_level && data_tally.size != size [data_tally.to_s] else [shorthand(vector, size, max_element)].concat na_string(vector) end else [shorthand(vector, size, max_element)] end sio.printf header_format, i, key, type, data_tally.size, a.join(', ') end sio.string end def make_header_format(levels, headers, quoted_keys) # find longest word to adjust width w_idx = n_keys.to_s.size w_key = [quoted_keys.map(&:size).max, headers[:key].size].max w_type = [types.map(&:size).max, headers[:type].size].max w_level = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_level}s %s\n" end def type_group(data_type) case data_type when Arrow::NumericDataType then :numeric when Arrow::StringDataType then :string when Arrow::BooleanDataType then :boolean when Arrow::TemporalDataType then :temporal else :other end end def var_type_count(type_groups) tg = type_groups.tally a = [] a << "#{tg[:numeric]} numeric" if tg[:numeric] a << "#{tg[:string]} string#{pl(tg[:string])}" if tg[:string] a << "#{tg[:boolean]} boolean" if tg[:boolean] a << "#{tg[:temporal]} temporal" if tg[:temporal] a end def shorthand(vector, size, max_element) max = vector.temporal? ? 2 : max_element a = vector.to_a.take(max) a.map! { |e| e.nil? ? 'nil' : e.inspect } a << '... ' if size > max "[#{a.join(', ')}]" end def na_string(vector) n_nan = vector.n_nans n_nil = vector.n_nils a = [] return a if (n_nan + n_nil).zero? a << "#{n_nan} NaN#{pl(n_nan)}" unless n_nan.zero? a << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero? a end def format_table(width: 80, head: 5, tail: 3, n_digit: 2) return " #{keys.join(' ')}\n (Empty Vectors)\n" if size.zero? original = self indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size] df = slice(indices).assign do assigner = { INDEX_KEY => indices.map(&:to_s) } vectors.each_with_object(assigner) do |v, a| a[v.key] = v.to_a.map do |e| if e.nil? '(nil)' elsif v.float? e.round(n_digit).to_s elsif v.string? e else e.to_s end end end end df = df.pick { [INDEX_KEY, keys - [INDEX_KEY]] } df = size > head + tail ? df[0, 0, 0..head, -tail..-1] : df[0, 0, 0..-1] df = df.assign do vectors.each_with_object({}) do |v, assigner| vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s) .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>") assigner[v.key] = original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec end end width_list = df.vectors.map { |v| v.to_a.map(&:length).max } total_length = width_list[-1] # reserved for last column formats = [] row_ellipsis = nil df.vectors.each_with_index do |v, i| w = width_list[i] if total_length + w > width && i < df.n_keys - 1 row_ellipsis = i formats << '%3s' formats << format_for_column(df.vectors[-1], original, width_list[-1]) break end formats << format_for_column(v, original, w) total_length += w end format_str = formats.join(' ') str = StringIO.new if row_ellipsis df = df[df.keys[0..row_ellipsis], df.keys[-1]] df = df.assign(df.keys[row_ellipsis] => ['...'] * df.size) end df.to_a.each do |row| str.puts format(format_str, *row).rstrip end str.string end def format_for_column(vector, original, width) if vector.key != INDEX_KEY && !original[vector.key].numeric? "%-#{width}s" else "%#{width}s" end end def html_table reduced = size > 8 ? self[0..4, -4..-1] : self converted = reduced.assign do vectors.select.with_object({}) do |vector, assigner| assigner[vector.key] = vector.map do |element| case element in TrueClass '(true)' in FalseClass '(false)' in NilClass '(nil)' in '' '""' in String element.sub(/^(\s+)$/, '"\1"') # blank spaces in Float format('%g', element) in Integer format('%d', element) end end end end html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15) "#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}" end end end