#!/usr/bin/env ruby

# Help text printed by `wulign --help` (Textile-style markup).
#
# NOTE: this is an interpolating %Q literal, so a backslash that should appear
# in the printed help must be doubled (see the `split("\\t")` example below).
# The figures quoted in the text (500 guessing lines, 100-char maximum width)
# mirror FORMAT_GUESSING_LINES and MAX_MAX_WIDTH further down in this file;
# keep them in sync when changing those constants.
USAGE = %Q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
# <pre><code>
#     2009-07-21T21:39:40 day     65536   3.15479 68750   1171316
#     2009-07-21T21:39:45 doing   65536   1.04533 26230   1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   0.87574e-05     23707   10051141
#     2009-07-21T21:44:00 concert 500     0.29290 13367   9733414
#     2009-07-21T21:44:29 world   65536   1.09110 32850   200916
#     2009-07-21T21:44:39 world+series    65536   0.49380 9929    7972025
#     2009-07-21T21:44:54 iranelection    65536   2.91775 14592   136342
# </code></pre>
#
# wulign will reformat it to read
#
# <pre><code>
#     2009-07-21T21:39:40 day             65536   3.154791234     68750    1171316
#     2009-07-21T21:39:45 doing           65536   1.045330000     26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   0.000008757     23707   10051141
#     2009-07-21T21:44:00 concert           500   0.292900000     13367    9733414
#     2009-07-21T21:44:29 world           65536   1.091100000     32850     200916
#     2009-07-21T21:44:39 world+series    65536   0.493800000      9929    7972025
#     2009-07-21T21:44:54 iranelection    65536   2.917750000     14592     136342
# </code></pre>
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# @wulign(foo).split("\\t").map(&:strip)@ *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
# (Note: tab characters in this source code file have been converted to spaces;
# replace whitespace with tab in the first example if you'd like to play along at
# home.)
#
# h2. How it works
#
# Wulign takes the first 500 lines, splits by TAB characters into fields, and
# tries to guess the format -- int, float, or string -- for each. It builds a
# consensus of the width and type for corresponding columns in the chunk. If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied). So with the example above,
#
# @cat foo | wulign '' '' '' '%8.4e'@
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
# <pre><code>
#     ...
#     2009-07-21T21:39:45 doing           65536   1.0453e+00      26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   8.7574e-06      23707   10051141
#     2009-07-21T21:44:00 concert           500   2.9290e-01      13367    9733414
#     ....
# </code></pre>
#
# h2. Notes
#
# * A first line whose fields are all non-numeric text is treated as a header
#   row: it is printed left-aligned and padded to the column widths, and the
#   column types are taken from the lines that follow it.
#
# * It also requires a unanimous vote. One screwy line can coerce the whole mess
#   to :mixed; width formatting will still be applied, though.
#
# * It won't set columns wider than 100 chars -- this allows for the occasional
#   super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
#   digits to the left and right of the decimal point.
#
# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
#   or escaping; every tab delimits a field, every newline a record.
}

# `--help` as the first argument prints the program name and the usage text,
# then exits before any input is read.
if ARGV[0] == '--help'
  puts $0
  puts USAGE
  exit
end

#
# How many initial lines to use to guess formatting.
# Lines after this are simply reformatted according to the consensus of the
# initial FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500
# widest column to set
MAX_MAX_WIDTH = 100

# A field made only of digits.
INT_RE = /\A\d+\z/
# Digits, an optional fractional part and an optional exponent.
# Capture 1 is the digits left of the decimal point, capture 2 (possibly nil)
# the digits right of it.
FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/

# Classify a single field.
#
# @param val [String] one (already stripped) field
# @return [Symbol, nil] :int, :float or :str; nil for an empty field
def get_type(val)
  case val
  when ''       then nil
  when INT_RE   then :int
  when FLOAT_RE then :float
  else               :str
  end
end

# Fold one more field into the running type consensus of its column.
#
# @param val      [String]      the field being looked at
# @param alltype  [Symbol, nil] consensus so far (nil when nothing is known yet)
# @param is_first [Boolean]     true for the row right after the first one; a
#   :str consensus is then simply replaced, so a text header does not decide
#   the column type
# @return [Symbol, nil] the new consensus: :int, :float, :str or :mixed (nil
#   only while every field seen so far was empty)
def consensus_type(val, alltype, is_first)
  return :mixed if alltype == :mixed

  type = get_type(val)
  # An empty field casts no vote. The caller stores whatever is returned here,
  # so hand back the existing consensus instead of wiping it out.
  return alltype if type.nil?

  return type if alltype.nil? || alltype == type
  return type if is_first && alltype == :str

  # alltype and type differ here, so "both numeric" means one :int, one :float.
  numeric = [:int, :float]
  return :float if numeric.include?(alltype) && numeric.include?(type)

  :mixed
end

# Count the digits on either side of the decimal point.
#
# @param str [String] a field
# @return [Array(Integer, Integer)] [digits left of the point, digits right of
#   it]; [0, 0] when str does not match FLOAT_RE
def f_width(str)
  match = FLOAT_RE.match(str)
  return [0, 0] unless match

  [match[1].length, match[2].to_s.length]
end

# Split one input line into stripped fields.
# note -- String#split drops trailing empty columns
#
# @param line [String] a raw input line
# @return [Array<String>]
def split_row(line)
  line.chomp.split("\t").map(&:strip)
end

# Build the formatter for one column.
#
# @param width   [Integer, nil] widest field seen in the column (capped)
# @param type    [Symbol, nil]  consensus type of the column
# @param minmag  [Integer, nil] digits to print right of the decimal point
# @param maxmag  [Integer, nil] digits to allow left of the decimal point
# @param default [String, nil]  sprintf-style format given on the command line;
#   when non-blank it replaces the guessed format
# @return [Proc] takes the field and returns the formatted string
# @raise [RuntimeError] if type is not one of the known types
def column_formatter(width, type, minmag, maxmag, default)
  unless default.to_s.empty?
    # Best effort: a field the user's format can't digest is passed through.
    return lambda do |s|
      begin
        default % s
      rescue StandardError
        s
      end
    end
  end

  case type
  when :mixed, :str, nil
    lambda { |s| "%-#{width}s" % s }
  when :int
    # An empty field stays empty (padded) rather than being printed as 0.
    lambda { |s| s.empty? ? "%#{width}s" % '' : "%#{width}d" % s.to_i }
  when :float
    float_width = maxmag + minmag + 2
    lambda { |s| s.empty? ? "%#{float_width}s" % '' : "%#{float_width}.#{minmag}f" % s.to_f }
  else
    raise "oops type #{type}"
  end
end

# Print one data row, tab-joined, each field run through its column formatter.
# Fields without a formatter, or whose formatter raises, are printed unchanged.
#
# @param row    [Array<String>]
# @param format [Array<Proc>]
def dump_row(row, format)
  cells = row.zip(format).map do |cell, formatter|
    begin
      formatter ? formatter.call(cell) : cell
    rescue StandardError
      cell
    end
  end
  puts cells.join("\t")
end

# Print the header row: every field left-aligned and padded to its column width.
#
# @param row  [Array<String>]
# @param maxw [Array<Integer, nil>]
def dump_header(row, maxw)
  puts row.zip(maxw).map { |col, width| "%-#{width}s" % col.to_s }.join("\t")
end

maxw       = []    # per column: widest field seen, capped at MAX_MAX_WIDTH
col_types  = []    # per column: type consensus
col_minmag = []    # per column: most digits seen right of the decimal point
col_maxmag = []    # per column: most digits seen left of the decimal point
rows       = []    # the buffered lines used for guessing
skip_col   = []    # per column: true when the user supplied a format
has_header = false

# Columns with an explicit format on the command line are not type-guessed.
ARGV.each_with_index do |fmt, i|
  next if fmt == ''

  maxw[i] = 0
  skip_col[i] = true
end

FORMAT_GUESSING_LINES.times do
  line = $stdin.gets
  break unless line

  row = split_row(line)
  row.each_with_index do |col, i|
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]

    # Let the first row be text (headers)
    col_types[i] = consensus_type(col, col_types[i], rows.length == 1)

    # Record digit counts for integer fields too: if the column later turns
    # into :float, the integers seen earlier must still fit the float width.
    next unless [:int, :float].include?(col_types[i])

    mantissa, radix = f_width(col)
    col_minmag[i] = [radix, col_minmag[i], 1].compact.max
    col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
  end
  has_header = true if rows.empty? && row.all? { |col| get_type(col) == :str }
  rows << row
end

format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  column_formatter(width, type, minmag, maxmag, default)
end

dump_header(rows.shift, maxw) if has_header
rows.each { |row| dump_row(row, format) }

# Everything past the guessing window is streamed through the same formatters.
$stdin.each_line { |line| dump_row(split_row(line), format) }