#!/usr/bin/env ruby
# frozen_string_literal: true

#
# How many initial lines to use to guess formatting. Lines after this are
# simply reformatted according to the consensus of the initial
# FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500
# widest column to set
MAX_MAX_WIDTH = 70

# A cell made only of digits.
INT_RE   = /\A\d+\z/
# Digits, optional fraction, optional exponent. Capture 1 is the integer part,
# capture 2 (may be nil) the fractional digits.
FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/

# Column types that can be merged into :float when they disagree.
NUMERIC_TYPES = [:int, :float].freeze

# Help text printed for --help. The sample size and width cap are interpolated
# from the constants above so the text cannot drift from the code.
USAGE = %Q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
#     2009-07-21T21:39:40 day     65536   3.15479 68750   1171316
#     2009-07-21T21:39:45 doing   65536   1.04533 26230   1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   0.87574e-05     23707   10051141
#     2009-07-21T21:44:00 concert 500     0.29290 13367   9733414
#     2009-07-21T21:44:29 world   65536   1.09110 32850   200916
#     2009-07-21T21:44:39 world+series    65536   0.49380 9929    7972025
#     2009-07-21T21:44:54 iranelection    65536   2.91775 14592   136342
#
# wulign will reformat it to read
#
#     2009-07-21T21:39:40 day                   65536   3.154791234 68750    1171316
#     2009-07-21T21:39:45 doing                 65536   1.045330000 26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon        65536   0.000008757 23707   10051141
#     2009-07-21T21:44:00 concert                 500   0.292900000 13367    9733414
#     2009-07-21T21:44:29 world                 65536   1.091100000 32850     200916
#     2009-07-21T21:44:39 world+series          65536   0.493800000  9929    7972025
#     2009-07-21T21:44:54 iranelection          65536   2.917750000 14592     136342
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# @wulign(foo).split("\\t").map(&:strip)@ *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
# (Note: tab characters in this source code file have been converted to spaces;
# replace whitespace with tab in the first example if you'd like to play along at
# home.)
#
# h2. How it works
#
# Wulign takes the first #{FORMAT_GUESSING_LINES} lines, splits by TAB characters into fields, and
# tries to guess the format -- int, float, or string -- for each. It builds a
# consensus of the width and type for corresponding columns in the chunk. If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied). So with the example above,
#
# @cat foo | wulign '' '' '' '%8.4e'@
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
#     ...
#     2009-07-21T21:39:45 doing           65536   1.0453e+00      26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   8.7574e-06      23707   10051141
#     2009-07-21T21:44:00 concert           500   2.9290e-01      13367    9733414
#     ....
#
# h2. Notes
#
# * It has no knowledge of header rows. An all-text first line will screw everything up.
#
# * It also requires a unanimous vote. One screwy line can coerce the whole mess
#   to :mixed; width formatting will still be applied, though.
#
# * Blank cells do not vote, and are left blank (padded) in the output.
#
# * It won't set columns wider than #{MAX_MAX_WIDTH} chars -- this allows for the occasional
#   super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
#   digits to the left and right of the decimal point.
#
# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
#   or escaping; every tab delimits a field, every newline a record.
}

if ARGV[0] == '--help'
  puts $PROGRAM_NAME
  puts USAGE
  exit
end

# Folds one cell into the running type vote for its column.
#
# val     - the (already stripped) cell text
# alltype - the consensus so far: nil (no vote yet), :int, :float, :str or :mixed
#
# Returns the updated consensus. :mixed is sticky; :int and :float merge to
# :float; any other disagreement becomes :mixed.
def consensus_type(val, alltype)
  return :mixed if alltype == :mixed

  type =
    case val
    when ''       then nil
    when INT_RE   then :int
    when FLOAT_RE then :float
    else               :str
    end

  # A blank cell carries no information: keep whatever the column has voted so
  # far instead of wiping it out.
  return alltype unless type
  return type if alltype.nil? || alltype == type

  NUMERIC_TYPES.include?(alltype) && NUMERIC_TYPES.include?(type) ? :float : :mixed
end

# Counts the digits on each side of the decimal point.
#
# str - cell text
#
# Returns [integer_digits, fractional_digits]; [0, 0] when str does not match
# FLOAT_RE (so callers can always destructure the result).
def f_width(str)
  match = FLOAT_RE.match(str)
  return [0, 0] unless match

  [match[1].length, match[2].to_s.length]
end

# Splits one input line into stripped fields.
#
# The -1 limit keeps trailing empty fields, so the output has exactly as many
# tabs as the input.
def split_fields(line)
  line.chomp.split("\t", -1).map { |field| field.strip }
end

# Builds the lambda that renders one column's cells.
#
# width   - padded width for string / integer columns
# type    - consensus type of the column (nil, :mixed, :str, :int, :float)
# minmag  - digits to show right of the decimal point (:float only)
# maxmag  - digits to allow left of the decimal point (:float only)
# default - user-supplied sprintf format from the command line; blank/nil means
#           "auto-format"
#
# Raises RuntimeError for an unknown type.
def column_formatter(width, type, minmag, maxmag, default)
  unless default.to_s.empty?
    # Best effort: a cell the user's format cannot digest is passed through as is.
    return lambda do |cell|
      begin
        default % cell
      rescue StandardError
        cell
      end
    end
  end

  case type
  when :mixed, :str, nil
    lambda { |cell| format("%-#{width}s", cell) }
  when :int
    # Blank or non-integer cells are padded, never rewritten as 0.
    lambda { |cell| cell =~ INT_RE ? format("%#{width}d", cell.to_i) : cell.rjust(width) }
  when :float
    total = maxmag + minmag + 1 # integer digits + '.' + fractional digits
    lambda { |cell| cell =~ FLOAT_RE ? format("%#{total}.#{minmag}f", cell.to_f) : cell.rjust(total) }
  else
    raise "oops type #{type}"
  end
end

# Renders one row; cells beyond the known columns are passed through untouched.
def format_row(cols, formatters)
  cols.zip(formatters).map { |cell, fmt| fmt ? fmt.call(cell) : cell }.join("\t")
end

# Reads the next line, treating end-of-file and read errors alike as "no more
# input" (returns nil).
def read_line(io)
  io.gets
rescue IOError, SystemCallError
  nil
end

maxw       = [] # widest cell seen per column, capped at MAX_MAX_WIDTH
col_types  = [] # type consensus per column
col_minmag = [] # most fractional digits seen per column
col_maxmag = [] # most integer digits seen per column
rows       = [] # the sampled rows, held back until formats are known
skip_col   = [] # columns whose format was given on the command line

ARGV.each_with_index do |arg, i|
  next if arg == ''

  maxw[i]     = 0
  skip_col[i] = true
end

input_exhausted = false
FORMAT_GUESSING_LINES.times do
  line = read_line($stdin)
  unless line
    input_exhausted = true
    break
  end

  cols = split_fields(line)
  cols.each_with_index do |col, i|
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]

    col_types[i] = consensus_type(col, col_types[i])
    # Track digit counts for every cell, not only once the column is already
    # :float, so integers seen before the first float still count.
    mantissa, radix = f_width(col)
    col_minmag[i] = [radix, col_minmag[i], 1].compact.max
    col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
  end
  rows << cols
end

formatters = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  column_formatter(width, type, minmag, maxmag, default)
end

rows.each { |row| puts format_row(row, formatters) }

# Anything past the sample is streamed through the formats already chosen.
# Skip the second read if the sample already hit end-of-input.
unless input_exhausted
  $stdin.each_line { |line| puts format_row(split_fields(line), formatters) }
end