#!/usr/bin/env ruby
USAGE= %Q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
#
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
# 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
# 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
# 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
# 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
# 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
#
#
# wulign will reformat it to read
#
#
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
# 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
# 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
# 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
# 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
# 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
#
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
# (Note: tab characters in this source code file have been converted to spaces;
# replace whitespace with tab in the first example if you'd like to play along at
# home.)
#
# h2. How it works
#
# Wulign takes the first 500 lines, splits by TAB characters into fields, and
# tries to guess the format -- int, float, or string -- for each. It builds a
# consensus of the width and type for corresponding columns in the chunk. If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied). So with the example above,
#
# @cat foo | wulign '' '' '' '%8.4e'@
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
#
# ...
# 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
# 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
# 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
# ....
#
#
# h2. Notes
#
# * It has no knowledge of header rows. An all-text first line will screw everything up.
#
# * It also requires a unanimous vote. One screwy line can coerce the whole mess
# to :mixed; width formatting will still be applied, though.
#
# * It won't set columns wider than 70 chars -- this allows for the occasional
# super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
# digits to the left and right of the decimal point.
#
# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
# or escaping; every tab delimits a field, every newline a record.
}
# When the first command-line argument is --help, print the program name and
# the usage text, then quit without reading any input.
if ARGV.first == '--help'
  puts $0, USAGE
  exit
end
#
# How many initial lines to use to guess formatting. Lines after this are
# simply reformatted according to the consensus of the initial
# FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500
# widest column to set
MAX_MAX_WIDTH = 70
# A whole cell that is an integer: an optional minus sign, then digits only.
INT_RE = /\A-?\d+\z/
# A whole cell that is a decimal number, with optional fraction and exponent.
#
# Capture 1 is the integer part including any minus sign, so its length is the
# number of characters printed left of the decimal point. Capture 2 is the
# fraction digits, or nil when there is no fraction.
#
# The exponent marker may be either case and may carry an explicit sign, so the
# "1.0453e+00" style that sprintf's %e produces is recognised as a number.
FLOAT_RE = /\A(-?\d+)(?:\.(\d+))?(?:e[-+]?\d+)?\z/i
# Fold one cell into the running type consensus for its column.
#
# val     - the cell text.
# alltype - the consensus so far: nil (nothing classified yet), :int, :float,
#           :str or :mixed.
#
# Returns the new consensus:
# * :mixed is sticky -- once reached it is never left.
# * A blank cell casts no vote, so the consensus is returned unchanged.
#   (Returning nil here would throw away every vote cast so far and let a later
#   cell re-decide the column type on its own.)
# * The first classified cell sets the type; agreeing cells keep it.
# * :int and :float together settle on :float.
# * Any other disagreement becomes :mixed.
def consensus_type(val, alltype)
  return :mixed if alltype == :mixed

  type =
    case val
    when ''       then nil
    when INT_RE   then :int
    when FLOAT_RE then :float
    else               :str
    end

  return alltype unless type
  return type if alltype.nil? || alltype == type

  # From here on the two types differ; only the int/float pair is reconcilable.
  numeric = [:int, :float]
  return :float if numeric.include?(alltype) && numeric.include?(type)

  :mixed
end
# Measure the two halves of a number as matched by FLOAT_RE.
#
# str - the cell text.
#
# Returns a two-element array: the length of FLOAT_RE's first capture (the part
# before the decimal point) and the length of its second capture (the fraction
# digits; 0 when there is no fraction). Any exponent is ignored.
#
# Text that does not match FLOAT_RE yields [0, 0], so callers that destructure
# the result always receive two integers.
def f_width(str)
  match = FLOAT_RE.match(str)
  return [0, 0] unless match

  whole, fraction = match[1], match[2]
  [whole.length, fraction ? fraction.length : 0]
end
# Main script: sample the head of stdin to settle on a formatter per column,
# print the sampled rows, then stream the remaining input through the same
# formatters.

# Per-column state, indexed by column position.
maxw       = []  # widest cell seen, capped at MAX_MAX_WIDTH
col_types  = []  # running consensus from consensus_type
col_minmag = []  # most characters seen right of the decimal point (at least 1)
col_maxmag = []  # most characters seen left of the decimal point (at least 1)
rows       = []  # sampled rows, held back until the formats are known
skip_col   = []  # true where the command line supplies an explicit format

# A non-blank positional argument is a sprintf format for the column at that
# position; those columns take no part in type guessing.
ARGV.each_with_index do |arg, i|
  next if arg == ''
  maxw[i]     = 0
  skip_col[i] = true
end

FORMAT_GUESSING_LINES.times do
  # gets returns nil at end of input, so no exception handling is needed.
  line = $stdin.gets
  break unless line

  cols = line.chomp.split("\t").map(&:strip)
  cols.each_with_index do |col, i|
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]

    col_types[i] = consensus_type(col, col_types[i])
    # Measure integer cells as well as float ones: if the column is later
    # promoted to :float, the integers seen earlier still need room left of
    # the decimal point.
    next unless col_types[i] == :int || col_types[i] == :float

    whole_len, frac_len = f_width(col)
    col_minmag[i] = [frac_len,  col_minmag[i], 1].compact.max
    col_maxmag[i] = [whole_len, col_maxmag[i], 1].compact.max
  end
  rows << cols
end

# One formatter (a lambda taking the cell text) per column.
format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  if default.to_s != ''
    # Explicit format from the command line; a cell the format cannot handle is
    # passed through untouched.
    lambda do |s|
      begin
        default % s
      rescue StandardError
        s
      end
    end
  else
    case type
    when :mixed, :str, nil
      lambda { |s| "%-#{width}s" % s }
    when :int
      # Only genuine integers are re-rendered; blank or non-numeric cells are
      # right-aligned as text rather than being turned into 0 by to_i.
      lambda { |s| s =~ INT_RE ? "%#{width}d" % s.to_i : "%#{width}s" % s }
    when :float
      total = maxmag + minmag + 1 # whole part + decimal point + fraction
      # As above: a cell that is not a number is padded, not rewritten as 0.0.
      lambda { |s| s =~ FLOAT_RE ? "%#{total}.#{minmag}f" % s.to_f : "%#{total}s" % s }
    else
      raise "oops type #{type}"
    end
  end
end

# Render one row. Columns beyond those seen while sampling have no formatter
# and are emitted as-is.
format_row = lambda do |cols|
  cols.each_with_index.map { |cell, i| format[i] ? format[i].call(cell) : cell }.join("\t")
end

# note -- split("\t") has already dropped trailing empty columns
rows.each { |row| puts format_row.call(row) }

$stdin.each_line do |line|
  puts format_row.call(line.chomp.split("\t").map(&:strip))
end