#!/usr/bin/env ruby

USAGE= %Q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
#     # cat tag_usage.tsv
#     2009-07-21T21:39:40 day     65536   3.15479 68750   1171316
#     2009-07-21T21:39:45 doing   65536   1.04533 26230   1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   0.87574e-05     23707   10051141
#     2009-07-21T21:44:00 concert 500     0.29290 13367   9733414
#     2009-07-21T21:44:29 world   65536   1.09110 32850   200916
#     2009-07-21T21:44:39 world+series    65536   0.49380 9929    7972025
#     2009-07-21T21:44:54 iranelection    65536   2.91775 14592   136342
#
# wulign will reformat it to read
#
#     # cat tag_usage.tsv | wu-lign
#     2009-07-21T21:39:40 day                   65536   3.154791234 68750    1171316
#     2009-07-21T21:39:45 doing                 65536   1.045330000 26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon        65536   0.000008757 23707   10051141
#     2009-07-21T21:44:00 concert                 500   0.292900000 13367    9733414
#     2009-07-21T21:44:29 world                 65536   1.091100000 32850     200916
#     2009-07-21T21:44:39 world+series          65536   0.493800000  9929    7972025
#     2009-07-21T21:44:54 iranelection          65536   2.917750000 14592     136342
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied).  So with the example above,
#
#     cat foo | wulign  '' '' '' '%8.4e'
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
#     ...
#     2009-07-21T21:39:45 doing           65536   1.0453e+00      26230    1053956
#     2009-07-21T21:41:53 hapaxlegomenon  65536   8.7574e-06      23707   10051141
#     2009-07-21T21:44:00 concert           500   2.9290e-01      13367    9733414
#     ....
#
# h2. How it works
#
# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
# and tries to guess the format (int, float, or string) for each. It builds a
# consensus of the width and type for corresponding columns in the chunk.  If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Notes
#
# * Header rows: the first line is used for width alignment but not for type detection.
#   This means that an initial row of text headers will inform column spacing
#   but still allow a column of floats (say) to be properly aligned as floats.
#
# * It requires a unanimous vote. One screwy line can coerce the whole mess to
#   :mixed; width formatting will still be applied, though.
#
# * It won't set columns wider than 100 chars -- this allows for the occasional
#   super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
#   digits to the left and right of the decimal point.
#
# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
#   delimits a field, every newline a record.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# wulign(foo).split("\t").map(&:strip) *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
}

# When the first command-line argument is exactly --help, print the name the
# script was invoked as followed by the usage text, then quit.
if ARGV.first == '--help'
  puts $0, USAGE
  exit
end

#
# How many initial lines to use to guess formatting.  Lines after this are
# simply reformatted according to the consensus of the initial
# FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500
# Widest column to set; a longer cell does not widen its column past this.
MAX_MAX_WIDTH = 100

# A whole field that is an integer: an optional minus sign, then digits.
INT_RE   = /\A-?\d+\z/
# A whole field that is a decimal number: an optional minus sign, digits, an
# optional fraction, and an optional exponent written with e or E and an
# optional sign (so "8.7574e-06", "1.0453e+00" and "1E5" all qualify).
# Capture 1 is the integer-part digits (sign excluded), capture 2 the
# fractional digits (nil when there is no fraction).
FLOAT_RE = /\A-?(\d+)(?:\.(\d+))?(?:[eE][-+]?\d+)?\z/

# Classify a single cell of text.
#
# Returns nil for an empty string, :int when the whole cell matches INT_RE,
# :float when it matches FLOAT_RE, and :str for anything else.
def get_type(val)
  case val
  when ''       then nil
  when INT_RE   then :int
  when FLOAT_RE then :float
  else               :str
  end
end

#
# Fold one more cell into a column's running type vote.
#
# val      - the cell text
# alltype  - the column's consensus so far: nil (no vote yet), :int, :float,
#            :str or :mixed
# is_first - true when the only vote cast so far came from the first line of
#            input; a :str vote from that line is treated as a header and
#            discarded
#
# Returns the updated consensus:
# * :mixed is sticky -- once mixed, always mixed
# * a blank cell abstains: the consensus is left as it was rather than being
#   reset, so "abc", "", "5" ends up :mixed instead of :int
# * :int and :float together widen to :float
# * any other disagreement gives :mixed
#
def consensus_type(val, alltype, is_first)
  return :mixed if alltype == :mixed
  # The first line may be a header row: drop its :str vote so the data rows
  # below it decide the column type on their own.
  alltype = nil if is_first && alltype == :str
  type = get_type(val)
  # get_type gives nil for an empty cell, which carries no type information.
  return alltype if type.nil?
  return type if alltype.nil? || alltype == type
  numeric = [:int, :float]
  # The types differ here, so two numeric types means one :int and one :float.
  return :float if numeric.include?(alltype) && numeric.include?(type)
  :mixed
end

# Count the digits on each side of the decimal point of a numeric string.
#
# Returns [integer_digits, fraction_digits] when str matches FLOAT_RE (the
# fraction count is 0 when there is no fractional part), or the bare
# integer 0 when it does not match.
def f_width(str)
  return 0 unless str =~ FLOAT_RE
  digits = Regexp.last_match
  [digits[1].length, digits[2].to_s.length]
end

# Per-column statistics gathered from the first FORMAT_GUESSING_LINES lines.
maxw       = []    # widest stripped cell seen, capped at MAX_MAX_WIDTH
col_types  = []    # consensus type as returned by consensus_type
col_minmag = []    # most digits seen right of the decimal point (at least 1)
col_maxmag = []    # most digits seen left of the decimal point (at least 1)
rows       = []    # the sampled rows, held back until the formats are known
skip_col   = []    # true where a command-line format replaces type guessing
has_header = false

# A non-blank command-line argument supplies the format for its column, so
# that column is excluded from type guessing below.
ARGV.each_with_index do |fmt, i|
  next if fmt == ''
  maxw[i]     = 0
  skip_col[i] = true
end

FORMAT_GUESSING_LINES.times do
  # IO#gets returns nil at end of input, so no exception has to be swallowed.
  line = $stdin.gets
  break unless line
  row = line.chomp.split("\t").map { |s| s.strip }
  row.each_with_index do |col, i|
    # Width is tracked for every column, including user-formatted ones.
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]
    # Let the first row be text (headers): the flag is true while the only
    # row collected so far is the first line.
    col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
    # Count digits while the column is still :int as well, so integers seen
    # before the first float still contribute to the float column's width.
    next unless [:int, :float].include?(col_types[i])
    # f_width gives a bare 0 for non-numeric text; radix is then nil and is
    # dropped by compact.
    mantissa, radix = f_width(col)
    col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
    col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
  end
  # Only the very first line can be a header, and only if every cell is text.
  has_header = true if rows.empty? && row.all? { |col| get_type(col) == :str }
  rows << row
end

# Build one formatter (a lambda taking the cell text and returning the padded
# text) per column.
#
# * A non-blank command-line argument is used as the sprintf format for its
#   column; if applying it fails the cell is passed through unchanged.
# * :str, :mixed and untyped columns are left-justified to the column width.
# * :int and :float columns are right-justified.  A cell that does not parse
#   as a number (blank, stray text) is right-justified as-is instead of being
#   turned into 0 by to_i / to_f.
# * :float columns are at least as wide as the column's widest raw cell, so
#   they line up with the header row.
format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  next(lambda{|s| default % s rescue s }) if default.to_s != ''
  case type
  when :mixed, :str, nil
    lambda{|s| "%-#{width}s" % s }
  when :int
    lambda do |s|
      begin
        # base 10 keeps leading zeros from being read as octal
        "%#{width}d" % Integer(s, 10)
      rescue ArgumentError, TypeError
        "%#{width}s" % s
      end
    end
  when :float
    frac_digits = minmag || 1
    # +2 leaves room for the decimal point and a sign
    float_width = [(maxmag || 1) + frac_digits + 2, width.to_i].max
    lambda do |s|
      begin
        "%#{float_width}.#{frac_digits}f" % Float(s)
      rescue ArgumentError, TypeError
        "%#{float_width}s" % s
      end
    end
  else raise "oops type #{type}"
  end
end

# Print one data row to standard output as a single tab-joined line.
#
# row    - array of cell strings
# format - array of callables, one per column; each receives a cell and
#          returns its formatted text
#
# A cell whose formatter is missing or raises a StandardError is printed
# unformatted.
def dump_row(row, format)
  cells = []
  row.zip(format) do |cell, formatter|
    cells << begin
      formatter.call(cell)
    rescue StandardError
      cell
    end
  end
  puts cells.join("\t")
end
# Print a header row to standard output as a single tab-joined line, each
# title left-justified to the matching entry of maxw.
#
# row  - array of header cells (each is converted with to_s)
# maxw - array of column widths; a missing width means no padding
def dump_header(row, maxw)
  padded = row.zip(maxw).map do |pair|
    title, width = pair
    sprintf("%-#{width}s", title.to_s)
  end
  puts padded.join("\t")
end

# One empty string per known column (not referenced again in this part of
# the script).
pad = Array.new(maxw.length, '')

# A detected header line is printed with plain left-justified padding rather
# than with the per-column formatters.
if has_header
  header = rows.shift
  dump_header(header, maxw)
end

# First flush the rows that were held back while guessing formats ...
rows.each { |sampled| dump_row(sampled, format) }

# ... then stream whatever is left on standard input through the same
# formatters.
$stdin.each_line do |raw|
  # note -- strips trailing columns
  cells = raw.chomp.split("\t").map(&:strip)
  dump_row(cells, format)
end