#!/usr/bin/env ruby

# Help text printed by `wulign --help`.
#
# Written as a non-interpolating %q literal so that the text is emitted
# verbatim: with %Q the documented `split("\t")` would be printed with a
# literal TAB character between the quotes instead of the two characters \t.
# Every line deliberately starts with '#' -- the help is shown in the same
# commented, textile-ish form it is written in.
USAGE = %q{
# h1. wulign -- format a tab-separated file as aligned columns
#
# wulign will intelligently reformat a tab-separated file into a tab-separated,
# space aligned file that is still suitable for further processing. For example,
# given the log-file input
#
#     # cat tag_usage.tsv
#     2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
#     2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
#     2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
#     2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
#     2009-07-21T21:44:29 world 65536 1.09110 32850 200916
#     2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
#     2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
#
# wulign will reformat it to read
#
#     # cat tag_usage.tsv | wu-lign
#     2009-07-21T21:39:40 day            65536  3.154791234 68750  1171316
#     2009-07-21T21:39:45 doing          65536  1.045330000 26230  1053956
#     2009-07-21T21:41:53 hapaxlegomenon 65536  0.000008757 23707 10051141
#     2009-07-21T21:44:00 concert          500  0.292900000 13367  9733414
#     2009-07-21T21:44:29 world          65536  1.091100000 32850   200916
#     2009-07-21T21:44:39 world+series   65536  0.493800000  9929  7972025
#     2009-07-21T21:44:54 iranelection   65536  2.917750000 14592   136342
#
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
# pad out fields. You can still use cuttab and friends to manipulate columns.
#
# h2. Command-line arguments
#
# You can give sprintf-style positional arguments on the command line that will be
# applied to the corresponding columns. (Blank args are used for placeholding and
# auto-formatting is still applied). So with the example above,
#
#     cat foo | wulign '' '' '' '%8.4e'
#
# will format the fourth column with "%8.4e", while the first three columns and
# fifth-and-higher columns are formatted as usual.
#
#     ...
#     2009-07-21T21:39:45 doing          65536 1.0453e+00 26230  1053956
#     2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
#     2009-07-21T21:44:00 concert          500 2.9290e-01 13367  9733414
#     ....
#
# h2. How it works
#
# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
# and tries to guess the format (int, float, or string) for each. It builds a
# consensus of the width and type for corresponding columns in the chunk. If a
# column has mixed numeric and string formats it degrades to :mixed, which is
# basically treated as :string. If a column has mixed :float and :int elements all
# of them are formatted as float.
#
# h2. Notes
#
# * Header rows: the first line is used for width alignment but not for type detection.
#   This means that an initial row of text headers will inform column spacing
#   but still allow a column of floats (say) to be properly aligned as floats.
#
# * It requires a unanimous vote. One screwy line can coerce the whole mess to
#   :mixed; width formatting will still be applied, though.
#
# * It won't set columns wider than 100 chars -- this allows for the occasional
#   super-wide column without completely breaking your screen.
#
# * For :float values, wulign tries to guess at the right number of significant
#   digits to the left and right of the decimal point.
#
# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
#   delimits a field, every newline a record.
#
# wulign isn't intended to be smart, or correct, or reliable -- only to be
# useful for previewing and organizing tab-formatted files. In general
# wulign(foo).split("\t").map(&:strip) *should* give output semantically
# equivalent to its input. (That is, the only changes should be insertion of
# spaces and re-formatting of numerics.) But still -- reserve its use for human
# inspection only.
#
}.freeze

# `--help` is only recognised as the first argument; every other argument is
# treated as a per-column sprintf format. Prints the program name followed by
# the usage text, then exits without reading any input.
if ARGV[0] == '--help'
  puts $0
  puts USAGE
  exit
end

#
# How many initial lines to use to guess formatting.
# Lines after this are simply reformatted according to the consensus of the
# initial FORMAT_GUESSING_LINES.
#
FORMAT_GUESSING_LINES = 500

# widest column to set
MAX_MAX_WIDTH = 100

# Integer: an optional sign, then digits with optional thousands separators.
INT_RE = /\A[+-]?\d[\d,]*\z/
# Float: capture 1 is the digits left of the decimal point (sign excluded),
# capture 2 the digits right of it; an e/E exponent may follow. The sign and
# the E / e+NN spellings are accepted because the formatters below already
# keep exactly those characters when converting a cell.
FLOAT_RE = /\A[+-]?(\d[\d,]*)(?:\.(\d+))?(?:[eE][+-]?\d+)?\z/
# Types that are rendered with a numeric sprintf conversion.
NUMERIC_TYPES = [:int, :float].freeze

# Classify a single (already stripped) cell.
#
# Returns nil for a blank cell, :int or :float for numbers, :str otherwise.
def get_type(val)
  case val
  when ''       then nil
  when INT_RE   then :int
  when FLOAT_RE then :float
  else :str
  end
end

# Fold one more cell into a column's running type.
#
# val      -- the cell text
# alltype  -- the column's type so far (nil when nothing has voted yet)
# is_first -- true while alltype comes only from the first (header) row; a
#             :str header is then allowed to be replaced by the data's type
#
# Returns the new column type. A blank cell casts no vote and leaves alltype
# untouched (it used to reset the column to nil, so "1.5", "", "2" ended up
# as :int). :int and :float merge to :float; any other disagreement is :mixed,
# which is sticky.
def consensus_type(val, alltype, is_first)
  return :mixed if alltype == :mixed
  type = get_type(val)
  return alltype if type.nil?
  return type if alltype.nil? || alltype == type
  return type if is_first && alltype == :str
  return :float if NUMERIC_TYPES.include?(alltype) && NUMERIC_TYPES.include?(type)
  :mixed
end

# Digit counts on either side of the decimal point.
#
# Returns [digits_left, digits_right]; [0, 0] when str is not numeric, so the
# result can always be destructured into two integers.
def f_width(str)
  match = FLOAT_RE.match(str)
  return [0, 0] unless match
  [match[1].length, match[2] ? match[2].length : 0]
end

# Split one raw input line into stripped cells.
def split_row(line)
  # note -- String#split strips trailing (empty) columns
  line.chomp.split("\t").map { |cell| cell.strip }
end

# Build the lambda that renders one cell of a column.
#
# type    -- consensus type of the column (:int, :float, :str, :mixed or nil)
# width   -- field width to pad to
# minmag  -- digits to print right of the decimal point (only used for :float)
# default -- user supplied sprintf format; when non-blank it wins over type
#
# Numeric formatters only convert cells that really parse as numbers. Blank
# or non-numeric cells are padded and passed through, rather than being
# printed as 0 / 0.0.
def column_formatter(type, width, minmag, default)
  unless default.to_s == ''
    return lambda do |s|
      begin
        default % s
      rescue StandardError
        s # the cell does not fit the user's format -- emit it untouched
      end
    end
  end

  case type
  when :mixed, :str, nil
    lambda { |s| "%-#{width}s" % s }
  when :int
    lambda do |s|
      if get_type(s) == :int
        "%#{width}d" % s.gsub(/[^\d\-\+]+/, '').to_i
      else
        "%#{width}s" % s
      end
    end
  when :float
    lambda do |s|
      if NUMERIC_TYPES.include?(get_type(s))
        "%#{width}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, '').to_f
      else
        "%#{width}s" % s
      end
    end
  else
    raise "oops type #{type}"
  end
end

# Write one data row to out, each cell rendered by the formatter in the same
# position. Cells beyond the known columns are emitted as they are.
def dump_row(row, format, out = $stdout)
  cells = row.zip(format).map do |cell, formatter|
    begin
      formatter ? formatter.call(cell) : cell
    rescue StandardError
      cell
    end
  end
  out.puts cells.join("\t")
end

# Write the header row to out: every cell left-aligned to its column width.
def dump_header(row, maxw, out = $stdout)
  out.puts row.zip(maxw).map { |col, width| "%-#{width}s" % col.to_s }.join("\t")
end

# Align a tab-separated stream.
#
# input        -- IO-like object answering gets and each_line
# output       -- IO-like object answering puts
# user_formats -- per-column sprintf formats; '' means "auto-format this column"
#
# The first FORMAT_GUESSING_LINES lines are buffered to vote on each column's
# width and type; they are then written out, followed by the rest of the
# input formatted the same way.
def wulign(input, output, user_formats)
  maxw = []
  col_types = []
  col_minmag = []
  col_maxmag = []
  header_typed = [] # true while a column's type comes from the first row only
  skip_col = []
  rows = []
  has_header = false

  # Columns with a user supplied format take no part in type guessing.
  user_formats.each_with_index do |fmt, i|
    next if fmt == ''
    maxw[i] = 0
    skip_col[i] = true
  end

  FORMAT_GUESSING_LINES.times do
    line = input.gets
    break unless line
    row = split_row(line)
    row.each_with_index do |cell, i|
      maxw[i] = [[cell.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
      next if skip_col[i]
      # Let the first row be text (headers)
      col_types[i] = consensus_type(cell, col_types[i], header_typed[i])
      cell_type = get_type(cell)
      next if cell_type.nil?
      header_typed[i] = rows.empty?
      next unless NUMERIC_TYPES.include?(cell_type)
      # Recorded for every numeric cell, so integers seen before the column
      # turned out to be :float still count towards its width.
      mantissa, radix = f_width(cell)
      col_minmag[i] = [radix, col_minmag[i], 1].compact.max
      col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
    end
    has_header = true if rows.empty? && row.all? { |cell| get_type(cell) == :str }
    rows << row
  end

  # A float column needs room for its widest integer part, the point, the
  # fraction and a sign; share that width with the header so both line up.
  col_types.each_with_index do |type, i|
    next unless type == :float
    maxw[i] = [maxw[i], col_maxmag[i] + col_minmag[i] + 2].max
  end

  format = maxw.zip(col_types, col_minmag, user_formats).map do |width, type, minmag, default|
    column_formatter(type, width, minmag, default)
  end

  dump_header(rows.shift, maxw, output) if has_header
  rows.each { |row| dump_row(row, format, output) }
  input.each_line { |line| dump_row(split_row(line), format, output) }
end

wulign($stdin, $stdout, ARGV)