lib/masticate/mender.rb in masticate-0.1.1 vs lib/masticate/mender.rb in masticate-0.1.3
- old
+ new
@@ -16,26 +16,29 @@
@quote_char = opts[:quote_char] || "\0"
expected_field_count = nil
headers = nil
@output_count = 0
+ fieldcounts = Hash.new(0)
with_input do |input|
while (line = get) do
unless line =~ /^\s*$/
if opts[:inlined]
row = explode(line)
ncells = row.count/2-1
if !expected_field_count
headers = row[0..ncells]
expected_field_count = headers.count
+ fieldcounts[headers.count] += 1
emit(headers.to_csv(:col_sep => @col_sep))
else
if row[0..ncells] != headers
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
end
end
- row = row[ncells+1..-1]
+ row = row[ncells+1, expected_field_count]
+ fieldcounts[row.count] += 1
emit(row.to_csv(:col_sep => @col_sep))
elsif !expected_field_count
# trust the first row
headers = explode(line).map(&:strip)
case opts[:snip]
@@ -47,10 +50,11 @@
# do nothing
else
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
end
expected_field_count = headers.count
+ fieldcounts[headers.count] += 1
emit(headers.to_csv(:col_sep => @col_sep))
else
running_count = fieldcount(line)
while !input.eof? && running_count < expected_field_count do
nextbit = get
@@ -59,10 +63,11 @@
running_count = fieldcount(line)
end
end
unless opts[:dejunk] && junky?(line)
+ fieldcounts[fieldcount(line)] += 1
emit(line)
end
end
end
end
@@ -70,19 +75,20 @@
@output.close if opts[:output]
{
:input_count => @input_count,
:output_count => @output_count,
+ :field_counts => fieldcounts,
:headers => headers
}
end
# Reports how many fields a single line of delimited text contains.
# The line is split with #explode and the resulting cells are counted.
def fieldcount(line)
  cells = explode(line)
  cells.count
end
# Splits one line of delimited text into its fields by handing it to
# CSV.parse_line together with a column separator and @quote_char.
# Diff note: the old (-) line passes a bare `col_sep`, whereas the new (+)
# line passes the @col_sep instance variable, the same value that is given
# to to_csv when rows are emitted elsewhere in this file.
def explode(line)
- CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
+ CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
end
# a line is "junky" if it has 2 or fewer fields with any content
def junky?(line)
explode(line).select {|s| s && !s.strip.empty?}.count <= 2