lib/masticate/mender.rb in masticate-0.0.4 vs lib/masticate/mender.rb in masticate-0.1.0

- old
+ new

@@ -1,63 +1,77 @@ # repair delimited input files # # A row that contains fewer delimiters than expected has been split across two lines # (due to a newline embedded in a field). Glue those two lines into a single line in the output. -class Masticate::Mender - attr_reader :input +class Masticate::Mender < Masticate::Base + attr_reader :col_sep def initialize(filename) - @input = open(filename) + @filename = filename end def mend(opts) @output = opts[:output] ? File.open(opts[:output], "w") : $stdout - col_sep = opts[:col_sep] + @col_sep = opts[:col_sep] || ',' - expected_delim_count = nil - @input_count = @output_count = 0 - while (line = get) do - unless line =~ /^\s*$/ - if !expected_delim_count - # trust the first row - expected_delim_count = line.count(col_sep) - else - running_count = line.count(col_sep) - while !input.eof? && running_count < expected_delim_count do - nextbit = get - if nextbit - line = line + ' ' + nextbit - running_count = line.count(col_sep) + expected_field_count = nil + headers = nil + @output_count = 0 + with_input do |input| + while (line = get) do + unless line =~ /^\s*$/ + if !expected_field_count + # trust the first row + headers = explode(line) + case opts[:snip] + when Fixnum + headers.shift(opts[:snip]) + when nil + # do nothing + else + raise "Do not understand snip instruction [#{opts[:snip].inspect}]" end + expected_field_count = headers.count + emit(headers.to_csv(:col_sep => @col_sep)) + else + running_count = fieldcount(line) + while !input.eof? && running_count < expected_field_count do + nextbit = get + if nextbit + line = line + ' ' + nextbit + running_count = fieldcount(line) + end + end + + if line.count(col_sep) > 2 + emit(line) + end end end - if line.count(col_sep) > 2 - emit(line) - end end end - @input.close @output.close if opts[:output] { - :input_records => @input_count, - :output_records => @output_count + :input_count => @input_count, + :output_count => @output_count, + :headers => headers } end - def get - line = input.gets - @input_count += 1 - line && line.chomp + def fieldcount(line) + if col_sep == ',' + CSV.parse_line(line).count + else + line.count(col_sep)+1 + end end - def emit(line) - @output_count += 1 - begin - @output.puts line - rescue Errno::EPIPE - # output was closed, e.g. ran piped into `head` - # silently ignore this condition, it's not fatal and doesn't need a warning + def explode(line) + if col_sep == ',' + CSV.parse_line(line).map(&:strip) + else + line.split(col_sep).map(&:strip) end end end