lib/masticate/mender.rb in masticate-0.0.4 vs lib/masticate/mender.rb in masticate-0.1.0
- old
+ new
@@ -1,63 +1,77 @@
# Repair delimited input files.
#
# A row that contains fewer fields than expected has been split across two or
# more lines (due to a newline embedded in a field). Glue those lines back
# into a single line in the output.
#
# NOTE(review): +with_input+, +get+ and +emit+ are not defined in this class;
# they are presumably inherited from Masticate::Base -- confirm there.
class Masticate::Mender < Masticate::Base
  # Column separator in effect for the current/most recent #mend call.
  attr_reader :col_sep

  # filename - name of the delimited file to repair. It is only stored here;
  #            nothing is opened until #mend runs.
  def initialize(filename)
    @filename = filename
  end

  # Read the input, glue split rows back together and write the result.
  #
  # opts - Hash of options:
  #        :output  - path of a file to write to; when absent, $stdout is used.
  #        :col_sep - column separator String (defaults to ',').
  #        :snip    - Integer number of leading header fields to drop, or nil.
  #
  # The first non-blank line is trusted as the header row; its field count
  # (after any :snip) becomes the expected field count for all later rows.
  #
  # Returns a Hash with :input_count, :output_count and :headers.
  # Raises RuntimeError when :snip is neither an Integer nor nil.
  def mend(opts)
    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @col_sep = opts[:col_sep] || ','

    expected_field_count = nil
    headers = nil
    @output_count = 0

    begin
      with_input do |input|
        while (line = get) do
          # blank lines are neither counted as rows nor emitted
          next if line =~ /^\s*$/

          if !expected_field_count
            # trust the first row
            headers = explode(line)
            case opts[:snip]
            when Integer
              # Integer rather than Fixnum: Fixnum is gone from modern Ruby
              headers.shift(opts[:snip])
            when nil
              # do nothing
            else
              raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
            end
            expected_field_count = headers.count
            emit(headers.to_csv(:col_sep => @col_sep))
          else
            running_count = fieldcount(line)
            # keep appending following lines until the row is wide enough
            # or the input is exhausted
            while !input.eof? && running_count < expected_field_count do
              nextbit = get
              if nextbit
                line = line + ' ' + nextbit
                running_count = fieldcount(line)
              end
            end

            # NOTE(review): rows with two or fewer delimiters are dropped
            # regardless of expected_field_count, so files with three or
            # fewer columns lose all data rows -- confirm this is intended.
            if line.count(col_sep) > 2
              emit(line)
            end
          end
        end
      end
    ensure
      # close the output file even when processing raises; $stdout is left open
      @output.close if opts[:output]
    end

    {
      # @input_count is not assigned in this class; presumably maintained by
      # the inherited input helpers -- confirm against Masticate::Base
      :input_count => @input_count,
      :output_count => @output_count,
      :headers => headers
    }
  end

  # Count the fields in +line+.
  #
  # With a comma separator the line is parsed as CSV so that separators
  # inside quoted fields are not counted. A fragment of a row that was split
  # inside a quoted field has an unclosed quote and cannot be parsed; in that
  # case fall back to the raw delimiter count so the caller can keep gluing
  # lines instead of aborting with CSV::MalformedCSVError.
  #
  # Returns an Integer.
  def fieldcount(line)
    raw_count = line.count(col_sep) + 1
    return raw_count unless col_sep == ','

    begin
      # parse_line returns nil for an empty line
      (CSV.parse_line(line) || []).count
    rescue CSV::MalformedCSVError
      raw_count
    end
  end

  # Split +line+ into its fields, each with surrounding whitespace removed.
  #
  # CSV.parse_line yields nil for empty unquoted fields (e.g. "a,,b"), so
  # every field is converted with to_s before stripping.
  #
  # Returns an Array of Strings.
  def explode(line)
    fields =
      if col_sep == ','
        CSV.parse_line(line) || []
      else
        line.split(col_sep)
      end
    fields.map { |field| field.to_s.strip }
  end
end