Sha256: c35a06f5799f2d3190f73841aa213ae138e392d75edbadf2413273c666993073

Contents?: true

Size: 1.85 KB

Versions: 5

Compression:

Stored size: 1.85 KB

Contents

require "set"

class Masticate::Sniffer < Masticate::Base
  attr_reader :col_sep, :quote_char, :stats
  attr_reader :delimstats

  CandidateDelimiters = [',', '|', "\t", "~"]

  def initialize(filename)
    @filename = filename
  end

  def sniff(opts)
    @col_sep = find_col_sep
    @quote_char = delimstats[@col_sep][:quote_char]
    @stats = stats if opts[:stats]
    {
      :col_sep => @col_sep,
      :quote_char => @quote_char,
      :field_counts => @stats,
      :headers => @line1.split(@col_sep).map(&:strip)
    }
  end

  def find_col_sep
    @delimstats = {}
    with_input do |input|
      input.lines.take(10).each do |line|
        @line1 = line unless @line1

        CandidateDelimiters.each do |delim|
          delimstats[delim] ||= { :counts => Set.new, :quote_char => nil}
          h = delimstats[delim]
          fieldcount, quote_char = consider_delim(line, delim)
          h[:counts] << fieldcount
          h[:quote_char] ||= quote_char
        end
      end
    end
    delimstats.sort_by{|delim,stats| stats[:counts].max || 0}.last.first
  end

  def consider_delim(line, delim)
    @quote_char = nil
    n = count_fields(line, delim)
    [n, @quote_char]
  end

  def count_fields(line, delim)
    if delim == ','
      straight_count = line.count(delim) + 1
      count_with_quoting = begin
        CSV.parse_line(line).count
      rescue CSV::MalformedCSVError
        # this is not valid CSV, e.g. has incorrectly embedded quotes
        0
      end
      if count_with_quoting < straight_count
        @quote_char = '"'
        count_with_quoting
      else
        straight_count
      end
    else
      line.count(delim) + 1
    end
  end

  def stats
    with_input do |input|
      input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :col_sep => col_sep, :quote_char => quote_char || "\0").count] += 1}
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
masticate-0.3 lib/masticate/sniffer.rb
masticate-0.2.3 lib/masticate/sniffer.rb
masticate-0.2.2 lib/masticate/sniffer.rb
masticate-0.2.1 lib/masticate/sniffer.rb
masticate-0.2 lib/masticate/sniffer.rb