Sha256: 40d80996f2b0cd594919ab9da5260a3804be201529ae117e9eaa20936c1a9787

Contents?: true

Size: 1.89 KB

Versions: 10

Compression:

Stored size: 1.89 KB

Contents

require "set"

class Masticate::Sniffer < Masticate::Base
  attr_reader :col_sep, :quote_char, :stats
  attr_reader :delimstats

  CandidateDelimiters = [',', '|', "\t", "~"]

  def initialize(filename)
    @filename = filename
  end

  def sniff(opts)
    @col_sep = find_col_sep
    @quote_char = delimstats[@col_sep][:quote_char]
    @stats = stats if opts[:stats]
    {
      :col_sep => @col_sep,
      :quote_char => @quote_char,
      :field_counts => @stats,
      :headers => @line1.split(@col_sep).map(&:strip)
    }
  end

  def find_col_sep
    @delimstats = {}
    with_input do |input|
      lines = 10.times.map{get}.compact
      lines.each do |line|
        @line1 = line unless @line1

        CandidateDelimiters.each do |delim|
          delimstats[delim] ||= { :counts => Set.new, :quote_char => nil}
          h = delimstats[delim]
          fieldcount, quote_char = consider_delim(line, delim)
          h[:counts] << fieldcount
          h[:quote_char] ||= quote_char
        end
      end
    end
    delimstats.sort_by{|delim,stats| stats[:counts].max || 0}.last.first
  end

  def consider_delim(line, delim)
    @quote_char = nil
    n = count_fields(line, delim)
    [n, @quote_char]
  end

  def count_fields(line, delim)
    if delim == ','
      straight_count = line.count(delim) + 1
      count_with_quoting = begin
        CSV.parse_line(line).count
      rescue CSV::MalformedCSVError
        # this is not valid CSV, e.g. has incorrectly embedded quotes
        0
      end
      if count_with_quoting < straight_count
        @quote_char = '"'
        count_with_quoting
      else
        straight_count
      end
    else
      line.count(delim) + 1
    end
  end

  def stats
    counts = Hash.new(0)
    with_input do |input|
      while line = get
        counts[CSV.parse_line(line, :col_sep => col_sep, :quote_char => quote_char || "\0").count] += 1
      end
    end
    counts
  end
end

Version data entries

10 entries across 10 versions & 1 rubygems

Version Path
masticate-0.6.2 lib/masticate/sniffer.rb
masticate-0.6.1 lib/masticate/sniffer.rb
masticate-0.6.0 lib/masticate/sniffer.rb
masticate-0.5.1 lib/masticate/sniffer.rb
masticate-0.5.0 lib/masticate/sniffer.rb
masticate-0.4.2 lib/masticate/sniffer.rb
masticate-0.4.1 lib/masticate/sniffer.rb
masticate-0.4.0 lib/masticate/sniffer.rb
masticate-0.3.2 lib/masticate/sniffer.rb
masticate-0.3.1 lib/masticate/sniffer.rb