Sha256: 9776102ed228319590e46d1fbda58bf2f92c7898cc26362fb47ada0c1d62fc2a

Contents?: true

Size: 1.92 KB

Versions: 3

Compression:

Stored size: 1.92 KB

Contents

module Muzzy
  # Decides whether a delimited text file should be treated as comma-separated
  # ('csv'), tab-separated ('tsv') or neither ('unknown').
  #
  # Detection is lazy: nothing is read until one of the predicates (#tsv?,
  # #csv?, #unknown?) is called for the first time. Until then #first_row,
  # #second_row and #filetype are nil. The result is cached afterwards.
  #
  # Rows are obtained from Muzzy::Util.fetch_header_and_first_row, which is
  # defined elsewhere.
  # NOTE(review): this class assumes that helper yields -1 as the header row
  # when the file cannot be parsed with the given separator - confirm
  # against Muzzy::Util.
  class FiletypeDetector
    attr_reader :filepath
    attr_reader :first_row, :second_row
    attr_reader :filetype

    # @param filepath [String] path of the file to inspect
    def initialize(filepath)
      @filepath = filepath
    end

    # @return [Boolean] true when the file was detected as tab-separated
    def tsv?
      detect
      @filetype == 'tsv'
    end

    # @return [Boolean] true when the file was detected as comma-separated
    def csv?
      detect
      @filetype == 'csv'
    end

    # @return [Boolean] true when the file parsed with neither separator
    def unknown?
      detect
      @filetype == 'unknown'
    end

    private

    # @return [Boolean] true when the file name ends with ".tsv"
    def tsv_ext?
      File.basename(@filepath).end_with?('.tsv')
    end

    # Runs detection once and stores the outcome in @first_row, @second_row
    # and @filetype. Always returns nil so the predicates never depend on
    # this method's return value.
    def detect
      return unless @filetype.nil?

      @first_row, @second_row, @filetype =
        tsv_ext? ? detect_by_extension : detect_by_content
      nil
    end

    # A ".tsv" file name is trusted as-is; the rows are read with a tab.
    #
    # @return [Array] [header_row, first_row, 'tsv']
    def detect_by_extension
      header_row, first_row = Muzzy::Util.fetch_header_and_first_row(@filepath, "\t")
      [header_row, first_row, 'tsv']
    end

    # Parses the file with both separators and picks the better candidate.
    #
    # @return [Array] [header_row, first_row, filetype]
    def detect_by_content
      csv_header_row, csv_first_row = Muzzy::Util.fetch_header_and_first_row(@filepath, ',')
      tsv_header_row, tsv_first_row = Muzzy::Util.fetch_header_and_first_row(@filepath, "\t")

      as_csv = [csv_header_row, csv_first_row, 'csv']
      as_tsv = [tsv_header_row, tsv_first_row, 'tsv']

      return [-1, -1, 'unknown'] if csv_header_row == -1 && tsv_header_row == -1
      return as_tsv if csv_header_row == -1
      return as_csv if tsv_header_row == -1

      # Rare case: the file parses with both separators.

      # A single-column file looks identical either way; treat it as csv.
      # This check must come before the width comparison, otherwise the tie
      # falls through to tsv.
      return as_csv if csv_header_row.length == 1 && tsv_header_row.length == 1

      # The separator that splits the header into more columns wins; a tie
      # on a multi-column header resolves to tsv.
      csv_header_row.length > tsv_header_row.length ? as_csv : as_tsv
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
muzzy-0.1.14 lib/muzzy/filetype_detector.rb
muzzy-0.1.13 lib/muzzy/filetype_detector.rb
muzzy-0.1.12 lib/muzzy/filetype_detector.rb