Sha256: 2d0083811e0339e877e9bd704f170c98e2cf07b636fd45ce5091de63ec833fbc

Contents?: true

Size: 1.47 KB

Versions: 1

Compression:

Stored size: 1.47 KB

Contents

# frozen_string_literal: true

module Dphil
  #
  # Base module for file converters (CSV, NEXUS, CollateX, etc.)
  #
  module Converter
    private

    # Read the whole of +infile+ and return its contents.
    #
    # @param infile [String] path of the file to read
    # @return [String] the file contents, as returned by File.read
    # @raise [IOError] when nothing exists at +infile+
    def load_file(infile)
      return File.read(infile) if File.exist?(infile)

      raise IOError, "File #{infile} not found."
    end

    # Load a CSV file and return its parsed rows.
    #
    # The file is opened with CSV.open and read inside the block instead of
    # calling CSV.read(infile, mode): current releases of the csv library only
    # accept keyword options after the path in CSV.read, so handing it the
    # mode positionally raises ArgumentError. CSV.open takes the mode as a
    # positional argument in both older and current releases.
    #
    # @param infile [String] path of the CSV file
    # @param mode [String] file open mode passed to CSV.open (default "r")
    # @return [Array<Array>] one array of fields per row
    # @raise [IOError] when nothing exists at +infile+
    def load_csv(infile, mode = "r")
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      CSV.open(infile, mode) { |csv| csv.read }
    end

    # Count how often each distinct entry occurs in +array+ and return the
    # counts as a Hash ordered from most to least frequent. Entries with equal
    # counts keep the order in which they first appeared.
    #
    # @param array [Enumerable] entries to count
    # @return [Hash] entry => number of occurrences
    def weighted_uniq(array)
      counts = Hash.new(0)
      array.each { |entry| counts[entry] += 1 }

      # sort_by gives no stability guarantee, so the position within the
      # counts hash (order of first appearance) is the explicit tie-breaker.
      ranked = counts.sort_by.with_index { |(_entry, count), position| [-count, position] }
      ranked.to_h
    end

    # Sanitize a character string to basic KH/ASCII.
    #
    # The value is converted with to_s, transliterated to the :kh scheme (from
    # the scheme Sanscript detects, falling back to :iast), trimmed, and then
    # has every remaining whitespace character replaced by "_" and every "'"
    # replaced by "`".
    #
    # Trimming is done before the whitespace replacement. In the opposite
    # order the surrounding whitespace has already been turned into
    # underscores, so strip has nothing left to remove and padded or blank
    # values are never reduced.
    #
    # @param str [#to_s] raw character value
    # @return [String] the sanitized value
    def sanitize_char(str)
      str = str.to_s
      src = Sanscript.detect(str) || :iast
      # Non-bang methods are used so the transliterated string is never
      # modified in place (NOTE(review): whether Sanscript.transliterate can
      # return a frozen or shared string is not visible here).
      Sanscript.transliterate(str, src, :kh).strip.gsub(/\s/, "_").tr("'", "`")
    end

    # Tokenize the values of a character.
    #
    # Every value is sanitized, empty results are dropped, and the remaining
    # distinct values are ranked by frequency with weighted_uniq. The value at
    # rank i is assigned the i-th letter of ALPHABET as its token.
    #
    # ALPHABET holds 52 letters; any value ranked beyond that gets nil as its
    # token, because indexing past the end of the array returns nil.
    #
    # @param characters [Array] raw values of one character
    # @return [Hash] sanitized value => [token, occurrence count]
    def tokenize(characters)
      sanitized = characters.map { |value| sanitize_char(value) }
      ranked = weighted_uniq(sanitized.reject(&:empty?))

      ranked.each_with_index.map do |(value, count), rank|
        [value, [ALPHABET[rank], count]]
      end.to_h
    end

    # NEX token alphabet: the 26 upper-case letters followed by the 26
    # lower-case letters, deep-frozen and private to this module.
    ALPHABET = IceNine.deep_freeze([*"A".."Z", *"a".."z"])
    private_constant :ALPHABET
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
dphil-0.1.4 lib/dphil/converter.rb