lib/slicing.rb in slicing-0.1.0.pre vs lib/slicing.rb in slicing-0.1.0
- old
+ new
@@ -1,64 +1,124 @@
require "slicing/version"
+require 'digest/md5'
require 'thor'
require 'csv'
module Slicing
class Base < Thor
check_unknown_options!
package_name 'slicing'
- default_task :hello
+ default_task :help
- desc :rm, ""
+ desc :sample, "create a sample output"
+ def sample path, output_path, size
+ file_csv = CSV.read(path,:headers=> true, :encoding => "ISO8859-1:utf-8")
+ sample = file_csv.sample(size)
+ CSV.open(output_path, "a+") do |csv|
+ sample.each do |value|
+ csv << value
+ end
+ end
+ end
+
+ desc :freq, "calculate item frequencies"
+ def freq path, column_name, output_path
+ file_to_count = "./#{path}.csv"
+ output = "./#{path}-counted.csv"
+ file_to_count_csv = CSV.read(file_to_count,:headers=> true, :encoding => "ISO8859-1:utf-8")
+ unique_nric_array = file_to_count_csv[column_name]
+ unique_nric = []
+ unique_nric_array.each_with_index do |value, index|
+ unique_nric.push(value) if index !=0
+ end
+
+ final_hash = score(unique_nric)
+ CSV.open(output, "a+") do |csv|
+ final_hash.each do |value|
+ csv << [value[0], value[1]]
+ end
+ end
+ end
+
+
+ desc :mask, "mask a particular column"
+ def mask path, column_name, output_path
+ original = CSV.read(path, { headers: true, return_headers: true, :encoding => "ISO8859-1:utf-8"})
+ CSV.open(output_path, 'a+') do |csv|
+ original.each do |row|
+ csv << array
+ end
+ end
+ end
+
+ desc :rm, "remove a column"
+ method_option :utf, type: :string, aliases: '-u', default: "ISO8859-1:utf-8"
+ method_option :headers, type: :boolean, aliases: '-h', default: true
+ method_option :rowsep, type: :string, aliases: '-r', default: nil
def rm path, column_name, output
- data = CSV.read(path, :headers=> false, :encoding => "ISO8859-1:utf-8") #2014
- data.delete(column_name)
- CSV.open(output,"a+") do |csv|
- data.each_with_index do |row,index|
+ # headers, rowsep, utf = process_options(options[:headers], options[:rowsep], options[:utf])
+ if options[:rowsep] != nil
+ original = CSV.read(path, { headers: options[:headers], return_headers: options[:headers], :row_sep=> options[:rowsep], :encoding => options[:utf]})
+ else
+ original = CSV.read(path, { headers: options[:headers], return_headers: options[:headers], :encoding => options[:utf]})
+ end
+ original.delete(column_name)
+ CSV.open(output, 'a+') do |csv|
+ original.each do |row|
csv << row
end
end
end
-
- desc :first, ""
+ desc :first, "display the first numbers of line"
+ method_option :line, type: :numeric, aliases: '-l', default: 100
def first csv_file #, value=100
- stop = 100
+ stop = options[:line]
counter = 0
CSV.foreach(csv_file, :headers => false, encoding: "ISO8859-1:utf-8") do |row|
exit if counter == stop
begin
counter = counter + 1
puts row
rescue
end
-
end
end
- desc :head, ""
+ desc :head, "show the headers"
def head csv_file
CSV.foreach(csv_file, :headers => false, encoding: "ISO8859-1:utf-8") do |row|
puts row
puts "----"
puts "#{row.count} columns"
exit
end
end
+ desc :unique, "calculate number of unique values in column"
+ def unique path, column_name
+ data = CSV.read(path, :headers => true, return_headers: true, encoding: "ISO8859-1:utf-8")
+ array = data[column_name]
+ puts array.uniq.count if array != nil
+ end
- desc :count, ""
+
+ desc :count, "count the number of rows and columns"
def count csv_file
- data = CSV.read(csv_file)
- puts "#{data.count} rows"
+ data = CSV.read(csv_file, :headers => false, encoding: "ISO8859-1:utf-8")
+ puts "#{data.count} rows #{data[0].count} columns"
+ puts "---"
+ puts "#{data[0]}"
end
- desc :subset, ""
- def subset csv_file, output, value=10
+ desc :subset, "create a subset of the data"
+ method_option :line, type: :numeric, aliases: '-l', default: 1000
+ def subset(csv_file, output)
path = csv_file
output_directory = output #"/Users/ytbryan/Desktop/output/subset-2015.csv" #output directory
- stop = value
+ # options[:num] == nil ? (stop = 10) : (stop = options[:num])
+ stop = options[:line]
counter = 0
CSV.foreach(path, :headers => false, encoding: "ISO8859-1:utf-8") do |row|
exit if counter == stop
begin
counter = counter + 1
@@ -67,8 +127,48 @@
end
rescue
end
end
end
+
+ # desc :subsetagain, ""
+ # def subsetagain csv_file, output, value=10
+ # path = csv_file
+ # output_directory = output #"/Users/ytbryan/Desktop/output/subset-2015.csv" #output directory
+ # stop = value
+ # counter = 0
+ # CSV.foreach(path, :headers => false, :row_sep => "\r\n", encoding: "ISO8859-1:utf-8") do |row|
+ # exit if counter == stop
+ # begin
+ # counter = counter + 1
+ # CSV.open(output_directory, "a+") do |csv|
+ # csv << row
+ # end
+ # rescue
+ # end
+ # end
+ # end
+
+ private
+
+ def process_options headers, rowsep, utf
+ if headers == nil
+ headers = true
+ else
+ headers = headers
+ end
+ return true, "\r\n" , "ISO8859-1:utf-8"
+ end
+
+ def masking(value)
+ value != nil ? answer = Digest::MD5.hexdigest(value) : answer
+ end
+
+ def score( array )
+ hash = Hash.new(0)
+ array.each{|key| hash[key] += 1}
+ hash
+ end
+
end
end