lib/data_cleansing/cleaners.rb in data_cleansing-1.0.2 vs lib/data_cleansing/cleaners.rb in data_cleansing-1.0.3
- old
+ new
@@ -1,6 +1,6 @@
-require 'cgi'
+require "cgi"
module Cleaners
# Strip leading and trailing whitespace
module Strip
def self.call(string)
return string unless string.is_a?(String)
@@ -35,11 +35,11 @@
NOT_WORDS = Regexp.compile(/\W/)
# Deletes every character matched by NOT_WORDS (/\W/) from +string+,
# modifying the supplied String in place.
# Anything that is not a String is handed back unchanged.
def self.call(string)
return string unless string.is_a?(String)
# gsub! returns nil when no substitution was made, hence the `|| string` fallback.
- string.gsub!(NOT_WORDS, '') || string
+ string.gsub!(NOT_WORDS, "") || string
end
end
DataCleansing.register_cleaner(:remove_non_word, RemoveNonWord)
# Remove all non-printable characters
@@ -47,10 +47,14 @@
NOT_PRINTABLE = Regexp.compile(/[^[:print:]]/)
# Deletes every character matched by NOT_PRINTABLE (/[^[:print:]]/) from +string+.
# Anything that is not a String is handed back unchanged.
# When the string's encoding is not valid, String#encode is first asked to
# replace invalid and undefined sequences with ""; encode returns a new String,
# so in that case the cleaned copy is returned rather than the caller's object.
def self.call(string)
return string unless string.is_a?(String)
+ # Strip invalid characters, since they are non printable
+ unless string.valid_encoding?
+ string = string.encode(string.encoding, invalid: :replace, undef: :replace, replace: "")
+ end
# gsub! returns nil when no substitution was made, hence the `|| string` fallback.
string.gsub!(NOT_PRINTABLE, '') || string
end
end
DataCleansing.register_cleaner(:remove_non_printable, RemoveNonPrintable)
@@ -61,22 +65,22 @@
def self.call(string)
return string unless string.is_a?(String)
string.gsub!(HTML_MARKUP) do |match|
case match.downcase
- when '&amp;' then
- '&'
- when '&quot;' then
+ when "&amp;"
+ "&"
+ when "&quot;"
'"'
- when '&gt;' then
- '>'
- when '&lt;' then
- '<'
- when '&apos;' then
+ when "&gt;"
+ ">"
+ when "&lt;"
+ "<"
+ when "&apos;"
"'"
- when '&nbsp;' then
- ' '
+ when "&nbsp;"
+ " "
else
"&#{match};"
end
end || string
end
@@ -106,11 +110,11 @@
WHITESPACE = Regexp.compile(/\s+/)
# Replaces each run of whitespace matched by WHITESPACE (/\s+/) with a single
# space, modifying the supplied String in place.
# Anything that is not a String is handed back unchanged.
def self.call(string)
return string unless string.is_a?(String)
# gsub! returns nil when no substitution was made, hence the `|| string` fallback.
- string.gsub!(WHITESPACE, ' ') || string
+ string.gsub!(WHITESPACE, " ") || string
end
end
DataCleansing.register_cleaner(:compress_whitespace, CompressWhitespace)
# Remove Non-Digit Chars
@@ -119,40 +123,40 @@
DIGITS = Regexp.compile(/\D/)
# Deletes every non-digit character from +string+, modifying the supplied
# String in place. Despite its name, DIGITS is the pattern /\D/, i.e. it
# matches the characters to remove, not the ones to keep.
# Returns the remaining string, or nil when nothing is left.
# Anything that is not a String is handed back unchanged.
def self.call(string)
return string unless string.is_a?(String)
- string.gsub!(DIGITS, '')
+ string.gsub!(DIGITS, "")
# An empty result means the input held no digits at all.
string.length > 0 ? string : nil
end
end
DataCleansing.register_cleaner(:digits_only, DigitsOnly)
# Returns [Integer] after removing all non-digit characters, except '.'
# Returns nil if nothing is left after the removal, i.e. the string
# contains neither digits nor '.'.
# Anything that is not a String is handed back unchanged.
module StringToInteger
# Matches every character that is neither a digit nor '.'
- NUMERIC = Regexp.compile(/[^0-9\.]/)
+ NUMERIC = Regexp.compile(/[^0-9.]/)
def self.call(string)
return string unless string.is_a?(String)
# Remove Non-Digit Chars, except for '.'
- string.gsub!(NUMERIC, '')
+ string.gsub!(NUMERIC, "")
# to_i only interprets the leading digits, so everything from the first '.' onwards is ignored.
string.length > 0 ? string.to_i : nil
end
end
DataCleansing.register_cleaner(:string_to_integer, StringToInteger)
# Returns [Float] after removing all non-digit characters, except '.'
# Returns nil if nothing is left after the removal, i.e. the string
# contains neither digits nor '.'.
# Anything that is not a String is handed back unchanged.
module StringToFloat
# Matches every character that is neither a digit nor '.'
- NUMERIC = Regexp.compile(/[^0-9\.]/)
+ NUMERIC = Regexp.compile(/[^0-9.]/)
def self.call(string)
return string unless string.is_a?(String)
# Remove Non-Digit Chars, except for '.'
- string.gsub!(NUMERIC, '')
+ string.gsub!(NUMERIC, "")
# to_f interprets the leading characters as a floating point number.
string.length > 0 ? string.to_f : nil
end
end
DataCleansing.register_cleaner(:string_to_float, StringToFloat)