lib/data_cleansing/cleaners.rb in data_cleansing-1.0.2 vs lib/data_cleansing/cleaners.rb in data_cleansing-1.0.3

- old
+ new

@@ -1,6 +1,6 @@ -require 'cgi' +require "cgi" module Cleaners # Strip leading and trailing whitespace module Strip def self.call(string) return string unless string.is_a?(String) @@ -35,11 +35,11 @@ NOT_WORDS = Regexp.compile(/\W/) def self.call(string) return string unless string.is_a?(String) - string.gsub!(NOT_WORDS, '') || string + string.gsub!(NOT_WORDS, "") || string end end DataCleansing.register_cleaner(:remove_non_word, RemoveNonWord) # Remove all not printable characters @@ -47,10 +47,14 @@ NOT_PRINTABLE = Regexp.compile(/[^[:print:]]/) def self.call(string) return string unless string.is_a?(String) + # Strip invalid characters, since they are non printable + unless string.valid_encoding? + string = string.encode(string.encoding, invalid: :replace, undef: :replace, replace: "") + end string.gsub!(NOT_PRINTABLE, '') || string end end DataCleansing.register_cleaner(:remove_non_printable, RemoveNonPrintable) @@ -61,22 +65,22 @@ def self.call(string) return string unless string.is_a?(String) string.gsub!(HTML_MARKUP) do |match| case match.downcase - when '&amp;' then - '&' - when '&quot;' then + when "&amp;" + "&" + when "&quot;" '"' - when '&gt;' then - '>' - when '&lt;' then - '<' - when '&apos;' then + when "&gt;" + ">" + when "&lt;" + "<" + when "&apos;" "'" - when '&nbsp;' then - ' ' + when "&nbsp;" + " " else "&#{match};" end end || string end @@ -106,11 +110,11 @@ WHITESPACE = Regexp.compile(/\s+/) def self.call(string) return string unless string.is_a?(String) - string.gsub!(WHITESPACE, ' ') || string + string.gsub!(WHITESPACE, " ") || string end end DataCleansing.register_cleaner(:compress_whitespace, CompressWhitespace) # Remove Non-Digit Chars @@ -119,40 +123,40 @@ DIGITS = Regexp.compile(/\D/) def self.call(string) return string unless string.is_a?(String) - string.gsub!(DIGITS, '') + string.gsub!(DIGITS, "") string.length > 0 ? string : nil end end DataCleansing.register_cleaner(:digits_only, DigitsOnly) # Returns [Integer] after removing all non-digit characters, except '.' # Returns nil if no digits are present in the string. module StringToInteger - NUMERIC = Regexp.compile(/[^0-9\.]/) + NUMERIC = Regexp.compile(/[^0-9.]/) def self.call(string) return string unless string.is_a?(String) # Remove Non-Digit Chars, except for '.' - string.gsub!(NUMERIC, '') + string.gsub!(NUMERIC, "") string.length > 0 ? string.to_i : nil end end DataCleansing.register_cleaner(:string_to_integer, StringToInteger) # Returns [Integer] after removing all non-digit characters, except '.' # Returns nil if no digits are present in the string. module StringToFloat - NUMERIC = Regexp.compile(/[^0-9\.]/) + NUMERIC = Regexp.compile(/[^0-9.]/) def self.call(string) return string unless string.is_a?(String) # Remove Non-Digit Chars, except for '.' - string.gsub!(NUMERIC, '') + string.gsub!(NUMERIC, "") string.length > 0 ? string.to_f : nil end end DataCleansing.register_cleaner(:string_to_float, StringToFloat)