lib/data_cleansing/cleaners.rb in data_cleansing-0.9.0 vs lib/data_cleansing/cleaners.rb in data_cleansing-1.0.0
- old
+ new
@@ -1,6 +1,6 @@
-require 'uri'
+require 'cgi'
module Cleaners
# Strip leading and trailing whitespace
module Strip
def self.call(string)
return string unless string.is_a?(String)
@@ -18,10 +18,20 @@
string.upcase! || string
end
end
DataCleansing.register_cleaner(:upcase, Upcase)
+ # Convert to lowercase
+ module Downcase
+ def self.call(string)
+ return string unless string.is_a?(String)
+
+ string.downcase! || string
+ end
+ end
+ DataCleansing.register_cleaner(:downcase, Downcase)
+
# Remove all non-word characters, including whitespace
module RemoveNonWord
NOT_WORDS = Regexp.compile(/\W/)
def self.call(string)
@@ -42,11 +52,11 @@
string.gsub!(NOT_PRINTABLE, '') || string
end
end
DataCleansing.register_cleaner(:remove_non_printable, RemoveNonPrintable)
- # Remove HTML Markup
+ # Unescape HTML markup (case-insensitive)
module ReplaceHTMLMarkup
HTML_MARKUP = Regexp.compile(/&(amp|quot|gt|lt|apos|nbsp);/in)
def self.call(string)
return string unless string.is_a?(String)
@@ -75,19 +85,19 @@
module UnescapeURI
def self.call(string)
return string unless string.is_a?(String)
- URI.unescape(string)
+ CGI.unescape(string)
end
end
DataCleansing.register_cleaner(:unescape_uri, UnescapeURI)
module EscapeURI
def self.call(string)
return string unless string.is_a?(String)
- URI.escape(string)
+ CGI.escape(string)
end
end
DataCleansing.register_cleaner(:escape_uri, EscapeURI)
# Compress multiple whitespace to a single space