# -*- encoding : utf-8 -*-
require 'sanitize'

# Text helpers added directly to the core String class: HTML stripping,
# word/character counting (CJK aware), whitespace trimming and encoding
# conversion.
class String
  # Cleans the string with Sanitize, allowing only <a> elements and their
  # href attribute, then strips surrounding whitespace.
  #
  # Returns a new String.
  def to_plain_text_preserving_links
    ::Sanitize.clean(self, :elements => ['a'], :attributes => { 'a' => ['href'] }).to_s.strip
  end

  # Decodes HTML entities, cleans the result with Sanitize (no custom
  # configuration is passed, so presumably all markup is removed) and strips
  # surrounding whitespace.
  #
  # Returns a new String.
  def to_plain_text
    ::Sanitize.clean(HTMLEntities.new.decode(self)).to_s.strip
  end

  # Removes every space character (" "). Despite the name, tabs, newlines and
  # other whitespace characters are left untouched.
  #
  # Returns a new String.
  def remove_whitespace
    delete(' ')
  end

  # Tells whether the whole string is an optionally signed run of ASCII digits.
  #
  # \A and \z anchor the entire string; the line anchors ^ and $ would accept
  # multi-line input such as "abc\n123" because a single line matches.
  #
  # Returns true or false.
  def is_i?
    !!(self =~ /\A[-+]?[0-9]+\z/)
  end

  #
  # How many words are in this string
  # Includes duplicates
  #
  # The string is split on whitespace. A token without CJK characters counts
  # as one word; a token containing CJK characters contributes one word per
  # CJK character (its non-CJK characters are not counted).
  #
  def word_count
    split.inject(0) do |total, token|
      total + (token.contains_cjk? ? token.scan(cjk_regex).size : 1)
    end
  end

  # Number of characters left after #trim has been applied.
  def char_count
    trim.size
  end

  # Regexp matching a single Han, Katakana, Hiragana or Hangul character.
  def cjk_regex
    /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/
  end

  # Tells whether the string contains at least one character matched by
  # #cjk_regex.
  #
  # Returns true or false.
  def contains_cjk?
    !!(self =~ cjk_regex)
  end

  #
  # Removes trailing, then leading whitespace.
  #
  # Both patterns use the line anchors ^ and $, so on multi-line strings the
  # stripping is applied per line. Runs of spaces inside the text are NOT
  # collapsed.
  #
  def trim
    self.gsub(/^(.*[^\s])\s+$/, '\1').gsub(/^\s*(.*)$/, '\1')
  end

  # Converts the string, whose bytes are interpreted as UTF-8, to UTF-16LE.
  #
  # Iconv is used when it is loaded, so environments that already rely on it
  # keep the exact same behaviour (including Iconv's exception classes).
  # Iconv is no longer part of the standard library since Ruby 2.0, so when
  # the constant is missing String#encode performs the conversion instead and
  # invalid input raises Encoding::InvalidByteSequenceError.
  #
  # Returns a new String.
  def to_utf16le
    return Iconv.conv('utf-16le', 'UTF-8', self) if defined?(::Iconv)

    encode('UTF-16LE', 'UTF-8')
  end

  # Builds a "%word%word%" pattern from the trimmed string: every run of
  # whitespace becomes a single "%" and the whole value is wrapped in "%".
  #
  # NOTE(review): "%" and "_" already present in the string are not escaped;
  # confirm callers do not pass untrusted wildcards.
  def to_active_record_condition
    "%#{self.trim.gsub(/[[:space:]]+/, '%')}%"
  end

  # Tells whether the string contains no non-whitespace character.
  #
  # need to force encoding for ruby 1.9 otherwise regex fails when comparing string of 2 different encodings
  # TODO : String.blank? - do we need to do force encoding? is UTF-8 a good default?
  #
  # When the match raises an ArgumentError about an invalid byte sequence or
  # incompatible encodings, the answer falls back to #empty?. Any other
  # ArgumentError is re-raised.
  def blank?
    # Strings respond to force_encoding on Ruby 1.9+ only; probing for the
    # method avoids parsing RUBY_VERSION on every call.
    if respond_to?(:force_encoding)
      dup.force_encoding("UTF-8") !~ /\S/
    else
      self !~ /\S/
    end
  rescue ArgumentError => e
    raise unless e.message =~ /^(invalid\ byte\ sequence|incompatible\ character\ encodings)/

    empty?
  end

  # converts the encoding to UTF-8 regardless of current encoding
  #
  # The current encoding tag is ignored; only the bytes are inspected:
  #   1. a string already tagged UTF-8 and valid is returned as a copy;
  #   2. bytes that form valid UTF-8 are returned re-tagged as UTF-8;
  #   3. anything else is read as ISO-8859-1 and transcoded to UTF-8.
  #
  # ISO-8859-1 accepts every byte sequence, so step 3 always succeeds and no
  # further candidate encoding could ever be reached.
  #
  # Returns a new String; the receiver is not modified.
  def to_utf8
    text = dup
    return text if text.encoding.name == "UTF-8" && text.valid_encoding?
    return text if text.force_encoding("UTF-8").valid_encoding?

    text.force_encoding("ISO-8859-1").encode("UTF-8")
  end
end