module Twitter
module Validation
MAX_LENGTH = 140
# Character not allowed in Tweets
INVALID_CHARACTERS = [
0xFFFE, 0xFEFF, # BOM
0xFFFF, # Special
0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
].map{|cp| [cp].pack('U') }.freeze
# Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
# (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
# string no matter which actual form was transmitted. For example:
#
# U+0065 Latin Small Letter E
# + U+0301 Combining Acute Accent
# ----------
# = 2 bytes, 2 characters, displayed as é (1 visual glyph)
# … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
#
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
#
def tweet_length(text)
ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
end
# Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
# before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
# will allow quicker feedback.
#
# Returns false if this text is valid. Otherwise one of the following Symbols will be returned:
#
# :too_long:: if the text is too long
# :empty:: if the text is nil or empty
# :invalid_characters:: if the text contains non-Unicode or any of the disallowed Unicode characters
def tweet_invalid?(text)
return :empty if text.blank?
begin
return :too_long if tweet_length(text) > MAX_LENGTH
return :invalid_characters if INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
rescue ArgumentError, ActiveSupport::Multibyte::EncodingError => e
# non-Unicode value.
return :invalid_characters
end
return false
end
end
end