# Copyright 2018 Twitter, Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0

require 'unf'

module Twitter
  module TwitterText
    # Validation helpers for tweet text, usernames, lists, hashtags and URLs.
    # The module extends itself, so every public method can be called as
    # Twitter::TwitterText::Validation.method_name.
    module Validation
      extend self

      DEFAULT_TCO_URL_LENGTHS = {
        :short_url_length => 23,
      }

      # Hash-based result of parse_tweet. Only the keys listed in RESULT_PARAMS are stored:
      #
      # :weighted_length the weighted length of tweet based on weights specified in the config
      # :valid If tweet is valid
      # :permillage permillage of the tweet over the max length specified in config
      # :valid_range_start beginning of valid text
      # :valid_range_end End index of valid part of the tweet text (inclusive)
      # :display_range_start beginning index of display text
      # :display_range_end end index of display text (inclusive)
      class ParseResults < Hash
        RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]

        # Returns a ParseResults with zero length, zero permillage, valid set to true
        # and all range boundaries set to 0.
        def self.empty
          return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
        end

        # Copies the recognised RESULT_PARAMS keys from +params+ into this hash;
        # any other key in +params+ is ignored.
        def initialize(params = {})
          # Explicit empty argument list: a bare `super` would forward +params+ to
          # Hash#initialize and install it as the hash's default value, so lookups
          # of missing keys would return the whole params hash instead of nil.
          super()
          RESULT_PARAMS.each do |key|
            self[key] = params[key] if params.key?(key)
          end
        end
      end

      # Parse input text and return hash with descriptive parameters populated.
      #
      # text::    the tweet text; it is NFC-normalized (String#to_nfc) before counting.
      # options:: optional hash; options[:config] supplies the weighting configuration,
      #           otherwise Twitter::TwitterText::Configuration.default_configuration is used.
      #
      # Returns a ParseResults. Every URL entity counts as config.transformed_url_length,
      # every emoji entity (when config.emoji_parsing_enabled) counts as config.default_weight,
      # and every other code point counts as the weight of the first config range that
      # contains it, falling back to config.default_weight.
      def parse_tweet(text, options = {})
        options = DEFAULT_TCO_URL_LENGTHS.merge(options)
        config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
        normalized_text = text.to_nfc

        # Empty input short-circuits to the canonical empty result, so the range
        # end indices are 0 as defined by ParseResults.empty rather than -1.
        return ParseResults.empty if normalized_text.empty?

        scale = config.scale
        max_weighted_tweet_length = config.max_weighted_tweet_length
        scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
        transformed_url_length = config.transformed_url_length * scale
        ranges = config.ranges

        url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
        emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

        has_invalid_chars = false
        weighted_count = 0
        offset = 0
        display_offset = 0
        valid_offset = 0

        while offset < normalized_text.codepoint_length
          # Reset the default char weight each pass through the loop
          char_weight = config.default_weight
          entity_length = 0

          url_entities.each do |url_entity|
            if url_entity[:indices].first == offset
              entity_length = url_entity[:indices].last - url_entity[:indices].first
              weighted_count += transformed_url_length
              offset += entity_length
              display_offset += entity_length
              if weighted_count <= scaled_max_weighted_tweet_length
                valid_offset += entity_length
              end
              # Finding a match breaks the loop
              break
            end
          end

          emoji_entities.each do |emoji_entity|
            if emoji_entity[:indices].first == offset
              entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
              weighted_count += char_weight # the default weight
              offset += entity_length
              display_offset += entity_length
              if weighted_count <= scaled_max_weighted_tweet_length
                valid_offset += entity_length
              end
              # Finding a match breaks the loop
              break
            end
          end

          # An entity was consumed at this position; re-check the loop condition
          # before looking at an individual code point.
          next if entity_length > 0

          if offset < normalized_text.codepoint_length
            code_point = normalized_text[offset]

            ranges.each do |range|
              if range.contains?(code_point.unpack("U").first)
                char_weight = range.weight
                break
              end
            end

            weighted_count += char_weight

            # Once an invalid character has been seen the flag stays set.
            has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
            codepoint_length = code_point.codepoint_length
            offset += codepoint_length
            display_offset += codepoint_length

            if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
              valid_offset += codepoint_length
            end
          end
        end

        # Difference in code point count between the raw and the normalized text;
        # added to the end indices reported below.
        normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
        scaled_weighted_length = weighted_count / scale
        is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
        permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

        return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
      end

      # Returns true if +text+ includes any of Twitter::TwitterText::Regex::INVALID_CHARACTERS
      # or if the check raises ArgumentError (non-Unicode value).
      # Returns false for nil or empty text.
      def contains_invalid?(text)
        return false if !text || text.empty?
        begin
          return true if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
        rescue ArgumentError
          # non-Unicode value.
          return true
        end
        return false
      end

      # Returns true if the extractor finds exactly one mentioned screen name in
      # +username+ and it equals +username+ without its first character.
      # Returns false for nil or empty input.
      def valid_username?(username)
        return false if !username || username.empty?
        extracted = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
        # Should extract the username minus the @ sign, hence the [1..-1]
        extracted.size == 1 && extracted.first == username[1..-1]
      end

      VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o

      # Returns true if +username_list+ matches VALID_LIST_RE as a whole, with an
      # empty first capture group and a non-empty fourth capture group.
      def valid_list?(username_list)
        match = username_list.match(VALID_LIST_RE)
        # Must have matched and had nothing before or after
        !!(match && match[1] == "" && match[4] && !match[4].empty?)
      end

      # Returns true if the extractor finds exactly one hashtag in +hashtag+ and it
      # equals +hashtag+ without its first character.
      # Returns false for nil or empty input.
      def valid_hashtag?(hashtag)
        return false if !hashtag || hashtag.empty?
        extracted = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
        # Should extract the hashtag minus the # sign, hence the [1..-1]
        extracted.size == 1 && extracted.first == hashtag[1..-1]
      end

      # Validates +url+ component by component.
      #
      # url::              the candidate URL; nil or empty returns false.
      # unicode_domains::  when true the authority is checked against
      #                    Regex[:validate_url_unicode_authority], otherwise against
      #                    Regex[:validate_url_authority].
      # require_protocol:: when true the scheme must be present, valid, and http or https.
      #
      # The path is always required to match; query and fragment may be absent.
      def valid_url?(url, unicode_domains=true, require_protocol=true)
        return false if !url || url.empty?

        url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
        # The match must cover the entire input string.
        return false unless (url_parts && url_parts.to_s == url)

        scheme, authority, path, query, fragment = url_parts.captures

        return false unless ((!require_protocol ||
                              (valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
                             valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path]) &&
                             valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true) &&
                             valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true))

        return (unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority])) ||
               (!unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority]))
      end

      # These methods are deprecated, will be removed in future.
      extend Deprecation

      MAX_LENGTH_LEGACY = 140

      # DEPRECATED: Please use parse_tweet instead.
      #
      # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
      # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
      # string no matter which actual form was transmitted. For example:
      #
      #     U+0065  Latin Small Letter E
      # +   U+0301  Combining Acute Accent
      # ----------
      # =   2 bytes, 2 characters, displayed as é (1 visual glyph)
      #     … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
      #
      # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
      #
      # Each extracted URL is counted as options[:short_url_length] (default 23)
      # instead of its literal length.
      def tweet_length(text, options = {})
        options = DEFAULT_TCO_URL_LENGTHS.merge(options)

        length = text.to_nfc.unpack("U*").length

        Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
          # start - end is negative: removes the literal URL length from the count.
          length += start_position - end_position
          length += options[:short_url_length] if url.length > 0
        end

        length
      end
      deprecate :tweet_length, :parse_tweet

      # DEPRECATED: Please use parse_tweet instead.
      #
      # Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
      # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
      # will allow quicker feedback.
      #
      # Returns false if this text is valid. Otherwise one of the following Symbols will be returned:
      #
      #   :too_long:: if the text is too long
      #   :empty:: if the text is nil or empty
      #   :invalid_characters:: if the text contains non-Unicode or any of the disallowed Unicode characters
      def tweet_invalid?(text)
        return :empty if !text || text.empty?
        begin
          return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
          return :invalid_characters if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
        rescue ArgumentError
          # non-Unicode value.
          return :invalid_characters
        end

        return false
      end
      deprecate :tweet_invalid?, :parse_tweet

      # Returns true when tweet_invalid? reports no problem for +text+.
      def valid_tweet_text?(text)
        !tweet_invalid?(text)
      end
      deprecate :valid_tweet_text?, :parse_tweet

      private

      # Checks that +regex+ matches +string+ in its entirety.
      #
      # When +optional+ is false, +string+ must be present and fully matched; the
      # result is truthy or falsy rather than strictly true/false.
      # When +optional+ is true, a nil/false +string+ is accepted and the result is
      # a strict boolean.
      def valid_match?(string, regex, optional=false)
        return (string && string.match(regex) && $~.to_s == string) unless optional

        !(string && (!string.match(regex) || $~.to_s != string))
      end
    end
  end
end