lib/twitter-text/validation.rb in twitter-text-1.14.7 vs lib/twitter-text/validation.rb in twitter-text-2.0.0

- old
+ new

@@ -1,68 +1,117 @@ require 'unf' module Twitter module Validation extend self - MAX_LENGTH = 140 - DEFAULT_TCO_URL_LENGTHS = { :short_url_length => 23, - :short_url_length_https => 23, - :characters_reserved_per_media => 23 - }.freeze + } - # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC - # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a - # string no matter which actual form was transmitted. For example: - # - # U+0065 Latin Small Letter E - # + U+0301 Combining Acute Accent - # ---------- - # = 2 bytes, 2 characters, displayed as é (1 visual glyph) - # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1 - # - # The string could also contain U+00E9 already, in which case the canonicalization will not change the value. - # - def tweet_length(text, options = {}) + # :weighted_length the weighted length of tweet based on weights specified in the config + # :valid If tweet is valid + # :permillage permillage of the tweet over the max length specified in config + # :valid_range_start beginning of valid text + # :valid_range_end End index of valid part of the tweet text (inclusive) + # :display_range_start beginning index of display text + # :display_range_end end index of display text (inclusive) + class ParseResults < Hash + + RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end] + + def self.empty + return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0) + end + + def initialize(params = {}) + RESULT_PARAMS.each do |key| + super[key] = params[key] if params.key?(key) + end + end + end + + # Parse input text and return hash with descriptive parameters populated. 
+ def parse_tweet(text, options = {}) options = DEFAULT_TCO_URL_LENGTHS.merge(options) + config = options[:config] || Twitter::Configuration.default_configuration + normalized_text = text.to_nfc + normalized_text_length = normalized_text.char_length + unless (normalized_text_length > 0) + ParseResults.empty() + end - length = text.to_nfc.unpack("U*").length + scale = config.scale + max_weighted_tweet_length = config.max_weighted_tweet_length + scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale + transformed_url_length = config.transformed_url_length * scale + ranges = config.ranges - Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position| - length += start_position - end_position - length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length] + url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text) + + has_invalid_chars = false + weighted_count = 0 + offset = 0 + display_offset = 0 + valid_offset = 0 + + while offset < normalized_text_length + # Reset the default char weight each pass through the loop + char_weight = config.default_weight + url_entities.each do |url_entity| + if url_entity[:indices].first == offset + url_length = url_entity[:indices].last - url_entity[:indices].first + weighted_count += transformed_url_length + offset += url_length + display_offset += url_length + if weighted_count <= scaled_max_weighted_tweet_length + valid_offset += url_length + end + # Finding a match breaks the loop; order of ranges matters. 
+ break + end + end + + if offset < normalized_text_length + code_point = normalized_text[offset] + + ranges.each do |range| + if range.contains?(code_point.unpack("U").first) + char_weight = range.weight + break + end + end + + weighted_count += char_weight + + has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars + char_count = code_point.char_length + offset += char_count + display_offset += char_count + + if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length) + valid_offset += char_count + end + end end + normalized_text_offset = text.char_length - normalized_text.char_length + scaled_weighted_length = weighted_count / scale + is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) + permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length - length + return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1)) end - # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation - # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation - # will allow quicker feedback. - # - # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned: - # - # <tt>:too_long</tt>:: if the <tt>text</tt> is too long - # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty - # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters - def tweet_invalid?(text) - return :empty if !text || text.empty? + def contains_invalid?(text) + return false if !text || text.empty? 
begin - return :too_long if tweet_length(text) > MAX_LENGTH - return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } + return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } rescue ArgumentError # non-Unicode value. - return :invalid_characters + return true end - return false end - def valid_tweet_text?(text) - !tweet_invalid?(text) - end - def valid_username?(username) return false if !username || username.empty? extracted = Twitter::Extractor.extract_mentioned_screen_names(username) # Should extract the username minus the @ sign, hence the [1..-1] @@ -99,9 +148,72 @@ valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)) return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) || (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority])) end + + # These methods are deprecated and will be removed in a future release. + extend Deprecation + + MAX_LENGTH_LEGACY = 140 + + # DEPRECATED: Please use parse_tweet instead. + # + # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC + # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a + # string no matter which actual form was transmitted. For example: + # + # U+0065 Latin Small Letter E + # + U+0301 Combining Acute Accent + # ---------- + # = 2 bytes, 2 characters, displayed as é (1 visual glyph) + # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1 + # + # The string could also contain U+00E9 already, in which case the canonicalization will not change the value. 
+ # + def tweet_length(text, options = {}) + options = DEFAULT_TCO_URL_LENGTHS.merge(options) + + length = text.to_nfc.unpack("U*").length + + Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position| + length += start_position - end_position + length += options[:short_url_length] if url.length > 0 + end + + length + end + deprecate :tweet_length, :parse_tweet + + # DEPRECATED: Please use parse_tweet instead. + # + # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation + # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation + # will allow quicker feedback. + # + # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned: + # + # <tt>:too_long</tt>:: if the <tt>text</tt> is too long + # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty + # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters + def tweet_invalid?(text) + return :empty if !text || text.empty? + begin + return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY + return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } + rescue ArgumentError + # non-Unicode value. + return :invalid_characters + end + + return false + end + deprecate :tweet_invalid?, :parse_tweet + + def valid_tweet_text?(text) + !tweet_invalid?(text) + end + deprecate :valid_tweet_text?, :parse_tweet private def valid_match?(string, regex, optional=false) return (string && string.match(regex) && $~.to_s == string) unless optional