lib/twitter-text/validation.rb in twitter-text-1.14.7 vs lib/twitter-text/validation.rb in twitter-text-2.0.0
- old
+ new
@@ -1,68 +1,117 @@
require 'unf'
module Twitter
module Validation extend self
- MAX_LENGTH = 140
-
DEFAULT_TCO_URL_LENGTHS = {
:short_url_length => 23,
- :short_url_length_https => 23,
- :characters_reserved_per_media => 23
- }.freeze
+ }
- # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
- # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
- # string no matter which actual form was transmitted. For example:
- #
- # U+0065 Latin Small Letter E
- # + U+0301 Combining Acute Accent
- # ----------
- # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
- # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
- #
- # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
- #
- def tweet_length(text, options = {})
+ # :weighted_length the weighted length of tweet based on weights specified in the config
+ # :valid If tweet is valid
+ # :permillage permillage of the tweet over the max length specified in config
+ # :valid_range_start beginning of valid text
+ # :valid_range_end End index of valid part of the tweet text (inclusive)
+ # :display_range_start beginning index of display text
+ # :display_range_end end index of display text (inclusive)
+ class ParseResults < Hash
+
+ RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]
+
+ def self.empty
+ return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
+ end
+
+ def initialize(params = {})
+ RESULT_PARAMS.each do |key|
+ super[key] = params[key] if params.key?(key)
+ end
+ end
+ end
+
+ # Parse input text and return hash with descriptive parameters populated.
+ def parse_tweet(text, options = {})
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+ config = options[:config] || Twitter::Configuration.default_configuration
+ normalized_text = text.to_nfc
+ normalized_text_length = normalized_text.char_length
+ unless (normalized_text_length > 0)
+ ParseResults.empty()
+ end
- length = text.to_nfc.unpack("U*").length
+ scale = config.scale
+ max_weighted_tweet_length = config.max_weighted_tweet_length
+ scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
+ transformed_url_length = config.transformed_url_length * scale
+ ranges = config.ranges
- Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
- length += start_position - end_position
- length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
+ url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text)
+
+ has_invalid_chars = false
+ weighted_count = 0
+ offset = 0
+ display_offset = 0
+ valid_offset = 0
+
+ while offset < normalized_text_length
+ # Reset the default char weight each pass through the loop
+ char_weight = config.default_weight
+ url_entities.each do |url_entity|
+ if url_entity[:indices].first == offset
+ url_length = url_entity[:indices].last - url_entity[:indices].first
+ weighted_count += transformed_url_length
+ offset += url_length
+ display_offset += url_length
+ if weighted_count <= scaled_max_weighted_tweet_length
+ valid_offset += url_length
+ end
+ # Finding a match breaks the loop; order of ranges matters.
+ break
+ end
+ end
+
+ if offset < normalized_text_length
+ code_point = normalized_text[offset]
+
+ ranges.each do |range|
+ if range.contains?(code_point.unpack("U").first)
+ char_weight = range.weight
+ break
+ end
+ end
+
+ weighted_count += char_weight
+
+ has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
+ char_count = code_point.char_length
+ offset += char_count
+ display_offset += char_count
+
+ if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
+ valid_offset += char_count
+ end
+ end
end
+ normalized_text_offset = text.char_length - normalized_text.char_length
+ scaled_weighted_length = weighted_count / scale
+ is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
+ permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
- length
+ return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
end
- # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
- # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
- # will allow quicker feedback.
- #
- # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
- #
- # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
- # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
- # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
- def tweet_invalid?(text)
- return :empty if !text || text.empty?
+ # Reports whether +text+ contains a disallowed character.
+ #
+ # Returns <tt>false</tt> when +text+ is nil/false or empty, <tt>true</tt> when any entry of
+ # Twitter::Regex::INVALID_CHARACTERS occurs in +text+ or when the check raises ArgumentError,
+ # and <tt>false</tt> otherwise.
+ def contains_invalid?(text)
+ return false if !text || text.empty?
begin
- return :too_long if tweet_length(text) > MAX_LENGTH
- return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
+ return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
rescue ArgumentError
# non-Unicode value.
- return :invalid_characters
+ return true
end
-
return false
end
- def valid_tweet_text?(text)
- !tweet_invalid?(text)
- end
-
def valid_username?(username)
return false if !username || username.empty?
extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
# Should extract the username minus the @ sign, hence the [1..-1]
@@ -99,9 +148,72 @@
valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
(!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
end
+
+ # The methods below are deprecated and will be removed in a future release.
+ extend Deprecation
+
+ MAX_LENGTH_LEGACY = 140
+
+ # DEPRECATED: Please use parse_text instead.
+ #
+ # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
+ # string no matter which actual form was transmitted. For example:
+ #
+ # U+0065 Latin Small Letter E
+ # + U+0301 Combining Acute Accent
+ # ----------
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
+ #
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
+ #
+ def tweet_length(text, options = {})
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+
+ length = text.to_nfc.unpack("U*").length
+
+ Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
+ length += start_position - end_position
+ length += options[:short_url_length] if url.length > 0
+ end
+
+ length
+ end
+ deprecate :tweet_length, :parse_tweet
+
+ # DEPRECATED: Please use parse_text instead.
+ #
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
+ # will allow quicker feedback.
+ #
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
+ #
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
+ def tweet_invalid?(text)
+ return :empty if !text || text.empty?
+ begin
+ return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
+ return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
+ rescue ArgumentError
+ # non-Unicode value.
+ return :invalid_characters
+ end
+
+ return false
+ end
+ deprecate :tweet_invalid?, :parse_tweet
+
+ def valid_tweet_text?(text)
+ !tweet_invalid?(text)
+ end
+ deprecate :valid_tweet_text?, :parse_tweet
private
def valid_match?(string, regex, optional=false)
return (string && string.match(regex) && $~.to_s == string) unless optional