validation.rb in twitter-text-2.0.0

- old
+ new

@@ -1,68 +1,117 @@
 require 'unf'
 
 module Twitter
   module Validation extend self
-    MAX_LENGTH = 140
-
+    # Default option values that parse_tweet and tweet_length merge beneath the
+    # caller's options (caller-supplied keys win). tweet_length counts each
+    # extracted URL as :short_url_length characters.
     DEFAULT_TCO_URL_LENGTHS = {
       :short_url_length => 23,
-      :short_url_length_https => 23,
-      :characters_reserved_per_media => 23
-    }.freeze
+    }
 
-    # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
-    # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
-    # string no matter which actual form was transmitted. For example:
-    #
-    #     U+0065  Latin Small Letter E
-    # +   U+0301  Combining Acute Accent
-    # ----------
-    # =   2 bytes, 2 characters, displayed as é (1 visual glyph)
-    #     … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
-    #
-    # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
-    #
-    def tweet_length(text, options = {})
+    # Hash describing the outcome of parse_tweet. Only the keys listed in
+    # RESULT_PARAMS are stored; any other key given to the constructor is ignored.
+    #
+    # :weighted_length the weighted length of tweet based on weights specified in the config
+    # :valid If tweet is valid
+    # :permillage permillage of the tweet over the max length specified in config
+    # :valid_range_start beginning of valid text
+    # :valid_range_end End index of valid part of the tweet text (inclusive)
+    # :display_range_start beginning index of display text
+    # :display_range_end end index of display text (inclusive)
+    class ParseResults < Hash
+
+      RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end].freeze
+
+      # Returns a ParseResults with every length and index set to 0 and
+      # :valid set to true.
+      def self.empty
+        return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
+      end
+
+      # params - Hash of result values; only keys named in RESULT_PARAMS are copied.
+      def initialize(params = {})
+        # Initialize the underlying Hash exactly once and without arguments. A bare
+        # `super` would forward +params+ to Hash#initialize and install it as the
+        # default value, so reading an absent key would return the params hash
+        # instead of nil.
+        super()
+        RESULT_PARAMS.each do |key|
+          self[key] = params[key] if params.key?(key)
+        end
+      end
+    end
+
+    # Parse input text and return hash with descriptive parameters populated.
+    #
+    # text    - String to measure; it is converted with +to_nfc+ before counting.
+    # options - Hash merged over DEFAULT_TCO_URL_LENGTHS. Only :config is read
+    #           here: the configuration object supplying scale,
+    #           max_weighted_tweet_length, transformed_url_length, default_weight
+    #           and ranges. Falls back to
+    #           Twitter::Configuration.default_configuration.
+    #
+    # A URL entity (from Twitter::Extractor.extract_urls_with_indices) that starts
+    # at the current offset adds transformed_url_length * scale to the weighted
+    # count and is skipped as a whole. The character then at the offset adds the
+    # weight of the first configured range containing its code point, or
+    # default_weight when no range matches.
+    #
+    # Returns a ParseResults; ParseResults.empty when the normalized text has no
+    # characters.
+    def parse_tweet(text, options = {})
       options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+      config = options[:config] || Twitter::Configuration.default_configuration
+      normalized_text = text.to_nfc
+      normalized_text_length = normalized_text.char_length
+      # Nothing to weigh: hand back the empty result instead of falling through
+      # (which would report end indices of -1).
+      unless (normalized_text_length > 0)
+        return ParseResults.empty()
+      end
 
-      length = text.to_nfc.unpack("U*").length
+      scale = config.scale
+      max_weighted_tweet_length = config.max_weighted_tweet_length
+      scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
+      transformed_url_length = config.transformed_url_length * scale
+      ranges = config.ranges
 
-      Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
-        length += start_position - end_position
-        length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
+      url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text)
+
+      has_invalid_chars = false
+      weighted_count = 0
+      offset = 0
+      display_offset = 0
+      valid_offset = 0
+
+      while offset < normalized_text_length
+        # Reset the default char weight each pass through the loop
+        char_weight = config.default_weight
+        url_entities.each do |url_entity|
+          if url_entity[:indices].first == offset
+            url_length = url_entity[:indices].last - url_entity[:indices].first
+            weighted_count += transformed_url_length
+            offset += url_length
+            display_offset += url_length
+            if weighted_count <= scaled_max_weighted_tweet_length
+              valid_offset += url_length
+            end
+            # A URL starts at this offset; stop scanning the remaining entities.
+            break
+          end
+        end
+
+        if offset < normalized_text_length
+          code_point = normalized_text[offset]
+
+          # The first matching range wins, so the order of ranges matters.
+          ranges.each do |range|
+            if range.contains?(code_point.unpack("U").first)
+              char_weight = range.weight
+              break
+            end
+          end
+
+          weighted_count += char_weight
+
+          has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
+          char_count = code_point.char_length
+          offset += char_count
+          display_offset += char_count
+
+          if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
+            valid_offset += char_count
+          end
+        end
       end
+      # Difference in character count between the original and the normalized
+      # text; it is added to both end indices below.
+      normalized_text_offset = text.char_length - normalized_text.char_length
+      scaled_weighted_length = weighted_count / scale
+      is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
+      permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
 
-      length
+      return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
     end
 
-    # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
-    # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
-    # will allow quicker feedback.
-    #
-    # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
-    #
-    #   <tt>:too_long</tt>:: if the <tt>text</tt> is too long
-    #   <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
-    #   <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
-    def tweet_invalid?(text)
-      return :empty if !text || text.empty?
+    # Reports whether +text+ contains any entry of
+    # Twitter::Regex::INVALID_CHARACTERS.
+    #
+    # Returns false for nil or empty text, and true when the check raises
+    # ArgumentError (treated as a non-Unicode value).
+    def contains_invalid?(text)
+      return false if !text || text.empty?
       begin
-        return :too_long if tweet_length(text) > MAX_LENGTH
-        return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
+        return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
       rescue ArgumentError
         # non-Unicode value.
-        return :invalid_characters
+        return true
       end
-
       return false
     end
 
-    def valid_tweet_text?(text)
-      !tweet_invalid?(text)
-    end
-
     def valid_username?(username)
       return false if !username || username.empty?
 
       extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
       # Should extract the username minus the @ sign, hence the [1..-1]
@@ -99,9 +148,72 @@
                            valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
 
       return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
              (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
     end
+
+    # The methods below are deprecated and will be removed in a future release.
+    # NOTE(review): Deprecation is defined outside this file; presumably it
+    # supplies the +deprecate+ macro used below -- confirm.
+    extend Deprecation
+
+    # Upper bound that tweet_invalid? applies to tweet_length before reporting
+    # :too_long.
+    MAX_LENGTH_LEGACY = 140
+
+    # DEPRECATED: Please use parse_tweet instead.
+    #
+    # Returns the length of +text+ as it would be displayed: the number of code
+    # points in its Unicode NFC form (see http://www.unicode.org/reports/tr15),
+    # with every extracted URL counted as options[:short_url_length] characters
+    # instead of its own length. Counting the NFC form keeps the result the same
+    # no matter which equivalent form was transmitted. For example:
+    #
+    #     U+0065  Latin Small Letter E
+    # +   U+0301  Combining Acute Accent
+    # ----------
+    # =   2 characters, displayed as é (1 visual glyph)
+    #     … The NFC of {U+0065, U+0301} is {U+00E9}, a single character, so the length is 1
+    #
+    # Text that already contains U+00E9 is not changed by the normalization.
+    #
+    # options - Hash merged over DEFAULT_TCO_URL_LENGTHS; :short_url_length is
+    #           the number of characters each URL counts for.
+    def tweet_length(text, options = {})
+      options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+      code_point_count = text.to_nfc.unpack("U*").size
+
+      Twitter::Extractor.extract_urls_with_indices(text) do |url, url_start, url_end|
+        # Swap the URL's own span for the fixed shortened-URL length.
+        code_point_count -= url_end - url_start
+        code_point_count += options[:short_url_length] unless url.length == 0
+      end
+
+      code_point_count
+    end
+    deprecate :tweet_length, :parse_tweet
+
+    # DEPRECATED: Please use parse_tweet instead.
+    #
+    # Pre-validates <tt>text</tt> before it is posted to api.twitter.com so that
+    # obvious problems are reported quickly. The server may still reject a Tweet
+    # for other reasons.
+    #
+    # Returns <tt>false</tt> when no problem is found. Otherwise returns one of
+    # the following Symbols:
+    #
+    #   <tt>:empty</tt>:: the <tt>text</tt> is nil or empty
+    #   <tt>:too_long</tt>:: tweet_length of the <tt>text</tt> exceeds MAX_LENGTH_LEGACY
+    #   <tt>:invalid_characters</tt>:: the <tt>text</tt> contains one of Twitter::Regex::INVALID_CHARACTERS,
+    #                                  or a check raised ArgumentError (non-Unicode value)
+    def tweet_invalid?(text)
+      return :empty unless text
+      return :empty if text.empty?
+
+      begin
+        if tweet_length(text) > MAX_LENGTH_LEGACY
+          :too_long
+        elsif Twitter::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
+          :invalid_characters
+        else
+          false
+        end
+      rescue ArgumentError
+        # A non-Unicode value makes the checks above raise.
+        :invalid_characters
+      end
+    end
+    deprecate :tweet_invalid?, :parse_tweet
+
+    # DEPRECATED: Please use parse_tweet instead.
+    #
+    # Returns true when tweet_invalid? finds no problem with <tt>text</tt>,
+    # false otherwise.
+    def valid_tweet_text?(text)
+      failure_reason = tweet_invalid?(text)
+      !failure_reason
+    end
+    deprecate :valid_tweet_text?, :parse_tweet
 
     private
 
     def valid_match?(string, regex, optional=false)
       return (string && string.match(regex) && $~.to_s == string) unless optional