lib/twitter-text/validation.rb in twitter-text-2.0.2 vs lib/twitter-text/validation.rb in twitter-text-2.1.0
- old
+ new
@@ -1,225 +1,227 @@
require 'unf'
module Twitter
- module Validation extend self
- DEFAULT_TCO_URL_LENGTHS = {
- :short_url_length => 23,
- }
+ module TwitterText
+ module Validation extend self
+ # Defaults that parse_tweet and tweet_length merge the caller's options over;
+ # tweet_length counts :short_url_length for each non-empty extracted URL.
+ DEFAULT_TCO_URL_LENGTHS = {
+ :short_url_length => 23,
+ }
- # :weighted_length the weighted length of tweet based on weights specified in the config
- # :valid If tweet is valid
- # :permillage permillage of the tweet over the max length specified in config
- # :valid_range_start beginning of valid text
- # :valid_range_end End index of valid part of the tweet text (inclusive)
- # :display_range_start beginning index of display text
- # :display_range_end end index of display text (inclusive)
- class ParseResults < Hash
+ # Hash-backed result returned by parse_tweet. Recognised keys:
+ #
+ # :weighted_length the weighted length of tweet based on weights specified in the config
+ # :valid If tweet is valid
+ # :permillage permillage of the tweet over the max length specified in config
+ # :valid_range_start beginning of valid text
+ # :valid_range_end End index of valid part of the tweet text (inclusive)
+ # :display_range_start beginning index of display text
+ # :display_range_end end index of display text (inclusive)
+ class ParseResults < Hash
- RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]
+ RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]
- def self.empty
- return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
- end
+ # Result for empty text: zero length and permillage, valid, every range bound 0.
+ def self.empty
+ return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
+ end
- def initialize(params = {})
- RESULT_PARAMS.each do |key|
- super[key] = params[key] if params.key?(key)
+ # Copies the keys listed in RESULT_PARAMS from +params+; any other key is ignored.
+ def initialize(params = {})
+ # Initialise the underlying Hash exactly once and with no default value.
+ # Writing `super[key] = ...` inside the loop re-ran Hash#initialize(params)
+ # per key, which installed +params+ as the default returned for missing keys.
+ super()
+ RESULT_PARAMS.each do |key|
+ self[key] = params[key] if params.key?(key)
+ end
end
end
- end
- # Parse input text and return hash with descriptive parameters populated.
- def parse_tweet(text, options = {})
- options = DEFAULT_TCO_URL_LENGTHS.merge(options)
- config = options[:config] || Twitter::Configuration.default_configuration
- normalized_text = text.to_nfc
- normalized_text_length = normalized_text.char_length
- unless (normalized_text_length > 0)
- ParseResults.empty()
- end
+ # Parse input text and return hash with descriptive parameters populated.
+ #
+ # text - the tweet text; it is normalized with to_nfc before counting.
+ # options - merged over DEFAULT_TCO_URL_LENGTHS; options[:config] supplies the configuration
+ # (default: Twitter::TwitterText::Configuration.default_configuration).
+ #
+ # Returns a ParseResults. Each extracted URL counts as config.transformed_url_length; every
+ # other character counts as the weight of the first config range containing its code point,
+ # or config.default_weight when no range matches. The reported range ends are shifted by the
+ # difference in character length between the original and the normalized text.
+ # Empty text yields ParseResults.empty.
+ def parse_tweet(text, options = {})
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+ config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
+ normalized_text = text.to_nfc
+ normalized_text_length = normalized_text.char_length
+ # Without the explicit return the empty result was built and discarded, and the
+ # method fell through to report display/valid range ends of -1.
+ return ParseResults.empty() unless normalized_text_length > 0
- scale = config.scale
- max_weighted_tweet_length = config.max_weighted_tweet_length
- scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
- transformed_url_length = config.transformed_url_length * scale
- ranges = config.ranges
+ scale = config.scale
+ max_weighted_tweet_length = config.max_weighted_tweet_length
+ scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
+ transformed_url_length = config.transformed_url_length * scale
+ ranges = config.ranges
- url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text)
+ url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
- has_invalid_chars = false
- weighted_count = 0
- offset = 0
- display_offset = 0
- valid_offset = 0
+ has_invalid_chars = false
+ weighted_count = 0
+ offset = 0
+ display_offset = 0
+ valid_offset = 0
- while offset < normalized_text_length
- # Reset the default char weight each pass through the loop
- char_weight = config.default_weight
- url_entities.each do |url_entity|
- if url_entity[:indices].first == offset
- url_length = url_entity[:indices].last - url_entity[:indices].first
- weighted_count += transformed_url_length
- offset += url_length
- display_offset += url_length
- if weighted_count <= scaled_max_weighted_tweet_length
- valid_offset += url_length
+ while offset < normalized_text_length
+ # Reset the default char weight each pass through the loop
+ char_weight = config.default_weight
+ url_entities.each do |url_entity|
+ if url_entity[:indices].first == offset
+ url_length = url_entity[:indices].last - url_entity[:indices].first
+ weighted_count += transformed_url_length
+ offset += url_length
+ display_offset += url_length
+ if weighted_count <= scaled_max_weighted_tweet_length
+ valid_offset += url_length
+ end
+ # Stop after the first URL entity that starts at this offset.
+ break
end
- # Finding a match breaks the loop; order of ranges matters.
- break
end
- end
- if offset < normalized_text_length
- code_point = normalized_text[offset]
+ if offset < normalized_text_length
+ code_point = normalized_text[offset]
- ranges.each do |range|
- if range.contains?(code_point.unpack("U").first)
- char_weight = range.weight
- break
+ ranges.each do |range|
+ if range.contains?(code_point.unpack("U").first)
+ char_weight = range.weight
+ break
+ end
end
- end
- weighted_count += char_weight
+ weighted_count += char_weight
- has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
- char_count = code_point.char_length
- offset += char_count
- display_offset += char_count
+ has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
+ char_count = code_point.char_length
+ offset += char_count
+ display_offset += char_count
- if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
- valid_offset += char_count
+ if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
+ valid_offset += char_count
+ end
end
end
+ normalized_text_offset = text.char_length - normalized_text.char_length
+ scaled_weighted_length = weighted_count / scale
+ is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
+ permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
+
+ return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
end
- normalized_text_offset = text.char_length - normalized_text.char_length
- scaled_weighted_length = weighted_count / scale
- is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
- permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
- return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
- end
-
- def contains_invalid?(text)
- return false if !text || text.empty?
- begin
- return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
- rescue ArgumentError
- # non-Unicode value.
- return true
+ # True when +text+ includes any entry of Regex::INVALID_CHARACTERS, or when the
+ # check raises ArgumentError (treated as a non-Unicode value).
+ # nil or empty +text+ is never reported as invalid.
+ def contains_invalid?(text)
+ return false unless text && !text.empty?
+ begin
+ Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |bad_char| text.include?(bad_char) }
+ rescue ArgumentError
+ # non-Unicode value.
+ true
+ end
+ end
- return false
- end
- def valid_username?(username)
- return false if !username || username.empty?
+ # True when the mention extractor finds exactly one screen name in +username+
+ # and that name equals +username+ without its first character.
+ def valid_username?(username)
+ return false unless username && !username.empty?
- extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
- # Should extract the username minus the @ sign, hence the [1..-1]
- extracted.size == 1 && extracted.first == username[1..-1]
- end
+ screen_names = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
+ # The extractor is expected to drop the leading @, so compare against [1..-1].
+ screen_names.size == 1 && screen_names.first == username[1..-1]
+ end
- VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
- def valid_list?(username_list)
- match = username_list.match(VALID_LIST_RE)
- # Must have matched and had nothing before or after
- !!(match && match[1] == "" && match[4] && !match[4].empty?)
- end
+ # Regex[:valid_mention_or_list] anchored so that it has to cover the whole string.
+ VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
+ # True when +username_list+ as a whole matches VALID_LIST_RE with an empty first
+ # capture and a non-empty fourth capture (presumably the list part of "@user/list" --
+ # confirm against Regex[:valid_mention_or_list]). nil or empty input is false.
+ def valid_list?(username_list)
+ # Same guard as the sibling validators; a nil argument used to raise
+ # NoMethodError on #match instead of answering false.
+ return false if !username_list || username_list.empty?
+ match = username_list.match(VALID_LIST_RE)
+ # Must have matched and had nothing before or after
+ !!(match && match[1] == "" && match[4] && !match[4].empty?)
+ end
- def valid_hashtag?(hashtag)
- return false if !hashtag || hashtag.empty?
+ # True when the hashtag extractor finds exactly one hashtag in +hashtag+
+ # and it equals +hashtag+ without its first character.
+ def valid_hashtag?(hashtag)
+ return false unless hashtag && !hashtag.empty?
- extracted = Twitter::Extractor.extract_hashtags(hashtag)
- # Should extract the hashtag minus the # sign, hence the [1..-1]
- extracted.size == 1 && extracted.first == hashtag[1..-1]
- end
+ tags = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
+ # The extractor is expected to drop the leading #, so compare against [1..-1].
+ tags.size == 1 && tags.first == hashtag[1..-1]
+ end
- def valid_url?(url, unicode_domains=true, require_protocol=true)
- return false if !url || url.empty?
+ # Checks whether +url+, taken as a whole, is a valid URL.
+ #
+ # url - candidate string; nil or empty is invalid.
+ # unicode_domains - when truthy the authority is checked against Regex[:validate_url_unicode_authority],
+ # otherwise against Regex[:validate_url_authority].
+ # require_protocol - when truthy the scheme must be present and be http or https (case-insensitive).
+ #
+ # Returns true when valid; false or nil otherwise.
+ def valid_url?(url, unicode_domains=true, require_protocol=true)
+ return false if !url || url.empty?
- url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
- return false unless (url_parts && url_parts.to_s == url)
+ url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
+ # The match has to consume the entire string, not just a prefix or substring.
+ return false unless (url_parts && url_parts.to_s == url)
- scheme, authority, path, query, fragment = url_parts.captures
+ scheme, authority, path, query, fragment = url_parts.captures
- return false unless ((!require_protocol ||
- (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
- valid_match?(path, Twitter::Regex[:validate_url_path]) &&
- valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
- valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
+ # The path must match; query and fragment are optional (third argument true), so a
+ # missing one passes. The scheme is only checked when require_protocol is set.
+ return false unless ((!require_protocol ||
+ (valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
+ valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path]) &&
+ valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true) &&
+ valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true))
- return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
- (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
- end
+ # Exactly one of the two authority patterns applies, selected by unicode_domains.
+ return (unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority])) ||
+ (!unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority]))
+ end
- # These methods are deprecated, will be removed in future.
- extend Deprecation
+ # These methods are deprecated, will be removed in future.
+ extend Deprecation
- MAX_LENGTH_LEGACY = 140
+ # Upper bound that tweet_invalid? compares tweet_length(text) against.
+ MAX_LENGTH_LEGACY = 140
- # DEPRECATED: Please use parse_text instead.
- #
- # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
- # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
- # string no matter which actual form was transmitted. For example:
- #
- # U+0065 Latin Small Letter E
- # + U+0301 Combining Acute Accent
- # ----------
- # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
- # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
- #
- # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
- #
- def tweet_length(text, options = {})
- options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+ # DEPRECATED: Please use parse_tweet instead.
+ #
+ # Returns the length of the string as it would be displayed, i.e. the number of code points in its
+ # Unicode NFC form (See: http://www.unicode.org/reports/tr15), so that the count does not depend on
+ # which form was actually transmitted. For example:
+ #
+ # U+0065 Latin Small Letter E
+ # + U+0301 Combining Acute Accent
+ # ----------
+ # = 2 characters, displayed as é (1 visual glyph)
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
+ #
+ # A string that already contains U+00E9 is left unchanged by the canonicalization.
+ #
+ # Every non-empty URL extracted from +text+ is counted as options[:short_url_length]
+ # instead of its own length.
+ def tweet_length(text, options = {})
+ settings = DEFAULT_TCO_URL_LENGTHS.merge(options)
- length = text.to_nfc.unpack("U*").length
+ code_point_count = text.to_nfc.unpack("U*").length
- Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
- length += start_position - end_position
- length += options[:short_url_length] if url.length > 0
+ Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, url_start, url_end|
+ # Swap the URL's own span for the fixed shortened-URL length.
+ code_point_count -= url_end - url_start
+ code_point_count += settings[:short_url_length] if url.length > 0
+ end
+
+ code_point_count
end
+ deprecate :tweet_length, :parse_tweet
- length
- end
- deprecate :tweet_length, :parse_tweet
+ # DEPRECATED: Please use parse_tweet instead.
+ #
+ # Pre-validates <tt>text</tt> before it is posted to api.twitter.com so that obvious problems get quicker
+ # feedback. A Tweet can still fail for several server-side reasons that are not checked here.
+ #
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
+ #
+ # <tt>:too_long</tt>:: if tweet_length of the <tt>text</tt> exceeds MAX_LENGTH_LEGACY
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
+ def tweet_invalid?(text)
+ return :empty unless text && !text.empty?
+ begin
+ return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
+ disallowed = Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |invalid_char| text.include?(invalid_char) }
+ rescue ArgumentError
+ # non-Unicode value.
+ disallowed = true
+ end
- # DEPRECATED: Please use parse_text instead.
- #
- # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
- # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
- # will allow quicker feedback.
- #
- # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
- #
- # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
- # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
- # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
- def tweet_invalid?(text)
- return :empty if !text || text.empty?
- begin
- return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
- return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
- rescue ArgumentError
- # non-Unicode value.
- return :invalid_characters
+ disallowed ? :invalid_characters : false
end
+ deprecate :tweet_invalid?, :parse_tweet
- return false
- end
- deprecate :tweet_invalid?, :parse_tweet
+ # Boolean inverse of tweet_invalid?: true when tweet_invalid? returns false for +text+,
+ # false when it returns one of its failure Symbols.
+ def valid_tweet_text?(text)
+ !tweet_invalid?(text)
+ end
+ deprecate :valid_tweet_text?, :parse_tweet
- def valid_tweet_text?(text)
- !tweet_invalid?(text)
- end
- deprecate :valid_tweet_text?, :parse_tweet
+ private
- private
+ # Tests whether +regex+ matches +string+ in its entirety.
+ #
+ # When +optional+ is false a missing string or a failed match yields a falsy value.
+ # When +optional+ is true a missing string is accepted and the result is always true or false.
+ def valid_match?(string, regex, optional=false)
+ # An absent string is acceptable only for optional components.
+ return (optional ? true : string) unless string
- def valid_match?(string, regex, optional=false)
- return (string && string.match(regex) && $~.to_s == string) unless optional
-
- !(string && (!string.match(regex) || $~.to_s != string))
+ found = string.match(regex)
+ covers_whole = found && found.to_s == string
+ optional ? !!covers_whole : covers_whole
+ end
end
end
end