lib/twitter-text/validation.rb in twitter-text-2.0.2 vs lib/twitter-text/validation.rb in twitter-text-2.1.0

- old
+ new

@@ -1,225 +1,227 @@ require 'unf' module Twitter - module Validation extend self - DEFAULT_TCO_URL_LENGTHS = { - :short_url_length => 23, - } + module TwitterText + module Validation extend self + DEFAULT_TCO_URL_LENGTHS = { + :short_url_length => 23, + } - # :weighted_length the weighted length of tweet based on weights specified in the config - # :valid If tweet is valid - # :permillage permillage of the tweet over the max length specified in config - # :valid_range_start beginning of valid text - # :valid_range_end End index of valid part of the tweet text (inclusive) - # :display_range_start beginning index of display text - # :display_range_end end index of display text (inclusive) - class ParseResults < Hash + # :weighted_length the weighted length of tweet based on weights specified in the config + # :valid If tweet is valid + # :permillage permillage of the tweet over the max length specified in config + # :valid_range_start beginning of valid text + # :valid_range_end End index of valid part of the tweet text (inclusive) + # :display_range_start beginning index of display text + # :display_range_end end index of display text (inclusive) + class ParseResults < Hash - RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end] + RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end] - def self.empty - return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0) - end + def self.empty + return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0) + end - def initialize(params = {}) - RESULT_PARAMS.each do |key| - super[key] = params[key] if params.key?(key) + def initialize(params = {}) + RESULT_PARAMS.each do |key| + super[key] = params[key] if params.key?(key) + end end end - end - # Parse input text and return hash with descriptive parameters populated. - def parse_tweet(text, options = {}) - options = DEFAULT_TCO_URL_LENGTHS.merge(options) - config = options[:config] || Twitter::Configuration.default_configuration - normalized_text = text.to_nfc - normalized_text_length = normalized_text.char_length - unless (normalized_text_length > 0) - ParseResults.empty() - end + # Parse input text and return hash with descriptive parameters populated. + def parse_tweet(text, options = {}) + options = DEFAULT_TCO_URL_LENGTHS.merge(options) + config = options[:config] || Twitter::TwitterText::Configuration.default_configuration + normalized_text = text.to_nfc + normalized_text_length = normalized_text.char_length + unless (normalized_text_length > 0) + ParseResults.empty() + end - scale = config.scale - max_weighted_tweet_length = config.max_weighted_tweet_length - scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale - transformed_url_length = config.transformed_url_length * scale - ranges = config.ranges + scale = config.scale + max_weighted_tweet_length = config.max_weighted_tweet_length + scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale + transformed_url_length = config.transformed_url_length * scale + ranges = config.ranges - url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text) + url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text) - has_invalid_chars = false - weighted_count = 0 - offset = 0 - display_offset = 0 - valid_offset = 0 + has_invalid_chars = false + weighted_count = 0 + offset = 0 + display_offset = 0 + valid_offset = 0 - while offset < normalized_text_length - # Reset the default char weight each pass through the loop - char_weight = config.default_weight - url_entities.each do |url_entity| - if url_entity[:indices].first == offset - url_length = url_entity[:indices].last - url_entity[:indices].first - weighted_count += transformed_url_length - offset += url_length - display_offset += url_length - if weighted_count <= scaled_max_weighted_tweet_length - valid_offset += url_length + while offset < normalized_text_length + # Reset the default char weight each pass through the loop + char_weight = config.default_weight + url_entities.each do |url_entity| + if url_entity[:indices].first == offset + url_length = url_entity[:indices].last - url_entity[:indices].first + weighted_count += transformed_url_length + offset += url_length + display_offset += url_length + if weighted_count <= scaled_max_weighted_tweet_length + valid_offset += url_length + end + # Finding a match breaks the loop; order of ranges matters. + break end - # Finding a match breaks the loop; order of ranges matters. - break end - end - if offset < normalized_text_length - code_point = normalized_text[offset] + if offset < normalized_text_length + code_point = normalized_text[offset] - ranges.each do |range| - if range.contains?(code_point.unpack("U").first) - char_weight = range.weight - break + ranges.each do |range| + if range.contains?(code_point.unpack("U").first) + char_weight = range.weight + break + end end - end - weighted_count += char_weight + weighted_count += char_weight - has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars - char_count = code_point.char_length - offset += char_count - display_offset += char_count + has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars + char_count = code_point.char_length + offset += char_count + display_offset += char_count - if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length) - valid_offset += char_count + if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length) + valid_offset += char_count + end end end + normalized_text_offset = text.char_length - normalized_text.char_length + scaled_weighted_length = weighted_count / scale + is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) + permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length + + return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1)) end - normalized_text_offset = text.char_length - normalized_text.char_length - scaled_weighted_length = weighted_count / scale - is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) - permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length - return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1)) - end - - def contains_invalid?(text) - return false if !text || text.empty? - begin - return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } - rescue ArgumentError - # non-Unicode value. - return true + def contains_invalid?(text) + return false if !text || text.empty? + begin + return true if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } + rescue ArgumentError + # non-Unicode value. + return true + end + return false end - return false - end - def valid_username?(username) - return false if !username || username.empty? + def valid_username?(username) + return false if !username || username.empty? - extracted = Twitter::Extractor.extract_mentioned_screen_names(username) - # Should extract the username minus the @ sign, hence the [1..-1] - extracted.size == 1 && extracted.first == username[1..-1] - end + extracted = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username) + # Should extract the username minus the @ sign, hence the [1..-1] + extracted.size == 1 && extracted.first == username[1..-1] + end - VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o - def valid_list?(username_list) - match = username_list.match(VALID_LIST_RE) - # Must have matched and had nothing before or after - !!(match && match[1] == "" && match[4] && !match[4].empty?) - end + VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o + def valid_list?(username_list) + match = username_list.match(VALID_LIST_RE) + # Must have matched and had nothing before or after + !!(match && match[1] == "" && match[4] && !match[4].empty?) + end - def valid_hashtag?(hashtag) - return false if !hashtag || hashtag.empty? + def valid_hashtag?(hashtag) + return false if !hashtag || hashtag.empty? - extracted = Twitter::Extractor.extract_hashtags(hashtag) - # Should extract the hashtag minus the # sign, hence the [1..-1] - extracted.size == 1 && extracted.first == hashtag[1..-1] - end + extracted = Twitter::TwitterText::Extractor.extract_hashtags(hashtag) + # Should extract the hashtag minus the # sign, hence the [1..-1] + extracted.size == 1 && extracted.first == hashtag[1..-1] + end - def valid_url?(url, unicode_domains=true, require_protocol=true) - return false if !url || url.empty? + def valid_url?(url, unicode_domains=true, require_protocol=true) + return false if !url || url.empty? - url_parts = url.match(Twitter::Regex[:validate_url_unencoded]) - return false unless (url_parts && url_parts.to_s == url) + url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded]) + return false unless (url_parts && url_parts.to_s == url) - scheme, authority, path, query, fragment = url_parts.captures + scheme, authority, path, query, fragment = url_parts.captures - return false unless ((!require_protocol || - (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) && - valid_match?(path, Twitter::Regex[:validate_url_path]) && - valid_match?(query, Twitter::Regex[:validate_url_query], true) && - valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)) + return false unless ((!require_protocol || + (valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) && + valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path]) && + valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true) && + valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true)) - return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) || - (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority])) - end + return (unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority])) || + (!unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority])) + end - # These methods are deprecated, will be removed in future. - extend Deprecation + # These methods are deprecated, will be removed in future. + extend Deprecation - MAX_LENGTH_LEGACY = 140 + MAX_LENGTH_LEGACY = 140 - # DEPRECATED: Please use parse_text instead. - # - # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC - # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a - # string no matter which actual form was transmitted. For example: - # - # U+0065 Latin Small Letter E - # + U+0301 Combining Acute Accent - # ---------- - # = 2 bytes, 2 characters, displayed as é (1 visual glyph) - # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1 - # - # The string could also contain U+00E9 already, in which case the canonicalization will not change the value. - # - def tweet_length(text, options = {}) - options = DEFAULT_TCO_URL_LENGTHS.merge(options) + # DEPRECATED: Please use parse_text instead. + # + # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC + # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a + # string no matter which actual form was transmitted. For example: + # + # U+0065 Latin Small Letter E + # + U+0301 Combining Acute Accent + # ---------- + # = 2 bytes, 2 characters, displayed as é (1 visual glyph) + # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1 + # + # The string could also contain U+00E9 already, in which case the canonicalization will not change the value. + # + def tweet_length(text, options = {}) + options = DEFAULT_TCO_URL_LENGTHS.merge(options) - length = text.to_nfc.unpack("U*").length + length = text.to_nfc.unpack("U*").length - Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position| - length += start_position - end_position - length += options[:short_url_length] if url.length > 0 + Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position| + length += start_position - end_position + length += options[:short_url_length] if url.length > 0 + end + + length end + deprecate :tweet_length, :parse_tweet - length - end - deprecate :tweet_length, :parse_tweet + # DEPRECATED: Please use parse_text instead. + # + # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation + # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation + # will allow quicker feedback. + # + # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned: + # + # <tt>:too_long</tt>:: if the <tt>text</tt> is too long + # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty + # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters + def tweet_invalid?(text) + return :empty if !text || text.empty? + begin + return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY + return :invalid_characters if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } + rescue ArgumentError + # non-Unicode value. + return :invalid_characters + end - # DEPRECATED: Please use parse_text instead. - # - # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation - # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation - # will allow quicker feedback. - # - # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned: - # - # <tt>:too_long</tt>:: if the <tt>text</tt> is too long - # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty - # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters - def tweet_invalid?(text) - return :empty if !text || text.empty? - begin - return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY - return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } - rescue ArgumentError - # non-Unicode value. - return :invalid_characters + return false end + deprecate :tweet_invalid?, :parse_tweet - return false - end - deprecate :tweet_invalid?, :parse_tweet + def valid_tweet_text?(text) + !tweet_invalid?(text) + end + deprecate :valid_tweet_text?, :parse_tweet - def valid_tweet_text?(text) - !tweet_invalid?(text) - end - deprecate :valid_tweet_text?, :parse_tweet + private - private + def valid_match?(string, regex, optional=false) + return (string && string.match(regex) && $~.to_s == string) unless optional - def valid_match?(string, regex, optional=false) - return (string && string.match(regex) && $~.to_s == string) unless optional - - !(string && (!string.match(regex) || $~.to_s != string)) + !(string && (!string.match(regex) || $~.to_s != string)) + end end end end