lib/twitter-text/validation.rb in twitter-text-2.1.0 vs lib/twitter-text/validation.rb in twitter-text-3.0.0

- old
+ new

@@ -1,5 +1,9 @@ +# Copyright 2018 Twitter, Inc. +# Licensed under the Apache License, Version 2.0 +# http://www.apache.org/licenses/LICENSE-2.0 + require 'unf' module Twitter module TwitterText module Validation extend self @@ -32,47 +36,65 @@ # Parse input text and return hash with descriptive parameters populated. def parse_tweet(text, options = {}) options = DEFAULT_TCO_URL_LENGTHS.merge(options) config = options[:config] || Twitter::TwitterText::Configuration.default_configuration normalized_text = text.to_nfc - normalized_text_length = normalized_text.char_length - unless (normalized_text_length > 0) + unless (normalized_text.length > 0) ParseResults.empty() end scale = config.scale max_weighted_tweet_length = config.max_weighted_tweet_length scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale transformed_url_length = config.transformed_url_length * scale ranges = config.ranges url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text) + emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : [] has_invalid_chars = false weighted_count = 0 offset = 0 display_offset = 0 valid_offset = 0 - while offset < normalized_text_length + while offset < normalized_text.codepoint_length # Reset the default char weight each pass through the loop char_weight = config.default_weight + entity_length = 0 + url_entities.each do |url_entity| if url_entity[:indices].first == offset - url_length = url_entity[:indices].last - url_entity[:indices].first + entity_length = url_entity[:indices].last - url_entity[:indices].first weighted_count += transformed_url_length - offset += url_length - display_offset += url_length + offset += entity_length + display_offset += entity_length if weighted_count <= scaled_max_weighted_tweet_length - valid_offset += url_length + valid_offset += entity_length end - # Finding a match breaks the loop; order of ranges matters. + # Finding a match breaks the loop break end end - if offset < normalized_text_length + emoji_entities.each do |emoji_entity| + if emoji_entity[:indices].first == offset + entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first + weighted_count += char_weight # the default weight + offset += entity_length + display_offset += entity_length + if weighted_count <= scaled_max_weighted_tweet_length + valid_offset += entity_length + end + # Finding a match breaks the loop + break + end + end + + next if entity_length > 0 + + if offset < normalized_text.codepoint_length code_point = normalized_text[offset] ranges.each do |range| if range.contains?(code_point.unpack("U").first) char_weight = range.weight @@ -80,20 +102,22 @@ end end weighted_count += char_weight - has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars - char_count = code_point.char_length - offset += char_count - display_offset += char_count + has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars + codepoint_length = code_point.codepoint_length + offset += codepoint_length + display_offset += codepoint_length + # index += codepoint_length if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length) - valid_offset += char_count + valid_offset += codepoint_length end end end - normalized_text_offset = text.char_length - normalized_text.char_length + + normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length scaled_weighted_length = weighted_count / scale is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))