lib/twitter-text/validation.rb in twitter-text-2.1.0 vs lib/twitter-text/validation.rb in twitter-text-3.0.0
- old
+ new
@@ -1,5 +1,9 @@
+# Copyright 2018 Twitter, Inc.
+# Licensed under the Apache License, Version 2.0
+# http://www.apache.org/licenses/LICENSE-2.0
+
require 'unf'
module Twitter
module TwitterText
module Validation extend self
@@ -32,47 +36,65 @@
# Parse input text and return hash with descriptive parameters populated.
def parse_tweet(text, options = {})
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
normalized_text = text.to_nfc
- normalized_text_length = normalized_text.char_length
- unless (normalized_text_length > 0)
+ unless (normalized_text.length > 0)
ParseResults.empty()
end
scale = config.scale
max_weighted_tweet_length = config.max_weighted_tweet_length
scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
transformed_url_length = config.transformed_url_length * scale
ranges = config.ranges
url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
+ emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []
has_invalid_chars = false
weighted_count = 0
offset = 0
display_offset = 0
valid_offset = 0
- while offset < normalized_text_length
+ while offset < normalized_text.codepoint_length
# Reset the default char weight each pass through the loop
char_weight = config.default_weight
+ entity_length = 0
+
url_entities.each do |url_entity|
if url_entity[:indices].first == offset
- url_length = url_entity[:indices].last - url_entity[:indices].first
+ entity_length = url_entity[:indices].last - url_entity[:indices].first
weighted_count += transformed_url_length
- offset += url_length
- display_offset += url_length
+ offset += entity_length
+ display_offset += entity_length
if weighted_count <= scaled_max_weighted_tweet_length
- valid_offset += url_length
+ valid_offset += entity_length
end
- # Finding a match breaks the loop; order of ranges matters.
+ # Finding a match breaks the loop
break
end
end
- if offset < normalized_text_length
+ emoji_entities.each do |emoji_entity|
+ if emoji_entity[:indices].first == offset
+ entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
+ weighted_count += char_weight # the default weight
+ offset += entity_length
+ display_offset += entity_length
+ if weighted_count <= scaled_max_weighted_tweet_length
+ valid_offset += entity_length
+ end
+ # Finding a match breaks the loop
+ break
+ end
+ end
+
+ next if entity_length > 0
+
+ if offset < normalized_text.codepoint_length
code_point = normalized_text[offset]
ranges.each do |range|
if range.contains?(code_point.unpack("U").first)
char_weight = range.weight
@@ -80,20 +102,22 @@
end
end
weighted_count += char_weight
- has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
- char_count = code_point.char_length
- offset += char_count
- display_offset += char_count
+ has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
+ codepoint_length = code_point.codepoint_length
+ offset += codepoint_length
+ display_offset += codepoint_length
+ # index += codepoint_length
if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
- valid_offset += char_count
+ valid_offset += codepoint_length
end
end
end
- normalized_text_offset = text.char_length - normalized_text.char_length
+
+ normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
scaled_weighted_length = weighted_count / scale
is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))