lib/onebox/helpers.rb in onebox-1.9.24 vs lib/onebox/helpers.rb in onebox-1.9.25
- old
+ new
@@ -1,12 +1,16 @@
# frozen_string_literal: true
+require "addressable"
+
module Onebox
module Helpers
class DownloadTooLarge < StandardError; end
+ IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com']
+
def self.symbolize_keys(hash)
return {} if hash.nil?
hash.inject({}) do |result, (key, value)|
new_key = key.is_a?(String) ? key.to_sym : key
@@ -21,16 +25,19 @@
end
def self.fetch_html_doc(url, headers = nil)
response = (fetch_response(url, nil, nil, headers) rescue nil)
doc = Nokogiri::HTML(response)
+ uri = URI(url)
- ignore_canonical = doc.at('meta[property="og:ignore_canonical"]')
- unless ignore_canonical && ignore_canonical['content'].to_s == 'true'
+ ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
+ should_ignore_canonical = IGNORE_CANONICAL_DOMAINS.map { |hostname| uri.hostname.match?(hostname) }.any?
+
+ unless (ignore_canonical_tag && ignore_canonical_tag['content'].to_s == 'true') || should_ignore_canonical
# prefer canonical link
canonical_link = doc.at('//link[@rel="canonical"]/@href')
- if canonical_link && "#{URI(canonical_link).host}#{URI(canonical_link).path}" != "#{URI(url).host}#{URI(url).path}"
+ if canonical_link && "#{URI(canonical_link).host}#{URI(canonical_link).path}" != "#{uri.host}#{uri.path}"
response = (fetch_response(canonical_link, nil, nil, headers) rescue nil)
doc = Nokogiri::HTML(response) if response
end
end
@@ -186,59 +193,28 @@
end
end
src
end
- RFC_3986_URI_REGEX ||= /^(?<scheme>([^:\/?#]+):)?(?<authority>\/\/([^\/?#]*))?(?<path>[^?#]*)(\?(?<query>[^#]*))?(#(?<fragment>.*))?$/
- DOUBLE_ESCAPED_REGEXP ||= /%25([0-9a-f]{2})/i
-
- # Percent-encodes a URI query parameter per RFC3986 - https://tools.ietf.org/html/rfc3986
- def self.uri_query_encode(query_string)
- return "" unless query_string
-
- # query can encode space to %20 OR +
- # + MUST be encoded as %2B
- # in RFC3968 both query and fragment are defined as:
- # = *( pchar / "/" / "?" )
- # CGI.escape turns space into + which is the most backward compatible
- # however it doesn't roundtrip through URI.unescape which prefers %20
- CGI.escape(query_string).gsub('%25', '%').gsub('+', '%20')
- end
-
# Percent-encodes a URI string per RFC3986 - https://tools.ietf.org/html/rfc3986
#
# The added implementation parses `url` with Addressable::URI.parse, passes each
# component (scheme, authority, path, query, fragment) through
# Addressable::URI.encode_component, rebuilds an Addressable::URI from the
# results and returns it as a String. Returns "" when `url` is nil or false.
#
# The second argument to encode_component is the set of characters that are
# left as-is (per the Addressable documentation). Path, query and fragment all
# add "%" to that set, so a "%" already present in the input is never
# re-encoded. The query set omits ! ' ( ) + ; / which the fragment set allows;
# the fragment set omits @ which the query set allows.
#
# NOTE(review): Addressable::URI.parse is not rescued here, so any error it
# raises for a malformed `url` reaches the caller - confirm callers expect that.
def self.uri_encode(url)
return "" unless url
- # parse uri into named matches, then reassemble properly encoded
- parts = url.match(RFC_3986_URI_REGEX)
+ uri = Addressable::URI.parse(url)
- encoded = ""
- encoded += parts[:scheme] unless parts[:scheme].nil?
- encoded += parts[:authority] unless parts[:authority].nil?
+ encoded_uri = Addressable::URI.new(
+ scheme: Addressable::URI.encode_component(uri.scheme, Addressable::URI::CharacterClasses::SCHEME),
+ authority: Addressable::URI.encode_component(uri.authority, Addressable::URI::CharacterClasses::AUTHORITY),
+ path: Addressable::URI.encode_component(uri.path, Addressable::URI::CharacterClasses::PATH + "\\%"),
+ query: Addressable::URI.encode_component(uri.query, "a-zA-Z0-9\\-\\.\\_\\~\\$\\&\\*\\,\\=\\:\\@\\?\\%"),
+ fragment: Addressable::URI.encode_component(uri.fragment, "a-zA-Z0-9\\-\\.\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:\\/\\?\\%")
+ )
- # path requires space to be encoded as %20 (NEVER +)
- # + should be left unencoded
- # URI::parse and URI::Generic.build don't like paths encoded with CGI.escape
- # URI.escape does not change / to %2F and : to %3A like CGI.escape
- encoded += URI.escape(parts[:path]) unless parts[:path].nil?
- encoded.gsub!(DOUBLE_ESCAPED_REGEXP, '%\1')
+ encoded_uri.to_s
+ end
- # each query parameter
- if !parts[:query].nil?
- query_string = parts[:query].split('&').map do |pair|
- # can optionally be separated by an =
- pair.split('=').map do |v|
- uri_query_encode(v)
- end.join('=')
- end.join('&')
- encoded += '?' + query_string
- end
-
- unless parts[:fragment].nil?
- encoded += '#' + uri_query_encode(parts[:fragment])&.gsub('%21%2F', '!/')
- end
-
- encoded
+ def self.uri_unencode(url) # thin wrapper: hands `url` to Addressable::URI.unencode and returns its result
+ Addressable::URI.unencode(url)
end
# Static HTML snippet: a placeholder container div wrapping a span that carries
# the "placeholder-icon video" classes. Takes no arguments and returns a String.
def self.video_placeholder_html
%q(<div class='onebox-placeholder-container'><span class='placeholder-icon video'></span></div>)
end