lib/onebox/helpers.rb in onebox-1.9.24 vs lib/onebox/helpers.rb in onebox-1.9.25

- old
+ new

@@ -1,12 +1,16 @@
 # frozen_string_literal: true
+require "addressable"
+
 module Onebox
   module Helpers
     class DownloadTooLarge < StandardError; end
+    IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com']
+
     def self.symbolize_keys(hash)
       return {} if hash.nil?
       hash.inject({}) do |result, (key, value)|
         new_key = key.is_a?(String) ? key.to_sym : key
@@ -21,16 +25,19 @@
     end
     def self.fetch_html_doc(url, headers = nil)
       response = (fetch_response(url, nil, nil, headers) rescue nil)
       doc = Nokogiri::HTML(response)
+      uri = URI(url)
-      ignore_canonical = doc.at('meta[property="og:ignore_canonical"]')
-      unless ignore_canonical && ignore_canonical['content'].to_s == 'true'
+      ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
+      should_ignore_canonical = IGNORE_CANONICAL_DOMAINS.map { |hostname| uri.hostname.match?(hostname) }.any?
+
+      unless (ignore_canonical_tag && ignore_canonical_tag['content'].to_s == 'true') || should_ignore_canonical
         # prefer canonical link
         canonical_link = doc.at('//link[@rel="canonical"]/@href')
-        if canonical_link && "#{URI(canonical_link).host}#{URI(canonical_link).path}" != "#{URI(url).host}#{URI(url).path}"
+        if canonical_link && "#{URI(canonical_link).host}#{URI(canonical_link).path}" != "#{uri.host}#{uri.path}"
           response = (fetch_response(canonical_link, nil, nil, headers) rescue nil)
           doc = Nokogiri::HTML(response) if response
         end
       end
@@ -186,59 +193,28 @@
         end
       end
       src
     end
-    RFC_3986_URI_REGEX ||= /^(?<scheme>([^:\/?#]+):)?(?<authority>\/\/([^\/?#]*))?(?<path>[^?#]*)(\?(?<query>[^#]*))?(#(?<fragment>.*))?$/
-    DOUBLE_ESCAPED_REGEXP ||= /%25([0-9a-f]{2})/i
-
-    # Percent-encodes a URI query parameter per RFC3986 - https://tools.ietf.org/html/rfc3986
-    def self.uri_query_encode(query_string)
-      return "" unless query_string
-
-      # query can encode space to %20 OR +
-      # + MUST be encoded as %2B
-      # in RFC3968 both query and fragment are defined as:
-      # = *( pchar / "/" / "?" )
-      # CGI.escape turns space into + which is the most backward compatible
-      # however it doesn't roundtrip through URI.unescape which prefers %20
-      CGI.escape(query_string).gsub('%25', '%').gsub('+', '%20')
-    end
-
     # Percent-encodes a URI string per RFC3986 - https://tools.ietf.org/html/rfc3986
     def self.uri_encode(url)
       return "" unless url
-      # parse uri into named matches, then reassemble properly encoded
-      parts = url.match(RFC_3986_URI_REGEX)
+      uri = Addressable::URI.parse(url)
-      encoded = ""
-      encoded += parts[:scheme] unless parts[:scheme].nil?
-      encoded += parts[:authority] unless parts[:authority].nil?
+      encoded_uri = Addressable::URI.new(
+        scheme: Addressable::URI.encode_component(uri.scheme, Addressable::URI::CharacterClasses::SCHEME),
+        authority: Addressable::URI.encode_component(uri.authority, Addressable::URI::CharacterClasses::AUTHORITY),
+        path: Addressable::URI.encode_component(uri.path, Addressable::URI::CharacterClasses::PATH + "\\%"),
+        query: Addressable::URI.encode_component(uri.query, "a-zA-Z0-9\\-\\.\\_\\~\\$\\&\\*\\,\\=\\:\\@\\?\\%"),
+        fragment: Addressable::URI.encode_component(uri.fragment, "a-zA-Z0-9\\-\\.\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:\\/\\?\\%")
+      )
-      # path requires space to be encoded as %20 (NEVER +)
-      # + should be left unencoded
-      # URI::parse and URI::Generic.build don't like paths encoded with CGI.escape
-      # URI.escape does not change / to %2F and : to %3A like CGI.escape
-      encoded += URI.escape(parts[:path]) unless parts[:path].nil?
-      encoded.gsub!(DOUBLE_ESCAPED_REGEXP, '%\1')
+      encoded_uri.to_s
+    end
-      # each query parameter
-      if !parts[:query].nil?
-        query_string = parts[:query].split('&').map do |pair|
-          # can optionally be separated by an =
-          pair.split('=').map do |v|
-            uri_query_encode(v)
-          end.join('=')
-        end.join('&')
-        encoded += '?' + query_string
-      end
-
-      unless parts[:fragment].nil?
-        encoded += '#' + uri_query_encode(parts[:fragment])&.gsub('%21%2F', '!/')
-      end
-
-      encoded
+    def self.uri_unencode(url)
+      Addressable::URI.unencode(url)
     end
     def self.video_placeholder_html
       "<div class='onebox-placeholder-container'><span class='placeholder-icon video'></span></div>"
     end