# coding: utf-8
require 'nokogiri'
require 'addressable/uri'
require 'simpleidn'
module StringTools
module HTML
# Minimum length of a string that can contain a link. It is compared against
# String#length, so it has to be an Integer: a String here makes the
# comparison raise ArgumentError. Anything shorter than the opening of an
# anchor tag cannot hold a link with a non-blank href.
TEXT_WITH_LINKS_MINIMUM_LENGTH = '<a href="'.length
# Public: Removes links that point to domains outside the whitelist.
# A removed link is replaced by its own content; the rest of the markup is kept.
#
# html    - String with HTML markup.
# options - Hash handed to LinksRemoveScrubber (default: {}):
#           :whitelist           - allowed domains; the scrubber fetches this
#                                  key without a default, so a missing key
#                                  raises KeyError once the input is long
#                                  enough to be parsed.
#           :remove_without_host - also unwrap links whose href has no host
#                                  (default: true).
#
# Examples
#
#   html = '<a href="https://www.yandex.ru">yandex</a>'
#
#   StringTools::HTML.remove_links(html, whitelist: ['google.com'])
#   # => 'yandex'
#
#   StringTools::HTML.remove_links(html, whitelist: ['yandex.ru'])
#   # => '<a href="https://www.yandex.ru">yandex</a>'
#
#   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
#   # => '<a href="https://www.yandex.ru">yandex</a>'
#
#   html = '<a href="https://yandex.ru">yandex</a>'
#
#   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
#   # => 'yandex'
#
# Returns String without links to external resources. When no link had to be
# removed the very same html object is returned, not a re-serialized copy.
def self.remove_links(html, options = {})
  # Too short to hold a link at all: skip parsing.
  return html if html.length < TEXT_WITH_LINKS_MINIMUM_LENGTH

  fragment = Nokogiri::HTML::DocumentFragment.parse(html)
  links_scrubber = LinksRemoveScrubber.new(options)
  fragment.css('a'.freeze).each { |anchor| links_scrubber.call(anchor) }

  # Nothing was unwrapped, so the caller's markup is handed back untouched.
  return html unless links_scrubber.done_changes?

  # NOTE(review): HTML_SERIALIZE_OPTIONS is not defined in the visible part of
  # this file; presumably it lives elsewhere in StringTools::HTML. Confirm.
  fragment.children.map { |child| child.serialize(HTML_SERIALIZE_OPTIONS) }.join
end
# Callable that unwraps anchor nodes whose target is not allowed, keeping the
# link's inner content in place. It mutates the nodes it is given and records
# whether it changed anything, so the caller can skip re-serialization when
# nothing was touched.
class LinksRemoveScrubber
  # options - Hash
  #           :whitelist           - collection of allowed domain names, queried
  #                                  with #include? (required; KeyError is
  #                                  raised when the key is missing).
  #           :remove_without_host - whether links whose href has no host are
  #                                  unwrapped as well (default: true).
  def initialize(options)
    @whitelist = options.fetch(:whitelist)
    @remove_without_host = options.fetch(:remove_without_host, true)
    @changed = false
  end

  # Returns true once at least one node has been unwrapped by #call.
  def done_changes?
    @changed
  end

  # Unwraps a single anchor node when its target is not allowed.
  #
  # node - anchor element; it is read with node['href'] and rewritten through
  #        #children and #swap.
  #
  # A node with a blank href is left alone. A node whose href has no host is
  # unwrapped only when :remove_without_host is enabled. A node whose host is
  # not whitelisted, or whose href cannot be parsed, is always unwrapped.
  def call(node)
    href = node['href']
    return if href.blank?

    host = Addressable::URI.parse(href).normalize.host
    if host.nil?
      replace_with_content(node) if @remove_without_host
    elsif !whitelisted?(SimpleIDN.to_unicode(host))
      # The whitelist is compared against the result of SimpleIDN.to_unicode.
      replace_with_content(node)
    end
  rescue Addressable::URI::InvalidURIError
    replace_with_content(node)
  end

  # Checks whether a domain, or one of its parent domains, is whitelisted.
  #
  # domain - String host name, e.g. 'www.yandex.ru'.
  #
  # Candidates are built right to left on label boundaries ('yandex.ru', then
  # 'www.yandex.ru'), so 'notyandex.ru' does not match a whitelisted
  # 'yandex.ru'. The bare last label of a multi-label host ('ru') is never
  # tried, so whitelisting a TLD does not whitelist everything under it.
  #
  # Returns true or false.
  def whitelisted?(domain)
    labels = domain.split('.'.freeze)
    return false if labels.empty?
    # A single-label host (e.g. 'localhost') has no parent suffix to walk, so
    # it is matched verbatim; otherwise it could never be whitelisted.
    return @whitelist.include?(labels.first) if labels.length == 1

    candidate = labels.last # com, ru ...
    labels[0...-1].reverse_each do |label|
      candidate = "#{label}.#{candidate}"
      return true if @whitelist.include?(candidate)
    end
    false
  end

  private

  # Swaps the node for its own children, which drops the <a> wrapper but keeps
  # whatever it contained, and records that the document was modified.
  def replace_with_content(node)
    node.swap(node.children)
    @changed = true
  end
end
end
end