lib/govspeak.rb in govspeak-6.2.1 vs lib/govspeak.rb in govspeak-6.3.0
- old
+ new
@@ -1,13 +1,16 @@
require 'active_support/core_ext/hash'
require 'active_support/core_ext/array'
require 'erb'
+require 'govuk_publishing_components'
require 'htmlentities'
require 'kramdown'
require 'kramdown/parser/govuk'
+require 'nokogiri'
+require 'nokogumbo'
require 'rinku'
-require 'govuk_publishing_components'
+require 'sanitize'
require 'govspeak/header_extractor'
require 'govspeak/structured_header_extractor'
require 'govspeak/html_validator'
require 'govspeak/html_sanitizer'
require 'govspeak/kramdown_overrides'
@@ -101,15 +104,22 @@
@source.scan(regex).map(&:first).uniq.select { |id| id.match(UUID_REGEX) }
end
def preprocess(source)
source = Govspeak::BlockquoteExtraQuoteRemover.remove(source)
+ source = remove_forbidden_characters(source)
self.class.extensions.each do |_, regexp, block|
source.gsub!(regexp) {
instance_exec(*Regexp.last_match.captures, &block)
}
end
source
+ end
+
+ def remove_forbidden_characters(source)
+ # These are characters that are deemed not suitable for
+ # markup: https://www.w3.org/TR/unicode-xml/#Charlist
+ source.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
end
def self.extension(title, regexp = nil, &block)
regexp ||= %r${::#{title}}(.*?){:/#{title}}$m
@extensions << [title, regexp, block]