lib/govspeak.rb in govspeak-6.2.1 vs lib/govspeak.rb in govspeak-6.3.0

- old
+ new

@@ -1,13 +1,16 @@ require 'active_support/core_ext/hash' require 'active_support/core_ext/array' require 'erb' +require 'govuk_publishing_components' require 'htmlentities' require 'kramdown' require 'kramdown/parser/govuk' +require 'nokogiri' +require 'nokogumbo' require 'rinku' -require 'govuk_publishing_components' +require 'sanitize' require 'govspeak/header_extractor' require 'govspeak/structured_header_extractor' require 'govspeak/html_validator' require 'govspeak/html_sanitizer' require 'govspeak/kramdown_overrides' @@ -101,15 +104,22 @@ @source.scan(regex).map(&:first).uniq.select { |id| id.match(UUID_REGEX) } end def preprocess(source) source = Govspeak::BlockquoteExtraQuoteRemover.remove(source) + source = remove_forbidden_characters(source) self.class.extensions.each do |_, regexp, block| source.gsub!(regexp) { instance_exec(*Regexp.last_match.captures, &block) } end source + end + + def remove_forbidden_characters(source) + # These are characters that are deemed not suitable for + # markup: https://www.w3.org/TR/unicode-xml/#Charlist + source.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '') end def self.extension(title, regexp = nil, &block) regexp ||= %r${::#{title}}(.*?){:/#{title}}$m @extensions << [title, regexp, block]