# = HTML Filter
#
# HTML Filter library can be used to sanitize and sterilize
# HTML. A good idea if you let users submit HTML in comments,
# for instance.
#
# HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson
#
# This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
# http://creativecommons.org/licenses/by-sa/2.5/
#
# Thanks to Jang Kim for adding support for single quoted attributes.
#
# == Reference
#
# * http://iamcal.com/publish/articles/php/processing_html/
# * http://iamcal.com/publish/articles/php/processing_html_part_2/
#
# == Author(s)
#
# * TransNoumena
# * George Moschovitis
# * James Britt
# * Cal Henderson
# * Jang Kim
#
# == Authors
#
# * Trans
#
# == Copying
#
# Copyright (c) 2007 7rans
require 'facets/multiton'
# = HtmlFilter
#
# HTML Filter library can be used to sanitize and sterilize
# HTML. A good idea if you let users submit HTML in comments,
# for instance.
#
# HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson
#
# This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
# http://creativecommons.org/licenses/by-sa/2.5/
#
# Thanks to Jang Kim for adding support for single quoted attributes.
#
# == Reference
#
# * http://iamcal.com/publish/articles/php/processing_html/
# * http://iamcal.com/publish/articles/php/processing_html_part_2/
class HtmlFilter
include Multiton
# tags and attributes that are allowed
#
# Eg.
#
# {
# 'a' => ['href', 'target'],
# 'b' => [],
# 'img' => ['src', 'width', 'height', 'alt']
# }
attr_accessor :allowed
# tags which should always be self-closing (e.g. "<img />")
attr_accessor :no_close
# tags which must always have separate opening and closing
# tags (e.g. "<b></b>")
attr_accessor :always_close
# attributes which should be checked for valid protocols
# (src,href)
attr_accessor :protocol_attributes
# protocols which are allowed (http, ftp, mailto)
attr_accessor :allowed_protocols
# tags which should be removed if they contain no content
# (e.g. "<b></b>" or "<b />")
attr_accessor :remove_blanks
# should we remove comments? (true, false)
attr_accessor :strip_comments
# should we try and make a b tag out of "b>" (true, false)
attr_accessor :always_make_tags
# entity control option (true, false)
attr_accessor :allow_numbered_entities
# entity control option (amp, gt, lt, quot, etc.)
attr_accessor :allowed_entities
# Default settings. Every key is the name of one of the setting accessors;
# #initialize starts from these values and applies the caller's overrides
# on top of them.
DEFAULT = {
  'allowed' => {
    'a'   => %w[href target],
    'b'   => %w[],
    'i'   => %w[],
    'img' => %w[src width height alt]
  },
  'no_close'                => %w[img br hr],
  'always_close'            => %w[a b],
  'protocol_attributes'     => %w[src href],
  'allowed_protocols'       => %w[http ftp mailto],
  'remove_blanks'           => %w[a b],
  'strip_comments'          => true,
  'always_make_tags'        => true,
  'allow_numbered_entities' => true,
  'allowed_entities'        => %w[amp gt lt quot]
}
# New html filter.
#
# options:: optional hash of setting overrides. Keys may be strings or
#           symbols (they are converted with +to_s+) and must name one of
#           the setting accessors, because each pair is applied by calling
#           the matching "<key>=" writer.
#
# Settings that are not overridden come from DEFAULT. The defaults are
# deep-copied first: DEFAULT.dup alone is a shallow copy, so the nested
# 'allowed' hash and the arrays would be shared by every instance and any
# in-place change made through an accessor would also change DEFAULT.
def initialize(options = nil)
  settings = Marshal.load(Marshal.dump(DEFAULT))
  (options || {}).each { |key, value| settings[key.to_s] = value }
  settings.each { |key, value| send("#{key}=", value) }
end
# Filter html string.
#
# Resets the internal tag counter, then feeds +data+ through the
# processing steps in a fixed order, each step receiving the previous
# step's result. Returns the result of the last step.
def filter(data)
  @tag_counts = {}
  steps = [
    :escape_comments,
    :balance_html,
    :check_tags,
    :process_remove_blanks,
    :validate_entities
  ]
  steps.inject(data) { |html, step| send(step, html) }
end
private
#
# internal tag counter
#
attr_reader :tag_counts
#
# Escapes the text inside every HTML comment, leaving the comment
# delimiters in place, and returns the result as a new string.
#
# check_tags later splits the document on "<...>" pairs, so a raw ">"
# inside a comment would otherwise end the comment "tag" early.
#
# The previous pattern was empty (it matched between every character and
# substituted nothing), so comments were never touched. Its "s" flag is
# also not PHP's dotall: in Ruby it selects Windows-31J encoding, which
# raises on non-ASCII UTF-8 input. "m" is the flag that lets "." span
# newlines.
#
def escape_comments(data)
  data.gsub(/<!--(.*?)-->/m) do
    "<!--#{escape_special_chars(strip_single($1))}-->"
  end
end
#
# Pairs up angle brackets so that every "<" has a matching ">" before the
# tags are inspected. Works on a copy of +data+ and returns it.
#
# With +always_make_tags+ set, stray brackets are completed into tags
# ("b>" becomes "<b>", "<b" becomes "<b>"). Otherwise stray brackets are
# turned into the entities &lt; and &gt; so they show up as text. The
# previous replacements in that branch re-inserted the raw brackets, which
# left the input unescaped.
#
def balance_html(data)
  data = data.dup
  if always_make_tags
    # try and form html
    data.gsub!(/>>+/, '>')
    data.gsub!(/<<+/, '<')
    data.gsub!(/^>/, '')
    data.gsub!(/<([^>]*?)(?=<|$)/, '<\1>')
    data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1<\2')
  else
    # escape stray brackets
    data.gsub!(/<([^>]*?)(?=<|$)/, '&lt;\1')
    data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1\2&gt;<')
    # the last regexp leaves a "<>" marker behind each escaped bracket
    # (the closing bracket is only looked at, not consumed, so that it
    # can start the next match); remove those markers again
    data.gsub!('<>', '')
  end
  data
end
#
# Replaces every "<...>" span with the result of process_tag, then appends
# a closing tag for each tag that +tag_counts+ still reports as open.
# Works on a copy of +data+ and returns it.
#
# The pattern uses the "m" flag so a tag may span several lines. The
# former "s" flag is an encoding flag in Ruby (Windows-31J), not dotall,
# and made matching raise on non-ASCII UTF-8 input. The appended closers
# also need their "</" prefix to be closing tags at all.
#
def check_tags(data)
  data = data.dup
  data.gsub!(/<(.*?)>/m) { process_tag(strip_single($1)) }
  tag_counts.each do |tag, cnt|
    cnt.times { data << "</#{tag}>" }
  end
  data
end
#
# Sanitizes the inside of a single tag.
#
# data:: the text that stood between "<" and ">".
#
# Returns the rewritten tag including its angle brackets, or '' when the
# tag has to be dropped. Reads the +allowed+, +no_close+, +always_close+,
# +protocol_attributes+ and +strip_comments+ settings, and keeps
# +tag_counts+ up to date with the number of currently open tags.
#
# Fixes relative to the previous version:
# * closing tags are emitted as "</name>" and only while the tag is
#   actually open (0 is truthy in Ruby, so the counter is compared
#   explicitly);
# * allowed opening tags are emitted with their allowed attributes
#   instead of being discarded after the attribute scan;
# * patterns are anchored to the whole string (\A, \z) and use "m"
#   instead of the "s" encoding flag, so multi-line tags and non-ASCII
#   input are handled.
#
def process_tag(data)
  # ending tags
  if (closing = /\A\/([a-z0-9]+)/i.match(data))
    name = closing[1].downcase
    if allowed.key?(name) && !no_close.include?(name) && tag_counts[name].to_i > 0
      tag_counts[name] -= 1
      return "</#{name}>"
    end
    return ''
  end

  # starting tags
  if (opening = /\A([a-z0-9]+)(.*?)(\/?)\z/mi.match(data))
    name = opening[1].downcase
    return '' unless allowed.key?(name)

    params = sanitized_attributes(name, opening[2])
    ending = opening[3]
    ending = ' /' if no_close.include?(name)
    ending = '' if always_close.include?(name)
    if ending.empty?
      # a tag that is not self-closing stays open until its closer arrives
      tag_counts[name] = tag_counts[name].to_i + 1
    else
      ending = ' /'
    end
    return "<#{name}#{params}#{ending}>"
  end

  # comments
  if /\A!--(.*)--\z/m =~ data
    return strip_comments ? '' : "<#{data}>"
  end

  # garbage, ignore it
  ''
end

# Builds the attribute string for tag +name+ from the raw attribute text
# +body+. Only attributes listed in allowed[name] are kept; values of
# +protocol_attributes+ go through process_param_protocol first. Values
# are always written in double quotes, so any double quote inside a value
# is turned into &quot; to keep it from ending the attribute.
def sanitized_attributes(name, body)
  quoted       = body.scan(/([a-z0-9]+)=(["'])(.*?)\2/mi)          # a="b" or a='b'
  unquoted     = body.scan(/([a-z0-9]+)(=)([^"\s']+)/i)            # a=b
  unterminated = body.scan(/([a-z0-9]+)=(["'])([^"']*?)\s*\z/i)    # a="b with no closing quote

  (unquoted + quoted + unterminated).inject('') do |params, (attr, _delim, value)|
    attr = attr.downcase
    next params unless allowed[name].include?(attr)
    value = process_param_protocol(value) if protocol_attributes.include?(attr)
    params + %( #{attr}="#{value.gsub('"', '&quot;')}")
  end
end
#
# Checks the protocol of an attribute value (for example an href).
#
# data:: raw attribute value.
#
# The value is first passed through decode_entities. If what precedes the
# first ":" is not listed in +allowed_protocols+, the protocol and the
# colon are cut off and "#" is put in front of the remainder. Returns the
# resulting string.
#
# The previous slice, data[0..size+1], kept the START of the string, i.e.
# it preserved the rejected protocol; the PHP original takes the substring
# AFTER the colon. The pattern is anchored with \A, and the "s" flag is
# gone because in Ruby it is an encoding flag that raises on non-ASCII
# UTF-8 input.
#
def process_param_protocol(data)
  data = decode_entities(data)
  if (matches = /\A([^:]+):/.match(data))
    protocol = matches[1]
    unless allowed_protocols.include?(protocol)
      data = '#' + data[(protocol.size + 1)..-1]
    end
  end
  data
end
#
# Drops content-free tags. For each tag name in +remove_blanks+ this
# removes an opening tag that is immediately followed by its closing tag,
# as well as the self-closed form of the tag. Returns a new string; the
# argument is left untouched.
#
def process_remove_blanks(data)
  remove_blanks.inject(data.dup) do |html, tag|
    empty_pair  = /<#{tag}(\s[^>]*)?><\/#{tag}>/
    self_closed = /<#{tag}(\s[^>]*)?\/>/
    html.gsub(empty_pair, '').gsub(self_closed, '')
  end
end
#
# Re-cases text that was written entirely in capitals.
#
# The letters left after strip_tags are inspected: with fewer than five
# letters, or with any lower-case letter present, +data+ is returned
# unchanged. Otherwise every run of text outside of tags is passed through
# fix_case_inner.
#
# NOTE(review): strip_tags is not defined in the visible part of this
# file - confirm it exists before relying on this method.
#
# The pattern no longer carries the "s" flag: in Ruby that flag selects
# Windows-31J encoding (it is not PHP's dotall) and made the substitution
# raise on non-ASCII UTF-8 input.
#
def fix_case(data)
  letters = strip_tags(data).gsub(/[^a-zA-Z]/, '')
  return data if letters.size < 5
  return data if letters =~ /[a-z]/

  data.gsub(/(>|^)([^<]+?)(<|$)/) do
    before, text, after = $1, $2, $3
    strip_single(before) + fix_case_inner(strip_single(text)) + strip_single(after)
  end
end
#
# Lower-cases +data+ and then capitalises the first letter of the text and
# every letter that follows a character outside the set of word
# characters, whitespace, "'", ";", ",", "\" and "-" (optionally separated
# from it by whitespace). Returns a new string.
#
def fix_case_inner(data)
  lowered = data.downcase
  lowered.gsub(/(^|[^\w\s\';,\\-])(\s*)([a-z])/) do
    lead = strip_single("#{$1}#{$2}")
    lead + strip_single($3).upcase
  end
end
#
# Validates entities and cleans up text outside of tags. Works on a copy
# of +data+ and returns it.
#
# Pass 1: every "&..." run (up to, but not including, the next ";" or "&"
# or the end of the line) is replaced by what check_entity returns for the
# text after the "&" and the terminator that follows it.
#
# Pass 2: each stretch of text outside of tags is run through
# strip_single.
#
# The second pattern no longer carries the "s" flag: in Ruby that flag
# selects Windows-31J encoding (it is not PHP's dotall) and made the
# substitution raise on non-ASCII UTF-8 input.
#
def validate_entities(data)
  data = data.dup
  # validate entities throughout the string
  data.gsub!(/&([^&;]*)(?=(;|&|$))/) do
    preamble, term = $1, $2
    check_entity(strip_single(preamble), strip_single(term))
  end
  # validate quotes outside of tags
  data.gsub!(/(>|^)([^<]+?)(<|$)/) do
    m1, m2, m3 = $1, $2, $3
    # NOTE(review): the chained gsub only matters for a \" that
    # strip_single itself produced (from \\"); if text quotes were meant
    # to become &quot; here, confirm before changing the output.
    strip_single(m1) +
      strip_single(m2).gsub('\"', '"') +
      strip_single(m3)
  end
  data
end
#
# Decides how one "&..." run is written back.
#
# preamble:: the text that followed the "&".
# term::     the terminator found after it; ";" for a finished entity.
#
# A finished entity that is_valid_entity accepts is kept ("&" + preamble;
# the ";" itself is not part of the replaced text). In every other case
# the ampersand is escaped as "&amp;" so the run is shown literally.
#
# Previously all three branches returned "&" + preamble, which made the
# validation a no-op.
#
def check_entity(preamble, term)
  return "&#{preamble}" if term == ';' && is_valid_entity(preamble)

  "&amp;#{preamble}"
end
#
# Tells whether +entity+ (the text between "&" and ";") may be kept.
#
# * "#" followed by decimal digits: true when the number is above 127,
#   otherwise the value of +allow_numbered_entities+.
# * a name listed in +allowed_entities+: true.
# * anything else: nil.
#
# The pattern is anchored with \A and \z. With ^ and $ (line anchors in
# Ruby) a value such as "#65\nanything" passed as a numbered entity
# because only one of its lines had to match.
#
def is_valid_entity(entity)
  if (md = /\A#([0-9]+)\z/.match(entity))
    return true if md[1].to_i > 127
    return allow_numbered_entities
  end
  return true if allowed_entities.include?(entity)

  nil
end
# Within attributes, all decimal, hexadecimal and URL escape sequences are
# handed to the decode_*_entity helpers so that stray quotes or brackets
# hidden behind an escape can be checked. The three substitutions run in
# the order decimal, hexadecimal, URL escape; the result is then passed
# through validate_entities and returned. The argument is not modified.
def decode_entities(data)
  passes = [
    [/(&)#(\d+);?/,            :decode_dec_entity],
    [/(&)#x([0-9a-f]+);?/i,    :decode_hex_entity],
    [/(%)([0-9a-f]{2});?/i,    :decode_hex_entity]
  ]
  decoded = passes.inject(data.dup) do |text, (pattern, decoder)|
    text.gsub(pattern) { send(decoder, $1, $2) }
  end
  validate_entities(decoded)
end
#
# Decodes one hexadecimal escape.
#
# m:: the escape type ("&" or "%") followed by the hex digits; a leading
#     whole-match element, as in the PHP original, is also accepted.
#
# The digits are parsed as base 16 and handed to decode_num_entity.
# Previously the arguments were read one position too far (m[1], m[2])
# for the two-argument call made by decode_entities, and the digits were
# converted with to_i.to_s(16), which formats rather than parses hex - so
# every escape decoded to character 0.
#
def decode_hex_entity(*m)
  orig_type, digits = m.last(2)
  decode_num_entity(orig_type, digits.to_i(16))
end
#
# Decodes one decimal escape.
#
# m:: the escape type ("&") followed by the decimal digits; a leading
#     whole-match element, as in the PHP original, is also accepted.
#
# Hands type and digits to decode_num_entity. Previously the arguments
# were read one position too far (m[1], m[2]) for the two-argument call
# made by decode_entities, so the digits were passed as the type and the
# number was always nil.
#
def decode_dec_entity(*m)
  orig_type, digits = m.last(2)
  decode_num_entity(orig_type, digits)
end
#
# Turns a character number back into text.
#
# orig_type:: "&" for a numeric entity, "%" for a URL escape.
# d::         the character number (anything that responds to to_i).
#
# Numbers above 127 are not decoded: they are written back in their
# original notation ("%" + hex for URL escapes, "&#" + decimal + ";" for
# entities). Everything else is converted to the character itself and
# passed through escape_special_chars, so a decoded quote or bracket comes
# out as an entity.
#
# The entity form previously returned the literal text "{d};" because the
# "&#" prefix and the interpolation marker were missing.
#
def decode_num_entity(orig_type, d)
  d = d.to_i
  d = 32 if d < 0 # space
  # don't mess with high chars
  if d > 127
    return '%' + d.to_s(16) if orig_type == '%'
    return '&#' + d.to_s + ';' if orig_type == '&'
  end
  escape_special_chars(d.chr)
end
#
# Undoes two backslash escapes in +data+, in this order: a backslash
# followed by a double quote becomes a plain double quote, and a backslash
# followed by the digit 0 becomes the character with code 0. Returns a new
# string.
#
def strip_single(data)
  escapes = [['\"', '"'], ['\0', 0.chr]]
  escapes.inject(data) { |text, (escaped, raw)| text.gsub(escaped, raw) }
end
# Certain characters have special significance in HTML, and
# should be represented by HTML entities if they are to
# preserve their meanings. This function returns a string
# with some of these conversions made; the translations made
# are those most useful for everyday web programming.
def escape_special_chars(data)
data = data.dup
data.gsub! /&/n , '&'
data.gsub! /\"/n , '"'
data.gsub! />/n , '>'
data.gsub! / false )
assert_equal( h1.object_id, h2.object_id )
assert_not_equal( h1.object_id, h3.object_id )
end
def test_multiton_with_options
h1 = HtmlFilter.new( :strip_comments => false )
h2 = HtmlFilter.new( :strip_comments => false )
h3 = HtmlFilter.new
assert_equal( h1.object_id, h2.object_id )
assert_not_equal( h1.object_id, h3.object_id )
end
def test_strip_single
hf = HtmlFilter.new
assert_equal( '"', hf.send(:strip_single,'\"') )
assert_equal( "\000", hf.send(:strip_single,'\0') )
end
# functional tests
def assert_filter(filtered, original)
assert_equal(filtered, original.html_filter)
end
def test_fix_quotes
assert_filter '', ""
end
def test_basics
assert_filter '', ''
assert_filter 'hello', 'hello'
end
def test_balancing_tags
assert_filter "hello", "<hello"
assert_filter "hello", ">hello"
assert_filter "hello", "hello<"
assert_filter "hello", "hello>"
assert_filter "", "<>"
end
def test_tag_completion
assert_filter "hello", "hello"
assert_filter "hello", "hello"
assert_filter "helloworld", "helloworld"
assert_filter "hello", "hello"
assert_filter "hello", "hello"
assert_filter "helloworld", "helloworld"
assert_filter "hello", "hello"
assert_filter "", ""
end
def test_end_slashes
assert_filter '', ''
assert_filter '', ''
assert_filter '', ''
end
end
=end