# The YUI compressor is only needed when JavaScript/CSS compression is enabled
# and no custom compressor object is supplied, so a missing gem must not stop
# this file from loading. The require is retried (and allowed to fail loudly)
# at the point where a default YUI compressor is actually built.
begin
  require "yui/compressor"
rescue LoadError
  # Deferred: see Compressor#default_javascript_compressor / #default_css_compressor.
end

module HtmlCompressor

  # Minifies HTML by removing comments and redundant whitespace and, optionally,
  # by simplifying attributes and compressing inline JavaScript/CSS.
  #
  # Content that must not be touched (<pre>, <textarea>, <script>, <style>,
  # inline event handlers, conditional comments, skip blocks and user supplied
  # patterns) is swapped for temporary placeholders before the markup is
  # processed and put back afterwards.
  class Compressor

    JS_COMPRESSOR_YUI = "yui"
    JS_COMPRESSOR_CLOSURE = "closure"

    # Predefined pattern that matches <?php ... ?> tags.
    # Can be passed inside the array given to the :preserve_patterns option.
    PHP_TAG_PATTERN = /<\?php.*?\?>/im

    # Predefined pattern that matches <% ... %> tags.
    # Can be passed inside the array given to the :preserve_patterns option.
    SERVER_SCRIPT_TAG_PATTERN = /<%.*?%>/m

    # Predefined pattern that matches <!--# ... --> tags (server side includes).
    # Can be passed inside the array given to the :preserve_patterns option.
    SERVER_SIDE_INCLUDE_PATTERN = /<!--\s*#.*?-->/m

    # Predefined list of tags that are very likely to be block-level.
    # Can be passed as the :remove_surrounding_spaces option.
    BLOCK_TAGS_MIN = "html,head,body,br,p"

    # Predefined list of tags that are block-level by default, excluding <div>
    # and <li> tags. Table tags are also included.
    # Can be passed as the :remove_surrounding_spaces option.
    BLOCK_TAGS_MAX = BLOCK_TAGS_MIN + ",h1,h2,h3,h4,h5,h6,blockquote,center,dl,fieldset,form,frame,frameset,hr,noframes,ol,table,tbody,tr,td,th,tfoot,thead,ul"

    # Can be passed as the :remove_surrounding_spaces option to remove the
    # spaces surrounding every tag (not recommended).
    ALL_TAGS = "all"

    # temp replacements for preserved blocks
    TEMP_COND_COMMENT_BLOCK = "%%%~COMPRESS~COND~{0,number,#}~%%%"
    TEMP_PRE_BLOCK = "%%%~COMPRESS~PRE~{0,number,#}~%%%"
    TEMP_TEXT_AREA_BLOCK = "%%%~COMPRESS~TEXTAREA~{0,number,#}~%%%"
    TEMP_SCRIPT_BLOCK = "%%%~COMPRESS~SCRIPT~{0,number,#}~%%%"
    TEMP_STYLE_BLOCK = "%%%~COMPRESS~STYLE~{0,number,#}~%%%"
    TEMP_EVENT_BLOCK = "%%%~COMPRESS~EVENT~{0,number,#}~%%%"
    TEMP_LINE_BREAK_BLOCK = "%%%~COMPRESS~LT~{0,number,#}~%%%"
    TEMP_SKIP_BLOCK = "%%%~COMPRESS~SKIP~{0,number,#}~%%%"
    TEMP_USER_BLOCK = "%%%~COMPRESS~USER{0,number,#}~{1,number,#}~%%%"

    # compiled regex patterns
    # (Regexp::MULTILINE makes "." match newlines.)
    EMPTY_PATTERN = Regexp.new("\\s")
    SKIP_PATTERN = Regexp.new("<!--\\s*\\{\\{\\{\\s*-->(.*?)<!--\\s*\\}\\}\\}\\s*-->", Regexp::MULTILINE | Regexp::IGNORECASE)
    COND_COMMENT_PATTERN = Regexp.new("(<!(?:--)?\\[[^\\]]+?\\]>)(.*?)(<!\\[[^\\]]+\\]-->)", Regexp::MULTILINE | Regexp::IGNORECASE)
    COMMENT_PATTERN = Regexp.new("<!---->|<!--[^\\[].*?-->", Regexp::MULTILINE | Regexp::IGNORECASE)
    INTERTAG_PATTERN_TAG_TAG = Regexp.new(">\\s+<", Regexp::MULTILINE | Regexp::IGNORECASE)
    INTERTAG_PATTERN_TAG_CUSTOM = Regexp.new(">\\s+%%%~", Regexp::MULTILINE | Regexp::IGNORECASE)
    INTERTAG_PATTERN_CUSTOM_TAG = Regexp.new("~%%%\\s+<", Regexp::MULTILINE | Regexp::IGNORECASE)
    INTERTAG_PATTERN_CUSTOM_CUSTOM = Regexp.new("~%%%\\s+%%%~", Regexp::MULTILINE | Regexp::IGNORECASE)
    MULTISPACE_PATTERN = Regexp.new("\\s+", Regexp::MULTILINE | Regexp::IGNORECASE)
    TAG_END_SPACE_PATTERN = Regexp.new("(<(?:[^>]+?))(?:\\s+?)(/?>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    TAG_LAST_UNQUOTED_VALUE_PATTERN = Regexp.new("=\\s*[a-z0-9\\-_]+$", Regexp::IGNORECASE)
    TAG_QUOTE_PATTERN = Regexp.new("\\s*=\\s*([\"'])([a-z0-9\\-_]+?)\\1(/?)(?=[^<]*?>)", Regexp::IGNORECASE)
    PRE_PATTERN = Regexp.new("(<pre[^>]*?>)(.*?)(</pre>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    TA_PATTERN = Regexp.new("(<textarea[^>]*?>)(.*?)(</textarea>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    SCRIPT_PATTERN = Regexp.new("(<script[^>]*?>)(.*?)(</script>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    STYLE_PATTERN = Regexp.new("(<style[^>]*?>)(.*?)(</style>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    TAG_PROPERTY_PATTERN = Regexp.new("(\\s\\w+)\\s*=\\s*(?=[^<]*?>)", Regexp::IGNORECASE)
    CDATA_PATTERN = Regexp.new("\\s*<!\\[CDATA\\[(.*?)\\]\\]>\\s*", Regexp::MULTILINE | Regexp::IGNORECASE)
    DOCTYPE_PATTERN = Regexp.new("<!DOCTYPE[^>]*>", Regexp::MULTILINE | Regexp::IGNORECASE)
    TYPE_ATTR_PATTERN = Regexp.new("type\\s*=\\s*([\\\"']*)(.+?)\\1", Regexp::MULTILINE | Regexp::IGNORECASE)
    JS_TYPE_ATTR_PATTERN = Regexp.new("(<script[^>]*)type\\s*=\\s*([\"']*)(?:text|application)\/javascript\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    JS_LANG_ATTR_PATTERN = Regexp.new("(<script[^>]*)language\\s*=\\s*([\"']*)javascript\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    STYLE_TYPE_ATTR_PATTERN = Regexp.new("(<style[^>]*)type\\s*=\\s*([\"']*)text/style\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    LINK_TYPE_ATTR_PATTERN = Regexp.new("(<link[^>]*)type\\s*=\\s*([\"']*)text/(?:css|plain)\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    LINK_REL_ATTR_PATTERN = Regexp.new("<link(?:[^>]*)rel\\s*=\\s*([\"']*)(?:alternate\\s+)?stylesheet\\1(?:[^>]*)>", Regexp::MULTILINE | Regexp::IGNORECASE)
    FORM_METHOD_ATTR_PATTERN = Regexp.new("(<form[^>]*)method\\s*=\\s*([\"']*)get\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    INPUT_TYPE_ATTR_PATTERN = Regexp.new("(<input[^>]*)type\\s*=\\s*([\"']*)text\\2([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    BOOLEAN_ATTR_PATTERN = Regexp.new("(<\\w+[^>]*)(checked|selected|disabled|readonly)\\s*=\\s*([\"']*)\\w*\\3([^>]*>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    EVENT_JS_PROTOCOL_PATTERN = Regexp.new("^javascript:\\s*(.+)", Regexp::MULTILINE | Regexp::IGNORECASE)
    HTTP_PROTOCOL_PATTERN = Regexp.new("(<[^>]+?(?:href|src|cite|action)\\s*=\\s*['\"])http:(//[^>]+?>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    HTTPS_PROTOCOL_PATTERN = Regexp.new("(<[^>]+?(?:href|src|cite|action)\\s*=\\s*['\"])https:(//[^>]+?>)", Regexp::MULTILINE | Regexp::IGNORECASE)
    REL_EXTERNAL_PATTERN = Regexp.new("<(?:[^>]*)rel\\s*=\\s*([\"']*)(?:alternate\\s+)?external\\1(?:[^>]*)>", Regexp::MULTILINE | Regexp::IGNORECASE)
    # unmasked: \son[a-z]+\s*=\s*"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"
    EVENT_PATTERN1 = Regexp.new("(\\son[a-z]+\\s*=\\s*\")([^\"\\\\\\r\\n]*(?:\\\\.[^\"\\\\\\r\\n]*)*)(\")", Regexp::IGNORECASE)
    EVENT_PATTERN2 = Regexp.new("(\\son[a-z]+\\s*=\\s*')([^'\\\\\\r\\n]*(?:\\\\.[^'\\\\\\r\\n]*)*)(')", Regexp::IGNORECASE)
    LINE_BREAK_PATTERN = Regexp.new("(?:\\p{Blank}*(\\r?\\n)\\p{Blank}*)+")
    SURROUNDING_SPACES_MIN_PATTERN = Regexp.new("\\s*(</?(?:" + BLOCK_TAGS_MIN.gsub(",", "|") + ")(?:>|[\\s/][^>]*>))\\s*", Regexp::MULTILINE | Regexp::IGNORECASE)
    SURROUNDING_SPACES_MAX_PATTERN = Regexp.new("\\s*(</?(?:" + BLOCK_TAGS_MAX.gsub(",", "|") + ")(?:>|[\\s/][^>]*>))\\s*", Regexp::MULTILINE | Regexp::IGNORECASE)
    SURROUNDING_SPACES_ALL_PATTERN = Regexp.new("\\s*(<[^>]+>)\\s*", Regexp::MULTILINE | Regexp::IGNORECASE)

    # patterns for searching for temporary replacements
    TEMP_COND_COMMENT_PATTERN = Regexp.new("%%%~COMPRESS~COND~(\\d+?)~%%%")
    TEMP_PRE_PATTERN = Regexp.new("%%%~COMPRESS~PRE~(\\d+?)~%%%")
    TEMP_TEXT_AREA_PATTERN = Regexp.new("%%%~COMPRESS~TEXTAREA~(\\d+?)~%%%")
    TEMP_SCRIPT_PATTERN = Regexp.new("%%%~COMPRESS~SCRIPT~(\\d+?)~%%%")
    TEMP_STYLE_PATTERN = Regexp.new("%%%~COMPRESS~STYLE~(\\d+?)~%%%")
    TEMP_EVENT_PATTERN = Regexp.new("%%%~COMPRESS~EVENT~(\\d+?)~%%%")
    TEMP_SKIP_PATTERN = Regexp.new("%%%~COMPRESS~SKIP~(\\d+?)~%%%")
    TEMP_LINE_BREAK_PATTERN = Regexp.new("%%%~COMPRESS~LT~(\\d+?)~%%%")

    DEFAULT_OPTIONS = {
      :enabled => true,

      # default settings
      :remove_comments => true,
      :remove_multi_spaces => true,

      # optional settings
      :remove_intertag_spaces => false,
      :remove_quotes => false,
      :compress_javascript => false,
      :compress_css => false,
      :simple_doctype => false,
      :remove_script_attributes => false,
      :remove_style_attributes => false,
      :remove_link_attributes => false,
      :remove_form_attributes => false,
      :remove_input_attributes => false,
      :simple_boolean_attributes => false,
      :remove_javascript_protocol => false,
      :remove_http_protocol => false,
      :remove_https_protocol => false,
      :preserve_line_breaks => false,
      :remove_surrounding_spaces => nil,

      :preserve_patterns => nil,
      :javascript_compressor => nil,
      :css_compressor => nil
    }

    # Creates a compressor.
    #
    # options - Hash overriding any key of DEFAULT_OPTIONS.
    #           :preserve_patterns is an Array of Regexp; :javascript_compressor
    #           and :css_compressor are objects responding to #compress(String).
    def initialize(options = {})
      @options = DEFAULT_OPTIONS.merge(options)

      # YUICompressor settings
      @yuiCssLineBreak = -1
      @yuiJsLineBreak = nil # read by the default JS compressor; nil = option not passed
      @yuiJsNoMunge = false
      @yuiJsPreserveAllSemiColons = false
      @yuiJsDisableOptimizations = false
    end

    # Compresses the given HTML according to the configured options.
    #
    # html - String with the markup to compress (may be nil).
    #
    # Returns the compressed String; the input is returned untouched when the
    # compressor is disabled or the input is nil/empty.
    def compress html
      return html if !@options[:enabled] || html.nil? || html.length == 0

      # preserved block containers
      condCommentBlocks = []
      preBlocks = []
      taBlocks = []
      scriptBlocks = []
      styleBlocks = []
      eventBlocks = []
      skipBlocks = []
      lineBreakBlocks = []
      userBlocks = []

      # preserve blocks
      html = preserve_blocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)

      # process pure html
      html = process_html(html)

      # process preserved blocks
      process_preserved_blocks(preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)

      # put preserved blocks back
      return_blocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)
    end

    private

    # Replaces every region that must survive untouched with a numbered
    # placeholder and stores the original content in the matching array.
    # Empty regions are left in the markup as they are (never deleted).
    #
    # Returns the HTML with placeholders in place.
    def preserve_blocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)

      # preserve user blocks
      preservePatterns = @options[:preserve_patterns]
      unless preservePatterns.nil?
        preservePatterns.each_with_index do |preservePattern, i|
          userBlock = []
          html = html.gsub(preservePattern) do |match|
            if match.strip.length > 0
              userBlock << match
              message_format(TEMP_USER_BLOCK, i, userBlock.size - 1)
            else
              match
            end
          end
          userBlocks << userBlock
        end
      end

      # preserve skip blocks (the whole match, markers included, is kept)
      html = html.gsub(SKIP_PATTERN) do |match|
        if Regexp.last_match(1).strip.length > 0
          skipBlocks << match
          message_format(TEMP_SKIP_BLOCK, skipBlocks.size - 1)
        else
          match
        end
      end

      # preserve conditional comments; their content is compressed recursively
      condCommentCompressor = self.clone
      html = html.gsub(COND_COMMENT_PATTERN) do |match|
        m = Regexp.last_match
        opening, content, closing = m[1], m[2], m[3]
        if content.strip.length > 0
          condCommentBlocks << (opening + condCommentCompressor.compress(content) + closing)
          message_format(TEMP_COND_COMMENT_BLOCK, condCommentBlocks.size - 1)
        else
          match
        end
      end

      # preserve inline events (double quoted first, then single quoted;
      # both share the same block list and numbering)
      [EVENT_PATTERN1, EVENT_PATTERN2].each do |eventPattern|
        html = preserve_wrapped_blocks(html, eventPattern, TEMP_EVENT_BLOCK, eventBlocks)
      end

      # preserve PRE tags
      html = preserve_wrapped_blocks(html, PRE_PATTERN, TEMP_PRE_BLOCK, preBlocks)

      # preserve SCRIPT tags
      html = html.gsub(SCRIPT_PATTERN) do |match|
        m = Regexp.last_match
        opening, content, closing = m[1], m[2], m[3]

        # ignore empty scripts
        if content.strip.length > 0
          # check type
          type = ""
          if opening =~ TYPE_ATTR_PATTERN
            type = Regexp.last_match(2).downcase
          end

          if type.length == 0 or type == 'text/javascript' or type == 'application/javascript'
            # javascript block, preserve and compress with js compressor
            scriptBlocks << content
            opening + message_format(TEMP_SCRIPT_BLOCK, scriptBlocks.size - 1) + closing
          elsif type == 'text/x-jquery-tmpl'
            # jquery template, ignore so it gets compressed with the rest of html
            match
          else
            # some custom script, preserve it inside "skip blocks" so it won't
            # be compressed with js compressor
            skipBlocks << content
            opening + message_format(TEMP_SKIP_BLOCK, skipBlocks.size - 1) + closing
          end
        else
          match
        end
      end

      # preserve STYLE tags
      html = preserve_wrapped_blocks(html, STYLE_PATTERN, TEMP_STYLE_BLOCK, styleBlocks)

      # preserve TEXTAREA tags
      html = preserve_wrapped_blocks(html, TA_PATTERN, TEMP_TEXT_AREA_BLOCK, taBlocks)

      # preserve line breaks
      if @options[:preserve_line_breaks]
        html = html.gsub(LINE_BREAK_PATTERN) do
          lineBreakBlocks << Regexp.last_match(1)
          message_format(TEMP_LINE_BREAK_BLOCK, lineBreakBlocks.size - 1)
        end
      end

      html
    end

    # Shared implementation for patterns of the form (opening)(content)(closing):
    # non-blank content is stored in +blocks+ and replaced by a placeholder
    # built from +template+; blank content leaves the match unchanged.
    def preserve_wrapped_blocks(html, pattern, template, blocks)
      html.gsub(pattern) do |match|
        m = Regexp.last_match
        if m[2].strip.length > 0
          blocks << m[2]
          m[1] + message_format(template, blocks.size - 1) + m[3]
        else
          match
        end
      end
    end

    # Puts the preserved content back in place of the placeholders.
    # The order is the reverse of preservation so that placeholders nested
    # inside restored content are resolved by a later step.
    #
    # Returns the final HTML.
    def return_blocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)

      # put line breaks back
      if @options[:preserve_line_breaks]
        html = restore_blocks(html, TEMP_LINE_BREAK_PATTERN, lineBreakBlocks)
      end

      # put TEXTAREA blocks back
      html = restore_blocks(html, TEMP_TEXT_AREA_PATTERN, taBlocks)

      # put STYLE blocks back
      html = restore_blocks(html, TEMP_STYLE_PATTERN, styleBlocks)

      # put SCRIPT blocks back
      html = restore_blocks(html, TEMP_SCRIPT_PATTERN, scriptBlocks)

      # put PRE blocks back
      html = restore_blocks(html, TEMP_PRE_PATTERN, preBlocks)

      # put event blocks back
      html = restore_blocks(html, TEMP_EVENT_PATTERN, eventBlocks)

      # put conditional comments back
      html = restore_blocks(html, TEMP_COND_COMMENT_PATTERN, condCommentBlocks)

      # put skip blocks back
      html = restore_blocks(html, TEMP_SKIP_PATTERN, skipBlocks)

      # put user blocks back
      unless @options[:preserve_patterns].nil?
        @options[:preserve_patterns].each_with_index do |_preservePattern, p|
          tempUserPattern = Regexp.new("%%%~COMPRESS~USER#{p}~(\\d+?)~%%%")
          html = restore_blocks(html, tempUserPattern, userBlocks[p] || [])
        end
      end

      html
    end

    # Replaces every placeholder matched by +pattern+ (capture 1 = block index)
    # with the stored block; unknown indexes become an empty string.
    # The block form of gsub inserts the text literally, so backslashes in the
    # preserved content are not interpreted.
    def restore_blocks(html, pattern, blocks)
      html.gsub(pattern) do
        blocks[Regexp.last_match(1).to_i].to_s
      end
    end

    # Runs the optional compressors over the preserved blocks, in place.
    # Only script, style and event blocks are processed; the rest is kept verbatim.
    def process_preserved_blocks(preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)
      # processPreBlocks(preBlocks)
      # processTextAreaBlocks(taBlocks)
      process_script_blocks(scriptBlocks)
      process_style_blocks(styleBlocks)
      process_event_blocks(eventBlocks)
      # processCondCommentBlocks(condCommentBlocks)
      # processSkipBlocks(skipBlocks)
      # processUserBlocks(userBlocks)
      # processLineBreakBlocks(lineBreakBlocks)
    end

    # Compresses every preserved script in place when :compress_javascript is set.
    def process_script_blocks(scriptBlocks)
      if @options[:compress_javascript]
        scriptBlocks.map! do |block|
          compress_javascript(block)
        end
      end
    end

    # Compresses every preserved style sheet in place when :compress_css is set.
    def process_style_blocks(styleBlocks)
      if @options[:compress_css]
        styleBlocks.map! do |block|
          compress_css_styles(block)
        end
      end
    end

    # Strips the "javascript:" prefix from every preserved inline event handler
    # when :remove_javascript_protocol is set.
    def process_event_blocks(eventBlocks)
      if @options[:remove_javascript_protocol]
        eventBlocks.map! do |block|
          remove_javascript_protocol(block)
        end
      end
    end

    # Compresses a piece of JavaScript with the configured compressor
    # (:javascript_compressor) or, when none is given, with YUI.
    # A surrounding CDATA wrapper is removed before and re-added after compression.
    #
    # Returns the compressed source, stripped of leading/trailing whitespace.
    def compress_javascript(source)
      # set default javascript compressor
      javaScriptCompressor = @options[:javascript_compressor]
      if javaScriptCompressor.nil?
        javaScriptCompressor = default_javascript_compressor
      end

      # detect CDATA wrapper
      cdataWrapper = false
      if source =~ CDATA_PATTERN
        cdataWrapper = true
        source = Regexp.last_match(1)
      end

      result = javaScriptCompressor.compress(source).strip

      if cdataWrapper
        result = "<![CDATA[" + result + "]]>"
      end

      result
    end

    # Builds the default YUI JavaScript compressor from the @yuiJs* settings.
    # Raises LoadError when the yui-compressor gem is not available.
    def default_javascript_compressor
      require "yui/compressor"
      YUI::JavaScriptCompressor.new(
        :munge => !@yuiJsNoMunge,
        :preserve_semicolons => @yuiJsPreserveAllSemiColons,
        :optimize => !@yuiJsDisableOptimizations,
        :line_break => @yuiJsLineBreak
      )
    end

    # Compresses a piece of CSS with the configured compressor
    # (:css_compressor) or, when none is given, with YUI.
    # A surrounding CDATA wrapper is removed before and re-added after compression.
    #
    # Returns the compressed style sheet.
    def compress_css_styles(source)
      # set default css compressor
      cssCompressor = @options[:css_compressor]
      if cssCompressor.nil?
        cssCompressor = default_css_compressor
      end

      # detect CDATA wrapper
      cdataWrapper = false
      if source =~ CDATA_PATTERN
        cdataWrapper = true
        source = Regexp.last_match(1)
      end

      result = cssCompressor.compress(source)

      if cdataWrapper
        result = "<![CDATA[" + result + "]]>"
      end

      result
    end

    # Builds the default YUI CSS compressor.
    # Raises LoadError when the yui-compressor gem is not available.
    def default_css_compressor
      require "yui/compressor"
      YUI::CssCompressor.new(:line_break => @yuiCssLineBreak)
    end

    # Removes a leading "javascript:" from an inline event handler.
    def remove_javascript_protocol(source)
      # remove javascript: from inline events
      source.sub(EVENT_JS_PROTOCOL_PATTERN, '\1')
    end

    # Applies every markup-level transformation, in a fixed order, to HTML in
    # which the protected regions have already been replaced by placeholders.
    #
    # Returns the processed HTML with leading/trailing whitespace stripped.
    def process_html(html)

      # remove comments
      html = remove_comments(html)

      # simplify doctype
      html = simple_doctype(html)

      # remove script attributes
      html = remove_script_attributes(html)

      # remove style attributes
      html = remove_style_attributes(html)

      # remove link attributes
      html = remove_link_attributes(html)

      # remove form attributes
      html = remove_form_attributes(html)

      # remove input attributes
      html = remove_input_attributes(html)

      # simplify boolean attributes
      html = simple_boolean_attributes(html)

      # remove http from attributes
      html = remove_http_protocol(html)

      # remove https from attributes
      html = remove_https_protocol(html)

      # remove inter-tag spaces
      html = remove_intertag_spaces(html)

      # remove multi whitespace characters
      html = remove_multi_spaces(html)

      # remove spaces around equals sign and ending spaces
      html = remove_spaces_inside_tags(html)

      # remove quotes from tag attributes
      html = remove_quotes_inside_tags(html)

      # remove surrounding spaces
      html = remove_surrounding_spaces(html)

      html.strip
    end

    # Removes HTML comments when :remove_comments is set.
    def remove_comments(html)
      if @options[:remove_comments]
        html = html.gsub(COMMENT_PATTERN, '')
      end

      html
    end

    # Replaces any doctype with the HTML5 one when :simple_doctype is set.
    def simple_doctype(html)
      if @options[:simple_doctype]
        html = html.gsub(DOCTYPE_PATTERN, '<!DOCTYPE html>')
      end

      html
    end

    # Removes default type/language attributes from <script> tags when
    # :remove_script_attributes is set.
    def remove_script_attributes(html)
      if @options[:remove_script_attributes]
        # remove type from script tags
        html = html.gsub(JS_TYPE_ATTR_PATTERN, '\1\3')

        # remove language from script tags
        html = html.gsub(JS_LANG_ATTR_PATTERN, '\1\3')
      end

      html
    end

    # Removes the default type attribute from <style> tags when
    # :remove_style_attributes is set.
    def remove_style_attributes(html)
      if @options[:remove_style_attributes]
        html = html.gsub(STYLE_TYPE_ATTR_PATTERN, '\1\3')
      end

      html
    end

    # Removes the type attribute from <link rel=stylesheet> tags when
    # :remove_link_attributes is set.
    def remove_link_attributes(html)
      if @options[:remove_link_attributes]
        html = html.gsub(LINK_TYPE_ATTR_PATTERN) do |match|
          m = Regexp.last_match
          before, after = m[1], m[3]
          # only links that are style sheets lose their type
          if match =~ LINK_REL_ATTR_PATTERN
            before + after
          else
            match
          end
        end
      end

      html
    end

    # Removes method="get" from <form> tags when :remove_form_attributes is set.
    def remove_form_attributes(html)
      if @options[:remove_form_attributes]
        html = html.gsub(FORM_METHOD_ATTR_PATTERN, '\1\3')
      end

      html
    end

    # Removes type="text" from <input> tags when :remove_input_attributes is set.
    def remove_input_attributes(html)
      if @options[:remove_input_attributes]
        html = html.gsub(INPUT_TYPE_ATTR_PATTERN, '\1\3')
      end

      html
    end

    # Turns http://host into //host inside href/src/cite/action attributes when
    # :remove_http_protocol is set. Tags marked rel=external are left alone.
    def remove_http_protocol(html)
      if @options[:remove_http_protocol]
        html = strip_protocol(html, HTTP_PROTOCOL_PATTERN)
      end

      html
    end

    # Turns https://host into //host inside href/src/cite/action attributes when
    # :remove_https_protocol is set. Tags marked rel=external are left alone.
    def remove_https_protocol(html)
      if @options[:remove_https_protocol]
        html = strip_protocol(html, HTTPS_PROTOCOL_PATTERN)
      end

      html
    end

    # Drops the protocol matched between captures 1 and 2 of +pattern+, except
    # in tags carrying rel=external.
    def strip_protocol(html, pattern)
      html.gsub(pattern) do |match|
        m = Regexp.last_match
        before, after = m[1], m[2]
        if match =~ REL_EXTERNAL_PATTERN
          match
        else
          "#{before}#{after}"
        end
      end
    end

    # Removes whitespace between tags and/or placeholders when
    # :remove_intertag_spaces is set.
    def remove_intertag_spaces(html)
      if @options[:remove_intertag_spaces]
        html = html.gsub(INTERTAG_PATTERN_TAG_TAG, '><')
        html = html.gsub(INTERTAG_PATTERN_TAG_CUSTOM, '>%%%~')
        html = html.gsub(INTERTAG_PATTERN_CUSTOM_TAG, '~%%%<')
        html = html.gsub(INTERTAG_PATTERN_CUSTOM_CUSTOM, '~%%%%%%~')
      end

      html
    end

    # Removes spaces around "=" in attributes and trailing spaces before ">".
    # This step is always applied.
    def remove_spaces_inside_tags(html)
      # remove spaces around equals sign inside tags
      html = html.gsub(TAG_PROPERTY_PATTERN, '\1=')

      # remove ending spaces inside tags
      html.gsub(TAG_END_SPACE_PATTERN) do
        m = Regexp.last_match
        head, tail = m[1], m[2]

        # keep space if attribute value is unquoted before trailing slash,
        # otherwise the slash would become part of the value
        if tail.start_with?("/") and (TAG_LAST_UNQUOTED_VALUE_PATTERN =~ head)
          "#{head} #{tail}"
        else
          "#{head}#{tail}"
        end
      end
    end

    # Removes quotes around simple attribute values when :remove_quotes is set.
    def remove_quotes_inside_tags(html)
      if @options[:remove_quotes]
        html = html.gsub(TAG_QUOTE_PATTERN) do
          m = Regexp.last_match
          value, slash = m[2], m[3]
          # if quoted attribute is followed by "/" add extra space
          if slash.strip.length == 0
            "=#{value}"
          else
            "=#{value} #{slash}"
          end
        end
      end

      html
    end

    # Collapses every run of whitespace into a single space when
    # :remove_multi_spaces is set.
    def remove_multi_spaces(html)
      if @options[:remove_multi_spaces]
        html = html.gsub(MULTISPACE_PATTERN, ' ')
      end

      html
    end

    # Turns checked="checked" style attributes into their bare form when
    # :simple_boolean_attributes is set.
    def simple_boolean_attributes(html)
      if @options[:simple_boolean_attributes]
        html = html.gsub(BOOLEAN_ATTR_PATTERN, '\1\2\4')
      end

      html
    end

    # Removes the whitespace surrounding the tags named in
    # :remove_surrounding_spaces (BLOCK_TAGS_MIN, BLOCK_TAGS_MAX, ALL_TAGS or a
    # custom comma separated list of tag names).
    def remove_surrounding_spaces(html)
      tags = @options[:remove_surrounding_spaces]
      unless tags.nil?
        pattern = case tags.downcase
          when BLOCK_TAGS_MIN
            SURROUNDING_SPACES_MIN_PATTERN
          when BLOCK_TAGS_MAX
            SURROUNDING_SPACES_MAX_PATTERN
          when ALL_TAGS
            SURROUNDING_SPACES_ALL_PATTERN
          else
            Regexp.new("\\s*(</?(?:" + tags.gsub(",", "|") + ")(?:>|[\\s/][^>]*>))\\s*", Regexp::MULTILINE | Regexp::IGNORECASE)
        end

        html = html.gsub(pattern, '\1')
      end

      html
    end

    # Minimal stand-in for Java's MessageFormat: every "{N,number,#}" in
    # +message+ is replaced by the N-th element of +params+.
    def message_format(message, *params)
      message.gsub(/\{(\d+),number,#\}/) do
        params[Regexp.last_match(1).to_i].to_s
      end
    end

  end

end