-- Copyright (C) 2015 Immunio, Inc.

-- HTML: Simple h5 like HTML lexer for Immun.io.

-- NOTE: not covered by Scintillua MIT license in this directory.

local l = require('lexer')
local token, parent_token, word_match = l.token, l.parent_token, l.word_match
local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V

local M = {_NAME = 'html'}

-- Passed as the third argument to word_match when the style/script tag-name
-- patterns are built.
local case_insensitive_tags = true

-- Whitespace: one or more l.space matches.
local ws = l.space^1

-- This is broad to both accept our placeholders and be very liberal about what may be
-- interpreted as an attribute to ensure we escape attributes fairly aggressively.
-- One or more characters, none of which is '<', '>', '=', a quote, or whitespace.
local element_chars = (l.any - '<' - '>' - '=' - '"' - "'" - ws)^1

-- Comments.
-- NOTE(review): only the tail ")^0 * P('-->'))" of this expression survived in the
-- file; the comment opener and the "anything that is not the terminator" body were
-- rebuilt from that tail. Confirm against the upstream lexer.
local comment = token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->'))

-- IE Conditional Comments.
-- NOTE(review): the hidden-open pattern, the whole hidden-close definition (which
-- condcomment below still references) and the opener of the revealed form were
-- missing. Only the terminators "P(']-->')" and "P('>')" were still present, so the
-- openers and exclusion sets below are a reconstruction. Confirm against upstream.
local ie_condcomment_hidden_open = token(l.COMMENT, P('<!--[') * (l.any - ']>')^0 * P(']>'))
local ie_condcomment_hidden_close = token(l.COMMENT, P('<![') * (l.any - ']-->')^0 * P(']-->'))
local ie_condcomment_revealed = token(l.COMMENT, P('<![') * (l.any - '>')^0 * P('>'))
local condcomment = token('condcomment',
  ie_condcomment_hidden_open + ie_condcomment_hidden_close + ie_condcomment_revealed)

-- Strings: single- or double-quoted ranges as produced by l.delimited_range.
-- NOTE(review): the local name `string` shadows Lua's standard string library for the
-- rest of this file. It is kept because code outside this view may refer to it.
local sq_str = l.delimited_range("'")
local dq_str = l.delimited_range('"')
local string = sq_str + dq_str

-- Attributes. Individual recognition is handled in our XSS processing code.
local attr_name = token('attr_name', element_chars - '=')
local attr_value = token('attr_value', string + element_chars)
-- name=value with no whitespace permitted around the '='.
local attribute = parent_token('attribute', attr_name * '=' * attr_value)

-- Tags.
-- A tag name may not start with '/', which keeps closing tags out of tag_open.
local tag_name = token('tag_name', element_chars - '/')
-- Anything else inside a tag: a run of non-space characters up to the next '>'.
local tag_data = token('tag_data', (l.any - l.space - '>')^1)
-- XXX how should we handle void tags... right now they are an unmatched tag_open
local tag_open = parent_token('tag_open',
  P('<') * tag_name *
  ( (ws * attribute) +
    ( tag_data ) +
    ws )^0 *
  (P('>') + '/>')
)
-- NOTE(review): the body of this pattern was reduced to "P('')" with unbalanced
-- parentheses. It is rebuilt as the minimal "</" name ">" form; whether upstream
-- also tolerated whitespace before '>' cannot be told from this file. Confirm.
local tag_close = parent_token('tag_close', P('</') * tag_name * '>')

-- Special case for script and style tags.
-- <style> element: open tag, raw content, close tag.
-- Unlike the generic tag_open pattern, the open tag here has no bare-whitespace
-- alternative and does not accept '/>', so whitespace directly before '>' prevents a match.
local style_tag_name = token("tag_name", word_match({'style'}, nil, case_insensitive_tags))
local style_tag_open = parent_token("tag_open",
  P('<') * style_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
-- NOTE(review): this pattern had been reduced to "P('')" with a missing closing
-- parenthesis; it is rebuilt as "</" name ">". Confirm against the upstream lexer.
local style_tag_close = parent_token("tag_close", P('</') * style_tag_name * '>')
-- Everything up to (not including) the first position where the close tag matches.
local style_data = token("style_data", (l.any - style_tag_close)^0)
local style_tag = parent_token('style_tag', style_tag_open * style_data * style_tag_close)

-- <script> element: same construction as the <style> element.
local script_tag_name = token("tag_name", word_match({'script'}, nil, case_insensitive_tags))
local script_tag_open = parent_token("tag_open",
  P('<') * script_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
-- NOTE(review): rebuilt in the same way as style_tag_close; confirm against upstream.
local script_tag_close = parent_token("tag_close", P('</') * script_tag_name * '>')
-- Everything up to (not including) the first position where the close tag matches.
local script_data = token("script_data", (l.any - script_tag_close)^0)
local script_tag = parent_token('script_tag', script_tag_open * script_data * script_tag_close)

-- Top level rules

-- Note: the ordering is important here as