# escape.rb - escape/unescape library for several formats # # Copyright (C) 2006,2007 Tanaka Akira # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT # OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY # OF SUCH DAMAGE. # Escape module provides several escape functions. # * URI # * HTML # * shell command module EscapeUtil module_function class StringWrapper class << self alias new_no_dup new def new(str) new_no_dup(str.dup) end end def initialize(str) @str = str end def to_s @str.dup end def inspect "\#<#{self.class}: #{@str}>" end def ==(other) other.class == self.class && @str == other.instance_variable_get(:@str) end alias eql? == def hash @str.hash end end class ShellEscaped < StringWrapper end # Escape.shell_command composes # a sequence of words to # a single shell command line. # All shell meta characters are quoted and # the words are concatenated with interleaving space. # It returns an instance of ShellEscaped. # # Escape.shell_command(["ls", "/"]) #=> # # Escape.shell_command(["echo", "*"]) #=> # # # Note that system(*command) and # system(Escape.shell_command(command)) is roughly same. # There are two exception as follows. # * The first is that the later may invokes /bin/sh. # * The second is an interpretation of an array with only one element: # the element is parsed by the shell with the former but # it is recognized as single word with the later. # For example, system(*["echo foo"]) invokes echo command with an argument "foo". # But system(Escape.shell_command(["echo foo"])) invokes "echo foo" command without arguments (and it probably fails). def shell_command(command) s = command.map {|word| shell_single_word(word) }.join(' ') ShellEscaped.new_no_dup(s) end # Escape.shell_single_word quotes shell meta characters. # It returns an instance of ShellEscaped. # # The result string is always single shell word, even if # the argument is "". # Escape.shell_single_word("") returns #. # # Escape.shell_single_word("") #=> # # Escape.shell_single_word("foo") #=> # # Escape.shell_single_word("*") #=> # def shell_single_word(str) if str && str.empty? ShellEscaped.new_no_dup("''") elsif %r{\A[0-9A-Za-z+,./:=@_-]+\z} =~ str ShellEscaped.new(str) else result = '' str.scan(/('+)|[^']+/) { if $1 result << %q{\'} * $1.length else result << "'#{$&}'" end } ShellEscaped.new_no_dup(result) end end class PercentEncoded < StringWrapper end # Escape.uri_segment escapes URI segment using percent-encoding. # It returns an instance of PercentEncoded. # # Escape.uri_segment("a/b") #=> # # # The segment is "/"-splitted element after authority before query in URI, as follows. # # scheme://authority/segment1/segment2/.../segmentN?query#fragment # # See RFC 3986 for details of URI. def uri_segment(str) # pchar - pct-encoded = unreserved / sub-delims / ":" / "@" # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" s = str.gsub(%r{[^A-Za-z0-9\-._~!$&'()*+,;=:@]}n) { '%' + $&.unpack("H2")[0].upcase } PercentEncoded.new_no_dup(s) end # Escape.uri_path escapes URI path using percent-encoding. # The given path should be a sequence of (non-escaped) segments separated by "/". # The segments cannot contains "/". # It returns an instance of PercentEncoded. # # Escape.uri_path("a/b/c") #=> # # Escape.uri_path("a?b/c?d/e?f") #=> # # # The path is the part after authority before query in URI, as follows. # # scheme://authority/path#fragment # # See RFC 3986 for details of URI. # # Note that this function is not appropriate to convert OS path to URI. def uri_path(str) s = str.gsub(%r{[^/]+}n) { uri_segment($&) } PercentEncoded.new_no_dup(s) end # :stopdoc: def html_form_fast(pairs, sep='&') s = pairs.map {|k, v| # query-chars - pct-encoded - x-www-form-urlencoded-delimiters = # unreserved / "!" / "$" / "'" / "(" / ")" / "*" / "," / ":" / "@" / "/" / "?" # query-char - pct-encoded = unreserved / sub-delims / ":" / "@" / "/" / "?" # query-char = pchar / "/" / "?" = unreserved / pct-encoded / sub-delims / ":" / "@" / "/" / "?" # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" # x-www-form-urlencoded-delimiters = "&" / "+" / ";" / "=" k = k.gsub(%r{[^0-9A-Za-z\-\._~:/?@!\$'()*,]}n) { '%' + $&.unpack("H2")[0].upcase } v = v.gsub(%r{[^0-9A-Za-z\-\._~:/?@!\$'()*,]}n) { '%' + $&.unpack("H2")[0].upcase } "#{k}=#{v}" }.join(sep) PercentEncoded.new_no_dup(s) end # :startdoc: # Escape.html_form composes HTML form key-value pairs as a x-www-form-urlencoded encoded string. # It returns an instance of PercentEncoded. # # Escape.html_form takes an array of pair of strings or # an hash from string to string. # # Escape.html_form([["a","b"], ["c","d"]]) #=> # # Escape.html_form({"a"=>"b", "c"=>"d"}) #=> # # # In the array form, it is possible to use same key more than once. # (It is required for a HTML form which contains # checkboxes and select element with multiple attribute.) # # Escape.html_form([["k","1"], ["k","2"]]) #=> # # # If the strings contains characters which must be escaped in x-www-form-urlencoded, # they are escaped using %-encoding. # # Escape.html_form([["k=","&;="]]) #=> # # # The separator can be specified by the optional second argument. # # Escape.html_form([["a","b"], ["c","d"]], ";") #=> # # # See HTML 4.01 for details. def html_form(pairs, sep='&') r = '' first = true pairs.each {|k, v| # query-chars - pct-encoded - x-www-form-urlencoded-delimiters = # unreserved / "!" / "$" / "'" / "(" / ")" / "*" / "," / ":" / "@" / "/" / "?" # query-char - pct-encoded = unreserved / sub-delims / ":" / "@" / "/" / "?" # query-char = pchar / "/" / "?" = unreserved / pct-encoded / sub-delims / ":" / "@" / "/" / "?" # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" # x-www-form-urlencoded-delimiters = "&" / "+" / ";" / "=" r << sep if !first first = false k.each_byte {|byte| ch = byte.chr if %r{[^0-9A-Za-z\-\._~:/?@!\$'()*,]}n =~ ch r << "%" << ch.unpack("H2")[0].upcase else r << ch end } r << '=' v.each_byte {|byte| ch = byte.chr if %r{[^0-9A-Za-z\-\._~:/?@!\$'()*,]}n =~ ch r << "%" << ch.unpack("H2")[0].upcase else r << ch end } } PercentEncoded.new_no_dup(r) end class HTMLEscaped < StringWrapper end # :stopdoc: HTML_TEXT_ESCAPE_HASH = { '&' => '&', '<' => '<', '>' => '>', } # :startdoc: # Escape.html_text escapes a string appropriate for HTML text using character references. # It returns an instance of HTMLEscaped. # # It escapes 3 characters: # * '&' to '&' # * '<' to '<' # * '>' to '>' # # Escape.html_text("abc") #=> # # Escape.html_text("a & b < c > d") #=> # # # This function is not appropriate for escaping HTML element attribute # because quotes are not escaped. def html_text(str) s = str.gsub(/[&<>]/) {|ch| HTML_TEXT_ESCAPE_HASH[ch] } HTMLEscaped.new_no_dup(s) end # :stopdoc: HTML_ATTR_ESCAPE_HASH = { '&' => '&', '<' => '<', '>' => '>', '"' => '"', } # :startdoc: class HTMLAttrValue < StringWrapper end # Escape.html_attr_value encodes a string as a double-quoted HTML attribute using character references. # It returns an instance of HTMLAttrValue. # # Escape.html_attr_value("abc") #=> # # Escape.html_attr_value("a&b") #=> # # Escape.html_attr_value("ab&<>\"c") #=> # # Escape.html_attr_value("a'c") #=> # # # It escapes 4 characters: # * '&' to '&' # * '<' to '<' # * '>' to '>' # * '"' to '"' # def html_attr_value(str) s = '"' + str.gsub(/[&<>"]/) {|ch| HTML_ATTR_ESCAPE_HASH[ch] } + '"' HTMLAttrValue.new_no_dup(s) end end