require "HTML/Encoder/Unicode.rb" if RUBY_VERSION > '1.8.7'

module HTML

  # Encodes characters of a string as HTML entities.
  #
  # An instance keeps two tables:
  #
  # * <tt>@entity2char</tt> -- entity name (without '&' and ';') => character
  # * <tt>@char2entity</tt> -- character => complete entity ("&name;", or
  #   "&#N;" for byte values 0..255 that have no named entity)
  class Encoder

    # Builds the entity tables.
    #
    # The table starts with the five markup entities plus the Latin 1 set
    # below.  On Ruby newer than 1.8.7 every pair returned by
    # HTML::Encoder::Unicode.unicode_mapping is then written on top of it
    # (same-named entries are replaced).
    def initialize
      @entity2char = {
        'amp'    => '&',      # ampersand
        'gt'     => '>',      # greater than
        'lt'     => '<',      # less than
        'quot'   => '"',      # double quote
        'apos'   => "'",      # single quote

        # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
        'AElig'  => 198.chr,  # capital AE diphthong (ligature)
        'Aacute' => 193.chr,  # capital A, acute accent
        'Acirc'  => 194.chr,  # capital A, circumflex accent
        'Agrave' => 192.chr,  # capital A, grave accent
        'Aring'  => 197.chr,  # capital A, ring
        'Atilde' => 195.chr,  # capital A, tilde
        'Auml'   => 196.chr,  # capital A, dieresis or umlaut mark
        'Ccedil' => 199.chr,  # capital C, cedilla
        'ETH'    => 208.chr,  # capital Eth, Icelandic
        'Eacute' => 201.chr,  # capital E, acute accent
        'Ecirc'  => 202.chr,  # capital E, circumflex accent
        'Egrave' => 200.chr,  # capital E, grave accent
        'Euml'   => 203.chr,  # capital E, dieresis or umlaut mark
        'Iacute' => 205.chr,  # capital I, acute accent
        'Icirc'  => 206.chr,  # capital I, circumflex accent
        'Igrave' => 204.chr,  # capital I, grave accent
        'Iuml'   => 207.chr,  # capital I, dieresis or umlaut mark
        'Ntilde' => 209.chr,  # capital N, tilde
        'Oacute' => 211.chr,  # capital O, acute accent
        'Ocirc'  => 212.chr,  # capital O, circumflex accent
        'Ograve' => 210.chr,  # capital O, grave accent
        'Oslash' => 216.chr,  # capital O, slash
        'Otilde' => 213.chr,  # capital O, tilde
        'Ouml'   => 214.chr,  # capital O, dieresis or umlaut mark
        'THORN'  => 222.chr,  # capital THORN, Icelandic
        'Uacute' => 218.chr,  # capital U, acute accent
        'Ucirc'  => 219.chr,  # capital U, circumflex accent
        'Ugrave' => 217.chr,  # capital U, grave accent
        'Uuml'   => 220.chr,  # capital U, dieresis or umlaut mark
        'Yacute' => 221.chr,  # capital Y, acute accent
        'aacute' => 225.chr,  # small a, acute accent
        'acirc'  => 226.chr,  # small a, circumflex accent
        'aelig'  => 230.chr,  # small ae diphthong (ligature)
        'agrave' => 224.chr,  # small a, grave accent
        'aring'  => 229.chr,  # small a, ring
        'atilde' => 227.chr,  # small a, tilde
        'auml'   => 228.chr,  # small a, dieresis or umlaut mark
        'ccedil' => 231.chr,  # small c, cedilla
        'eacute' => 233.chr,  # small e, acute accent
        'ecirc'  => 234.chr,  # small e, circumflex accent
        'egrave' => 232.chr,  # small e, grave accent
        'eth'    => 240.chr,  # small eth, Icelandic
        'euml'   => 235.chr,  # small e, dieresis or umlaut mark
        'iacute' => 237.chr,  # small i, acute accent
        'icirc'  => 238.chr,  # small i, circumflex accent
        'igrave' => 236.chr,  # small i, grave accent
        'iuml'   => 239.chr,  # small i, dieresis or umlaut mark
        'ntilde' => 241.chr,  # small n, tilde
        'oacute' => 243.chr,  # small o, acute accent
        'ocirc'  => 244.chr,  # small o, circumflex accent
        'ograve' => 242.chr,  # small o, grave accent
        'oslash' => 248.chr,  # small o, slash
        'otilde' => 245.chr,  # small o, tilde
        'ouml'   => 246.chr,  # small o, dieresis or umlaut mark
        'szlig'  => 223.chr,  # small sharp s, German (sz ligature)
        'thorn'  => 254.chr,  # small thorn, Icelandic
        'uacute' => 250.chr,  # small u, acute accent
        'ucirc'  => 251.chr,  # small u, circumflex accent
        'ugrave' => 249.chr,  # small u, grave accent
        'uuml'   => 252.chr,  # small u, dieresis or umlaut mark
        'yacute' => 253.chr,  # small y, acute accent
        'yuml'   => 255.chr,  # small y, dieresis or umlaut mark

        # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
        'copy'   => 169.chr,  # copyright sign
        'reg'    => 174.chr,  # registered sign
        'nbsp'   => 160.chr,  # non breaking space

        # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
        'iexcl'  => 161.chr,
        'cent'   => 162.chr,
        'pound'  => 163.chr,
        'curren' => 164.chr,
        'yen'    => 165.chr,
        'brvbar' => 166.chr,
        'sect'   => 167.chr,
        'uml'    => 168.chr,
        'ordf'   => 170.chr,
        'laquo'  => 171.chr,
        'not'    => 172.chr,
        'shy'    => 173.chr,
        'macr'   => 175.chr,
        'deg'    => 176.chr,
        'plusmn' => 177.chr,
        'sup1'   => 185.chr,
        'sup2'   => 178.chr,
        'sup3'   => 179.chr,
        'acute'  => 180.chr,
        'micro'  => 181.chr,
        'para'   => 182.chr,
        'middot' => 183.chr,
        'cedil'  => 184.chr,
        'ordm'   => 186.chr,
        'raquo'  => 187.chr,
        'frac14' => 188.chr,
        'frac12' => 189.chr,
        'frac34' => 190.chr,
        'iquest' => 191.chr,
        'times'  => 215.chr,
        'divide' => 247.chr,
      }

      if RUBY_VERSION > '1.8.7'
        HTML::Encoder::Unicode.unicode_mapping.each { |k, v| @entity2char[k] = v }
      end

      # Reverse table; when several names share one character the last one
      # in iteration order wins.
      @char2entity = Hash[@entity2char.map { |k, v| [v, "&#{k};"] }]

      # Every byte value without a named entity gets a decimal reference.
      (0..255).each do |i|
        byte = i.chr
        @char2entity[byte] = "&##{i};" unless @char2entity.key?(byte)
      end
    end

    # Returns a copy of +string+ (converted with +to_s+) with characters
    # replaced by entities.
    #
    # string::  the text to encode
    # args[0]:: optional list of characters to encode, taken literally one
    #           character at a time (it is not a range or regexp).  When it
    #           is absent, nil or empty, every character other than tab,
    #           newline, carriage return and the printable ASCII characters
    #           except <tt>" & ' < ></tt> is encoded.
    #
    # A character found in the entity table is replaced by that entry; any
    # other character is replaced by the result of #num_entity.
    def encode(string, *args)
      unsafe = args[0].to_s   # nil.to_s is "", so "absent" and "empty" coincide
      text   = string.to_s

      if unsafe.empty?
        # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
        return text.gsub(/([^\n\r\t !\#\$%\(-;=?-~])/) { |c| @char2entity[c] || num_entity(c) }
      end

      lookup = {}
      unsafe.each_char { |c| lookup[c] = @char2entity[c] || num_entity(c) }

      # /m is required: without it '.' skips "\n", so a newline listed in
      # the unsafe set would never be encoded.
      text.gsub(/./m) { |c| lookup.fetch(c, c) }
    end

    # Same as #encode (and takes the same arguments), but every encoded
    # character becomes a hexadecimal reference because the entity table is
    # emptied for the duration of the call.
    def encode_hex(*args)
      saved = @char2entity
      @char2entity = {}
      begin
        encode(*args)
      ensure
        # Put the table back even when encode raises; otherwise this
        # instance would keep producing hex references from then on.
        @char2entity = saved
      end
    end

    # Returns the hexadecimal character reference ("&#xHH;", upper-case
    # digits) for the first character of +char+.
    def num_entity(char)
      code = begin
        # Code point of the character, not merely its first byte, so that
        # multibyte characters get the right reference.
        char.ord
      rescue ArgumentError
        # Not a valid character in its encoding (or empty): use the first
        # byte, as this method always did.
        char.unpack('C')[0]
      end
      sprintf('&#x%X;', code)
    end

  end

end

=begin rdoc

== INSPIRATION

This code is heavily borrowed from Gisle Aas's CPAN module HTML::Entities.

== AUTHOR

Jeff Anderson,

== LICENSE AND COPYRIGHT

Copyright 2017 Jeff Anderson. (See License.md shipped with distro)

=end