/*
* @(#)StripEntities.java
*
* Summary: Strips HTML entities such as &quot; from a string, replacing them by their Unicode equivalents.
*
* Copyright: (c) 2002-2009 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.5+
*
* Created with: IntelliJ IDEA IDE.
*
* Version History:
* 2.6 2009-04-05 - StripEntities now leaves a space behind when it removes a
* <br>, <p>, <td> etc. tag.
*/
package toxi.data.feeds.util;
import java.util.HashMap;
import java.util.Locale;
/**
* Strips HTML entities such as {@code &quot;} from a string, replacing them by
* their Unicode equivalents.
*
* @author Roedy Green, Canadian Mind Products
* @version 2.6 2009-04-05 - StripEntities now leaves a space behind when it
* removes a {@code <br>}, {@code <p>}, {@code <td>} etc. tag.
* @since 2002-07-14
*/
public class EntityStripper {
// ------------------------------ CONSTANTS ------------------------------
/**
 * true to enable the testing code.
 */
private static final boolean DEBUGGING = true;
/**
 * Unicode non-breaking space, decimal 160, hex 0xa0. The "0x0a" in the name
 * is a misnomer (0x0a is line feed); the name is kept so that existing
 * references to this public constant continue to compile.
 */
@SuppressWarnings({ "WeakerAccess" })
public static final char UNICODE_NBSP_160_0x0a = 160;
/**
 * Longest an entity can be {@value #LONGEST_ENTITY}, at least in our
 * tables, including the lead &amp; and trail ;.
 */
public static final int LONGEST_ENTITY = 10;/* &thetasym; */
/**
 * The shortest an entity can be {@value #SHORTEST_ENTITY}, at least in our
 * tables, including the lead &amp; and trailing ;.
 */
public static final int SHORTEST_ENTITY = 4;/* &lt; */
// ------------------------------ FIELDS ------------------------------
/**
 * Maps an entity name (no lead &amp;, no trailing ;) to the corresponding
 * char. Filled once by the static initializer from two hard-coded parallel
 * arrays. "nbsp" is deliberately absent so callers can choose its
 * translation; "apos" is present in addition to the table entries.
 */
private static final HashMap<String, Character> entityToChar;
/**
 * tags, that when removed should leave a space behind.
 */
private static String[] spacingTags = { "tr", "td", "th", "p", "br", "dl",
        "dt", "li" };
// -------------------------- PUBLIC STATIC METHODS
// --------------------------
static {
    // Build the map used to look up an entity name and get the corresponding
    // Unicode char. The two tables below are parallel: entityKeys[i] is the
    // bare entity name (no lead & and no trailing ;) and entityValues[i] is
    // its character code. They were originally generated by Entities
    // (com\mindprod\entities\entitiesjustkeys.javafrag) and inserted by hand,
    // so both tables are laid out in identical rows to keep them aligned.
    final String[] entityKeys = {
        // 34, 38, 60, 62 : markup-significant characters
        "quot", "amp", "lt", "gt",
        // 160 - 255 : Latin-1 symbols and letters, eight per row
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml",
        // 338 - 732 : Latin extended letters and spacing modifiers
        "OElig", "oelig", "Scaron", "scaron", "Yuml", "fnof", "circ", "tilde",
        // 913 - 937 : Greek capital letters (930 is not assigned an entity)
        "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
        "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi",
        "Rho", "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
        // 945 - 982 : Greek small letters and symbols
        "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
        "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi",
        "rho", "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi",
        "omega", "thetasym", "upsih", "piv",
        // 8194 - 8207 : spaces, joiners and directional marks
        "ensp", "emsp", "thinsp", "zwnj", "zwj", "lrm", "rlm",
        // 8211 - 8222 : dashes and quotation marks
        "ndash", "mdash", "lsquo", "rsquo", "sbquo", "ldquo", "rdquo", "bdquo",
        // 8224 - 8243 : daggers, bullet, ellipsis, per mille, primes
        "dagger", "Dagger", "bull", "hellip", "permil", "prime", "Prime",
        // 8249 - 8364 : angle quotes, overline, fraction slash, euro
        "lsaquo", "rsaquo", "oline", "frasl", "euro",
        // 8465 - 8501 : letter-like symbols
        "image", "weierp", "real", "trade", "alefsym",
        // 8592 - 8629 : single arrows
        "larr", "uarr", "rarr", "darr", "harr", "crarr",
        // 8656 - 8660 : double arrows
        "lArr", "uArr", "rArr", "dArr", "hArr",
        // 8704 - 8805 : mathematical operators
        "forall", "part", "exist", "empty", "nabla", "isin", "notin", "ni",
        "prod", "sum", "minus", "lowast", "radic", "prop", "infin", "ang",
        "and", "or", "cap", "cup", "int", "there4", "sim", "cong",
        "asymp", "ne", "equiv", "le", "ge",
        // 8834 - 8839 : subset and superset relations
        "sub", "sup", "nsub", "sube", "supe",
        // 8853 - 8901 : circled operators, up tack, dot operator
        "oplus", "otimes", "perp", "sdot",
        // 8968 - 9002 : ceilings, floors and angle brackets
        "lceil", "rceil", "lfloor", "rfloor", "lang", "rang",
        // 9674 - 9830 : lozenge and card suits
        "loz", "spades", "clubs", "hearts", "diams",
    };
    final char[] entityValues = {
        // markup-significant characters
        34, 38, 60, 62,
        // Latin-1 symbols and letters, eight per row
        160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175,
        176, 177, 178, 179, 180, 181, 182, 183,
        184, 185, 186, 187, 188, 189, 190, 191,
        192, 193, 194, 195, 196, 197, 198, 199,
        200, 201, 202, 203, 204, 205, 206, 207,
        208, 209, 210, 211, 212, 213, 214, 215,
        216, 217, 218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 230, 231,
        232, 233, 234, 235, 236, 237, 238, 239,
        240, 241, 242, 243, 244, 245, 246, 247,
        248, 249, 250, 251, 252, 253, 254, 255,
        // Latin extended letters and spacing modifiers
        338, 339, 352, 353, 376, 402, 710, 732,
        // Greek capital letters
        913, 914, 915, 916, 917, 918, 919, 920,
        921, 922, 923, 924, 925, 926, 927, 928,
        929, 931, 932, 933, 934, 935, 936, 937,
        // Greek small letters and symbols
        945, 946, 947, 948, 949, 950, 951, 952,
        953, 954, 955, 956, 957, 958, 959, 960,
        961, 962, 963, 964, 965, 966, 967, 968,
        969, 977, 978, 982,
        // spaces, joiners and directional marks
        8194, 8195, 8201, 8204, 8205, 8206, 8207,
        // dashes and quotation marks
        8211, 8212, 8216, 8217, 8218, 8220, 8221, 8222,
        // daggers, bullet, ellipsis, per mille, primes
        8224, 8225, 8226, 8230, 8240, 8242, 8243,
        // angle quotes, overline, fraction slash, euro
        8249, 8250, 8254, 8260, 8364,
        // letter-like symbols
        8465, 8472, 8476, 8482, 8501,
        // single arrows
        8592, 8593, 8594, 8595, 8596, 8629,
        // double arrows
        8656, 8657, 8658, 8659, 8660,
        // mathematical operators
        8704, 8706, 8707, 8709, 8711, 8712, 8713, 8715,
        8719, 8721, 8722, 8727, 8730, 8733, 8734, 8736,
        8743, 8744, 8745, 8746, 8747, 8756, 8764, 8773,
        8776, 8800, 8801, 8804, 8805,
        // subset and superset relations
        8834, 8835, 8836, 8838, 8839,
        // circled operators, up tack, dot operator
        8853, 8855, 8869, 8901,
        // ceilings, floors and angle brackets
        8968, 8969, 8970, 8971, 9001, 9002,
        // lozenge and card suits
        9674, 9824, 9827, 9829, 9830,
    };
    // The tables are maintained by hand; fail loudly at class-load time if
    // they ever drift apart instead of silently mapping names to wrong chars.
    if (entityKeys.length != entityValues.length) {
        throw new IllegalStateException("entity tables out of step: "
                + entityKeys.length + " names but " + entityValues.length
                + " values");
    }
    // allow 50% extra space for faster lookup.
    final HashMap<String, Character> table = new HashMap<String, Character>(
            entityKeys.length * 150 / 100);
    for (int i = 0; i < entityKeys.length; i++) {
        // leave out nbsp so it can be specially handled if entity not
        // found.
        if (!"nbsp".equals(entityKeys[i])) {
            table.put(entityKeys[i], Character.valueOf(entityValues[i]));
        }
    }
    // add also apos (the apostrophe, 39) for strip but not insert. optional
    // for XML, not used in HTML. Registered once, after the loop, rather
    // than once per table entry.
    table.put("apos", Character.valueOf((char) 39));
    entityToChar = table;
}// end static
/**
 * Converts a bare entity to a single char.
 *
 * @param bareEntity
 *            entity to convert; must not be null and must have the lead
 *            &amp; and trailing ; stripped. May have the form #x12ff or
 *            #123 or lt or nbsp. Named entities are tried in the case
 *            given first and then in lower case, so the lookup is faster
 *            if the entity is already in its proper case.
 * @param howToTranslateNbsp
 *            char you would like {@code &nbsp;} translated to, usually
 *            ' ' or (char) 160
 *
 * @return equivalent character; 0 if the entity is not recognised, is
 *         malformed, or is a numeric entity whose value does not fit in a
 *         single char (outside 1..0xFFFF).
 */
public static char bareHTMLEntityToChar(String bareEntity,
        char howToTranslateNbsp) {
    // first check for alpha entity in the exact case given: names such as
    // Prime and prime are distinct entries in the table. The explicit cast
    // keeps this compiling whether or not the map field carries type
    // parameters.
    Character code = (Character) entityToChar.get(bareEntity);
    if (code != null) {
        return code.charValue();
    }
    // fall back to a lenient lower-case lookup. A fixed locale is used so the
    // result does not depend on the default locale (under a Turkish locale
    // an upper-case I would not lower to a plain i).
    final String lowered = bareEntity.toLowerCase(Locale.ENGLISH);
    code = (Character) entityToChar.get(lowered);
    if (code != null) {
        return code.charValue();
    }
    // nbsp is not in the map. We test for it specially, in any letter case,
    // consistent with the lenient lookup above.
    if ("nbsp".equals(lowered)) {
        return howToTranslateNbsp;
    }
    // anything left must be numeric: at least #1 (no & or ; at this point)
    if (bareEntity.length() < 2 || bareEntity.charAt(0) != '#') {
        // some unrecognized/malformed bareEntity
        return 0;
    }
    try {
        final char secondChar = bareEntity.charAt(1);
        final int value;
        if (secondChar == 'x' || secondChar == 'X') {
            // hex entity of form #x12ff; ensure we have at least #xf
            if (bareEntity.length() < 3) {
                return 0;
            }
            value = Integer.parseInt(bareEntity.substring(2), 16);
        } else {
            // decimal entity of form #123
            value = Integer.parseInt(bareEntity.substring(1));
        }
        // A char can only hold 0..0xFFFF. Casting a negative or larger value
        // would silently yield an unrelated character, so report such
        // entities as unrecognised instead.
        if (value < 0 || value > Character.MAX_VALUE) {
            return 0;
        }
        return (char) value;
    } catch (NumberFormatException notANumber) {
        // digits were missing or invalid: not an entity we understand
        return 0;
    }
}// end entityToChar
/**
 * Flattens HTML to text by removing the tags and then decoding the
 * entities. Leaves \n \r unchanged.
 *
 * @param text
 *            HTML to flatten
 * @param translateNbspTo
 *            char you would like {@code &nbsp;} translated to, usually
 *            ' ' or (char) 160.
 *
 * @return flattened text
 */
public static String flattenHTML(String text, char translateNbspTo) {
    // step 1: drop the tags; step 2: decode entities in what remains
    final String withoutTags = stripHTMLTags(text);
    return stripHTMLEntities(withoutTags, translateNbspTo);
}
/**
 * Flattens XML to text by removing the tags and then decoding the
 * entities.
 *
 * @param text
 *            XML to flatten
 *
 * @return flattened text
 */
public static String flattenXML(String text) {
    // step 1: drop the tags; step 2: decode entities in what remains
    final String withoutTags = stripXMLTags(text);
    return stripXMLEntities(withoutTags);
}
/**
 * Runs a candidate through a gauntlet of checks and, if it looks like an
 * entity, converts it to the corresponding char.
 *
 * @param possBareEntityWithSemicolon
 *            string that may hold an entity. The lead &amp; must already
 *            be stripped; text past the terminating ; is allowed and
 *            ignored.
 * @param translateNbspTo
 *            char you would like nbsp translated to, usually ' ' or (char)
 *            160.
 *
 * @return corresponding unicode character, or 0 if the entity is invalid.
 */
protected static char possBareHTMLEntityWithSemicolonToChar(
        String possBareEntityWithSemicolon, char translateNbspTo) {
    // A bare entity has at least this many chars ahead of its ;, because
    // SHORTEST_ENTITY also counts the lead & and the ; itself.
    final int shortestName = SHORTEST_ENTITY - 2;
    if (possBareEntityWithSemicolon.length() <= shortestName) {
        // too short to hold even the shortest name plus its ;
        return 0;
    }
    // the terminating ; cannot come before the shortest possible name ends
    final int semiAt = possBareEntityWithSemicolon.indexOf(';', shortestName);
    if (semiAt >= shortestName) {
        final String bareEntity = possBareEntityWithSemicolon.substring(0,
                semiAt);
        return bareHTMLEntityToChar(bareEntity, translateNbspTo);
    }
    // no terminating ; found
    return 0;
}
/**
 * Converts a possible entity to the corresponding char, translating nbsp
 * to the true non-breaking space, (char) 160.
 *
 * @param possBareEntityWithSemicolon
 *            string that may hold an entity. The lead &amp; must already
 *            be stripped; text past the terminating ; is allowed.
 *
 * @return corresponding unicode character, or 0 if the entity is invalid.
 *         nbsp -> (char) 160
 */
public static char possEntityToChar(String possBareEntityWithSemicolon) {
    final char nbspAsIs = UNICODE_NBSP_160_0x0a;
    return possBareHTMLEntityWithSemicolonToChar(possBareEntityWithSemicolon,
            nbspAsIs);
}
/**
* Prepares tags for removal, to ensure they are replaced by a space
*
*
* |
*
* |
* | --> _
* ' ') {
// insert space before <
sb.append(' ');
}
break;
}
}
}
sb.append(c);
prevChar = c;
}
return sb.toString();
}
/**
 * Converts HTML to text, turning entities such as {@code &quot;} back into
 * " and {@code &lt;} back into the less-than sign. Ordinary text passes
 * unchanged. Decimal and hex entities such as {@code &#123;} and HTML 4.0
 * named entities such as {@code &hearts;} are decoded as well. An &amp;
 * that does not start a recognised entity is kept as it is.
 *
 * @param text
 *            raw text to be processed.
 * @param translateNbspTo
 *            char you would like {@code &nbsp;} translated to, usually
 *            ' ' or (char) 160.
 *
 * @return translated text; the very same String if nothing needed
 *         decoding. null input returns null.
 */
public static String stripHTMLEntities(String text, char translateNbspTo) {
    if (text == null || text.indexOf('&') < 0) {
        // null, or no entities at all: nothing to do
        return text;
    }
    final int length = text.length();
    final StringBuilder out = new StringBuilder(length);
    // index of the first char not yet copied or decoded
    int cursor = 0;
    while (cursor < length) {
        final int ampAt = text.indexOf('&', cursor);
        if (ampAt < 0) {
            // no more &s: copy the tail and finish
            out.append(text, cursor, length);
            break;
        }
        // copy the plain text ahead of the &
        out.append(text, cursor, ampAt);
        // candidate entity with its lead & stripped, cut to the longest
        // length an entity can have
        final int candidateEnd = Math.min(ampAt + LONGEST_ENTITY, length);
        final String candidate = text.substring(ampAt + 1, candidateEnd);
        final char decoded = possBareHTMLEntityWithSemicolonToChar(candidate,
                translateNbspTo);
        if (decoded == 0) {
            // not an entity: the & is just an ordinary character
            out.append('&');
            cursor = ampAt + 1;
        } else {
            // good entity: keep its equivalent char and resume just past
            // the terminating ;
            out.append(decoded);
            final int semiAt = candidate.indexOf(';', SHORTEST_ENTITY - 2);
            cursor = ampAt + 1 + semiAt + 1;
        }
    }
    // Every decoded entity shortens the text, so an unchanged length means
    // nothing was decoded and the original String can be handed back.
    // Saves RAM.
    if (out.length() == length) {
        return text;
    }
    return out.toString();
}// end stripEntities
// -------------------------- STATIC METHODS --------------------------
/**
* remove all text between <applet.. </applet>, <style...
* </style> <script... </script>
*
* @param s
* HTML string to strip tag pairs out of.
*
* @return string with tag pairs stripped out.
*/
private static String stripHTMLTagPairs(String s) {
String[] tags = { "applet", "APPLET", "style", "STYLE", "script",
"SCRIPT" };
for (final String tag : tags) {
final String beginTag = "<" + tag;
final String endTag = "" + tag + ">";
int begin = 0;
while (begin < s.length()
&& (begin = s.indexOf(beginTag, begin)) >= 0) {
final int end;
if ((end = s.indexOf(endTag, begin + beginTag.length())) > 0) {
// chop out the