/*
 * @(#)StripEntities.java
 *
 * Summary: Strips HTML entities such as &quot; from a string, replacing them by their Unicode equivalents.
 *
 * Copyright: (c) 2002-2009 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.5+
 *
 * Created with: IntelliJ IDEA IDE.
 *
 * Version History:
 *  2.6 2009-04-05 - StripEntities now leaves a space behind when it removes a <p> <br>

etc tag. */ package toxi.data.feeds.util; import java.util.HashMap; /** * Strips HTML entities such as " from a string, replacing them by their * Unicode equivalents. * * @author Roedy Green, Canadian Mind Products * @version 2.6 2009-04-05 - StripEntities now leaves a space behind when it * removes a
*

// * etc tag. * @since 2002-07-14 */
// The line above only terminates the comment that is still open where this block starts.

/**
 * Strips tags from HTML/XML text and replaces character entities such as &quot; or &#65; by the
 * characters they stand for.
 *
 * All methods are static; the class holds no mutable state after class initialisation.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 2.6 2009-04-05 - leaves a space behind when it removes a spacing tag such as p or br.
 * @since 2002-07-14
 */
public class EntityStripper {

    // ------------------------------ CONSTANTS ------------------------------

    /**
     * true to enable the testing code. Not referenced by any code in this class.
     */
    private static final boolean DEBUGGING = true;

    /**
     * Unicode non-breaking space, decimal 160, hexadecimal 0xa0. (The "0x0a" in the name is
     * inaccurate but the name is public and therefore kept.)
     */
    @SuppressWarnings({ "WeakerAccess" })
    public static final char UNICODE_NBSP_160_0x0a = 160;

    /**
     * Longest an entity can be {@value #LONGEST_ENTITY}, at least in our tables, including the
     * lead &amp; and trail ; e.g. &amp;thetasym;
     */
    public static final int LONGEST_ENTITY = 10;

    /**
     * The shortest an entity can be {@value #SHORTEST_ENTITY}, at least in our tables, including
     * the lead &amp; and trailing ; e.g. &amp;lt;
     */
    public static final int SHORTEST_ENTITY = 4;

    // ------------------------------ FIELDS ------------------------------

    /**
     * Tags that, when removed, should leave a space behind.
     */
    private static final String[] spacingTags = { "tr", "td", "th", "p", "br", "dl", "dt", "li" };

    /**
     * Tags whose whole content (everything up to the matching end tag) is discarded.
     */
    private static final String[] pairedTags = { "applet", "style", "script" };

    /**
     * Lookup from bare entity name (no &amp;, no ;) to the corresponding char. "nbsp" is
     * deliberately absent so that callers can choose what it is translated to.
     */
    private static final HashMap<String, Character> entityToChar =
            new HashMap<String, Character>(512);

    static {
        // Each call registers a run of names for consecutive code points starting at the
        // given value, so a name and its value cannot drift apart.
        define(34, "quot");
        define(38, "amp");
        define(60, "lt");
        define(62, "gt");
        // 160 "nbsp" is left out on purpose; see bareHTMLEntityToChar.

        // Latin-1 supplement
        define(161, "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect");
        define(168, "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr");
        define(176, "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot");
        define(184, "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest");
        define(192, "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil");
        define(200, "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml");
        define(208, "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times");
        define(216, "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig");
        define(224, "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil");
        define(232, "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml");
        define(240, "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide");
        define(248, "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml");

        // Latin extended and spacing modifiers
        define(338, "OElig", "oelig");
        define(352, "Scaron", "scaron");
        define(376, "Yuml");
        define(402, "fnof");
        define(710, "circ");
        define(732, "tilde");

        // Greek
        define(913, "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta");
        define(921, "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho");
        define(931, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega");
        define(945, "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta");
        define(953, "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho");
        define(962, "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega");
        define(977, "thetasym", "upsih");
        define(982, "piv");

        // General punctuation
        define(8194, "ensp", "emsp");
        define(8201, "thinsp");
        define(8204, "zwnj", "zwj", "lrm", "rlm");
        define(8211, "ndash", "mdash");
        define(8216, "lsquo", "rsquo", "sbquo");
        define(8220, "ldquo", "rdquo", "bdquo");
        define(8224, "dagger", "Dagger", "bull");
        define(8230, "hellip");
        define(8240, "permil");
        define(8242, "prime", "Prime");
        define(8249, "lsaquo", "rsaquo");
        define(8254, "oline");
        define(8260, "frasl");
        define(8364, "euro");

        // Letter-like symbols and arrows
        define(8465, "image");
        define(8472, "weierp");
        define(8476, "real");
        define(8482, "trade");
        define(8501, "alefsym");
        define(8592, "larr", "uarr", "rarr", "darr", "harr");
        define(8629, "crarr");
        define(8656, "lArr", "uArr", "rArr", "dArr", "hArr");

        // Mathematical operators
        define(8704, "forall");
        define(8706, "part", "exist");
        define(8709, "empty");
        define(8711, "nabla", "isin", "notin");
        define(8715, "ni");
        define(8719, "prod");
        define(8721, "sum", "minus");
        define(8727, "lowast");
        define(8730, "radic");
        define(8733, "prop", "infin");
        define(8736, "ang");
        define(8743, "and", "or", "cap", "cup", "int");
        define(8756, "there4");
        define(8764, "sim");
        define(8773, "cong");
        define(8776, "asymp");
        define(8800, "ne", "equiv");
        define(8804, "le", "ge");
        define(8834, "sub", "sup", "nsub");
        define(8838, "sube", "supe");
        define(8853, "oplus");
        define(8855, "otimes");
        define(8869, "perp");
        define(8901, "sdot");

        // Miscellaneous technical, geometric shapes, card suits
        define(8968, "lceil", "rceil", "lfloor", "rfloor");
        define(9001, "lang", "rang");
        define(9674, "loz");
        define(9824, "spades");
        define(9827, "clubs");
        define(9829, "hearts", "diams");

        // add also &apos; for strip but not insert. optional for XML, not used in HTML.
        define(39, "apos");
    }

    /**
     * Registers entity names for consecutive code points.
     *
     * @param firstCodePoint
     *            code point of the first name; each following name gets the next code point.
     * @param names
     *            bare entity names, without lead &amp; or trail ;
     */
    private static void define(int firstCodePoint, String... names) {
        for (int i = 0; i < names.length; i++) {
            entityToChar.put(names[i], Character.valueOf((char) (firstCodePoint + i)));
        }
    }

    // -------------------------- PUBLIC STATIC METHODS --------------------------

    /**
     * Converts an entity to a single char.
     *
     * @param bareEntity
     *            entity to convert. Must have lead &amp; and trail ; stripped; may have form
     *            #x12ff or #123 or lt or nbsp. Names are tried in the case given first and
     *            then in lower case.
     * @param howToTranslateNbsp
     *            char you would like nbsp translated to, usually ' ' or (char) 160
     *
     * @return equivalent character. 0 if not recognised, including null input and numeric
     *         entities whose value does not fit in a single char (outside 1..0xFFFF).
     */
    public static char bareHTMLEntityToChar(String bareEntity,
            char howToTranslateNbsp) {
        if (bareEntity == null) {
            return 0;
        }
        // Exact case first: names such as Alpha/alpha or Dagger/dagger differ only in case.
        Character code = entityToChar.get(bareEntity);
        if (code == null) {
            // Fixed locale so that e.g. a Turkish default locale cannot change the result.
            code = entityToChar.get(bareEntity.toLowerCase(java.util.Locale.ENGLISH));
        }
        if (code != null) {
            return code.charValue();
        }
        // nbsp is not in the map so that the caller decides its translation.
        if (bareEntity.equalsIgnoreCase("nbsp")) {
            return howToTranslateNbsp;
        }
        // Anything left must be numeric: # followed by at least one digit.
        if (bareEntity.length() < 2 || bareEntity.charAt(0) != '#') {
            return 0;
        }
        final char marker = bareEntity.charAt(1);
        final boolean hex = marker == 'x' || marker == 'X';
        final String digits = bareEntity.substring(hex ? 2 : 1);
        if (digits.length() == 0) {
            return 0;
        }
        try {
            final int value = Integer.parseInt(digits, hex ? 16 : 10);
            // A plain cast would silently wrap negative or supplementary values into an
            // unrelated char, so those are reported as not recognised instead.
            return (value > 0 && value <= Character.MAX_VALUE) ? (char) value : 0;
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Strips tags and entities from HTML.
     *
     * @param text
     *            to flatten; null returns null.
     * @param translateNbspTo
     *            char you would like nbsp translated to, usually ' ' or (char) 160.
     *
     * @return flattened text
     */
    public static String flattenHTML(String text, char translateNbspTo) {
        return stripHTMLEntities(stripHTMLTags(text), translateNbspTo);
    }

    /**
     * Strips tags and entities from XML.
     *
     * @param text
     *            to flatten; null returns null.
     *
     * @return flattened text
     */
    public static String flattenXML(String text) {
        return stripXMLEntities(stripXMLTags(text));
    }

    /**
     * Checks a number of gauntlet conditions to ensure this is a valid entity, then converts
     * the entity to the corresponding char.
     *
     * @param possBareEntityWithSemicolon
     *            string that may hold an entity. Lead &amp; must be stripped, but may
     *            optionally contain text past the ;
     * @param translateNbspTo
     *            char you would like nbsp translated to, usually ' ' or (char) 160.
     *
     * @return corresponding unicode character, or 0 if the entity is invalid.
     */
    protected static char possBareHTMLEntityWithSemicolonToChar(
            String possBareEntityWithSemicolon, char translateNbspTo) {
        // Shortest bare form is two name chars plus the semicolon, e.g. "lt;".
        if (possBareEntityWithSemicolon.length() < SHORTEST_ENTITY - 1) {
            return 0;
        }
        // The semicolon cannot come before the second name char.
        final int whereSemi = possBareEntityWithSemicolon.indexOf(';',
                SHORTEST_ENTITY - 2);
        if (whereSemi < SHORTEST_ENTITY - 2) {
            return 0;
        }
        return bareHTMLEntityToChar(
                possBareEntityWithSemicolon.substring(0, whereSemi),
                translateNbspTo);
    }

    /**
     * Same as {@link #possBareHTMLEntityWithSemicolonToChar(String, char)} with nbsp
     * translated to (char) 160.
     *
     * @param possBareEntityWithSemicolon
     *            string that may hold an entity. Lead &amp; must be stripped, but may
     *            optionally contain text past the ;
     *
     * @return corresponding unicode character, or 0 if the entity is invalid.
     */
    public static char possEntityToChar(String possBareEntityWithSemicolon) {
        return possBareHTMLEntityWithSemicolonToChar(
                possBareEntityWithSemicolon, UNICODE_NBSP_160_0x0a);
    }

    /**
     * Prepares tags for removal, to ensure the spacing tags are replaced by a space: a space
     * is inserted in front of every opening or closing spacing tag that directly follows a
     * non-blank character.
     *
     * @param html
     *            input HTML or XML
     *
     * @return the input with a space in front of spacing tags where needed.
     */
    private static String preStripIndividualTags(String html) {
        final StringBuilder sb = new StringBuilder(html.length() + 16);
        char prevChar = 0;
        for (int i = 0; i < html.length(); i++) {
            final char c = html.charAt(i);
            if (c == '<' && prevChar > ' ' && startsSpacingTag(html, i + 1)) {
                // insert space before <
                sb.append(' ');
            }
            sb.append(c);
            prevChar = c;
        }
        return sb.toString();
    }

    /**
     * Tests whether the tag name at the given offset (just past a &lt;, optionally preceded
     * by /) is one of the spacing tags, compared without regard to case.
     *
     * @param html
     *            text being scanned
     * @param offset
     *            index of the first char after the &lt;
     *
     * @return true if a complete spacing tag name starts there.
     */
    private static boolean startsSpacingTag(String html, int offset) {
        if (offset < html.length() && html.charAt(offset) == '/') {
            offset++;
        }
        for (final String tag : spacingTags) {
            final int afterName = offset + tag.length();
            // The name must end there, so that "pre" is not mistaken for "p".
            if (html.regionMatches(true, offset, tag, 0, tag.length())
                    && (afterName >= html.length()
                    || !Character.isLetterOrDigit(html.charAt(afterName)))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts HTML to text, converting entities such as &amp;quot; back to " and &amp;lt;
     * back to &lt;. Ordinary text passes unchanged. Also converts decimal and hex entities.
     * An &amp; that does not start a recognised entity is kept as is.
     *
     * @param text
     *            raw text to be processed; null returns null.
     * @param translateNbspTo
     *            char you would like nbsp translated to, usually ' ' or (char) 160.
     *
     * @return translated text; the very same String object when nothing was converted.
     */
    public static String stripHTMLEntities(String text, char translateNbspTo) {
        if (text == null) {
            return null;
        }
        if (text.indexOf('&') < 0) {
            // are no entities, nothing to do
            return text;
        }
        final int length = text.length();
        final StringBuilder sb = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            final int amp = text.indexOf('&', pos);
            if (amp < 0) {
                // no more &s: copy the tail and stop
                sb.append(text, pos, length);
                break;
            }
            sb.append(text, pos, amp);
            // Window just large enough for the longest entity, lead & excluded.
            final String candidate = text.substring(amp + 1,
                    Math.min(amp + LONGEST_ENTITY, length));
            final char decoded = possBareHTMLEntityWithSemicolonToChar(
                    candidate, translateNbspTo);
            if (decoded != 0) {
                sb.append(decoded);
                // resume just past the entity's semicolon
                pos = amp + 1 + candidate.indexOf(';', SHORTEST_ENTITY - 2) + 1;
            } else {
                // treat & just as ordinary character
                sb.append('&');
                pos = amp + 1;
            }
        }
        // Every conversion shortens the text, so equal length means nothing changed.
        return (sb.length() == length) ? text : sb.toString();
    }

    // -------------------------- STATIC METHODS --------------------------

    /**
     * Finds the first occurrence of needle in haystack at or after from, ignoring case.
     *
     * @return index of the match, or -1 if there is none.
     */
    private static int indexOfIgnoreCase(String haystack, String needle, int from) {
        final int last = haystack.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (haystack.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Removes all text between applet, style and script start tags and their matching end
     * tags, the tags included. Tag names are matched without regard to case. When a start
     * tag has no matching end tag, everything from the start tag onwards is removed.
     *
     * @param s
     *            HTML string to strip tag pairs out of.
     *
     * @return string with tag pairs stripped out.
     */
    private static String stripHTMLTagPairs(String s) {
        for (final String tag : pairedTags) {
            final String beginTag = "<" + tag;
            final String endTag = "</" + tag + ">";
            int begin = 0;
            while ((begin = indexOfIgnoreCase(s, beginTag, begin)) >= 0) {
                final int end = indexOfIgnoreCase(s, endTag,
                        begin + beginTag.length());
                if (end >= 0) {
                    // chop out start tag, content and end tag
                    s = s.substring(0, begin)
                            + s.substring(end + endTag.length());
                } else {
                    // no matching end tag, chop off entire end
                    s = s.substring(0, begin);
                }
            }
        }
        return s;
    }

    /**
     * Removes tags from HTML leaving just the raw text. Leaves entities as is, e.g. does not
     * convert &amp;amp; back to &amp;. Also removes comments, provided they contain no &gt;.
     * Presumes well formed HTML with all &lt;...&gt; balanced. Also removes text between
     * applet, style and script tag pairs.
     *
     * @param html
     *            input HTML; null returns null.
     *
     * @return raw text, with whitespaces collapsed to a single space, trimmed.
     */
    public static String stripHTMLTags(String html) {
        if (html == null) {
            return null;
        }
        return stripIndividualTags(stripHTMLTagPairs(html));
    }

    /**
     * Removes tags from HTML or XML leaving just the raw text. Leaves entities as is.
     * Presumes well formed markup. Spacing tags leave a single space behind; all other tags
     * are removed leaving nothing behind.
     *
     * @param html
     *            input HTML or XML, not null.
     *
     * @return raw text, with whitespaces collapsed to a single space, trimmed.
     */
    private static String stripIndividualTags(String html) {
        html = html.trim();
        if (html.indexOf('<') < 0) {
            return html;
        }
        // condition String so that some tags will always turn into space.
        html = preStripIndividualTags(html);
        final int numChars = html.length();
        // will only shrink.
        final StringBuilder result = new StringBuilder(numChars);
        // are we inside a tag, i.e. between < and > ?
        boolean inside = false;
        // have we seen whitespace that has not been emitted yet?
        boolean pendingSpace = false;
        for (int i = 0; i < numChars; i++) {
            final char c = html.charAt(i);
            if (c == '<') {
                inside = true;
            } else if (c == '>') {
                inside = false;
            } else if (inside) {
                // everything inside a tag is dropped
                continue;
            } else if (c <= ' ' || c == 127 || c == UNICODE_NBSP_160_0x0a) {
                // blanks, control chars, DEL and nbsp all collapse into one space
                pendingSpace = true;
            } else {
                if (pendingSpace) {
                    result.append(' ');
                    pendingSpace = false;
                }
                result.append(c);
            }
        }
        return result.toString().trim();
    }

    /**
     * Converts XML to text, converting entities such as &amp;quot; back to " and &amp;lt;
     * back to &lt;. Ordinary text passes unchanged. Also converts decimal and hex entities.
     * nbsp is translated to an ordinary space.
     *
     * @param text
     *            raw XML text to be processed; null returns null.
     *
     * @return translated text.
     */
    public static String stripXMLEntities(String text) {
        return stripHTMLEntities(text, ' ');
    }

    /**
     * Removes tags from XML leaving just the raw text. Leaves entities as is, e.g. does not
     * convert &amp;amp; back to &amp;. Also removes comments, provided they contain no &gt;.
     * Presumes well formed XML with all &lt;...&gt; balanced.
     *
     * @param xml
     *            input XML; null returns null.
     *
     * @return raw text, with whitespaces collapsed to a single space, trimmed.
     */
    public static String stripXMLTags(String xml) {
        if (xml == null) {
            return null;
        }
        return stripIndividualTags(xml);
    }
}