lib/srx/segment.srx in srx-languagetool-0.8.0 vs lib/srx/segment.srx in srx-languagetool-0.9.0

- old
+ new

@@ -2,12 +2,14 @@ <srx xmlns="http://www.lisa.org/srx20" xmlns:okpsrx="http://okapi.sf.net/srx-extensions" version="2.0"> <header segmentsubflows="yes" cascade="yes"> <formathandle type="start" include="no"></formathandle> <formathandle type="end" include="yes"></formathandle> <formathandle type="isolated" include="no"></formathandle> -<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes" useIcu4JBreakRules="no" treatIsolatedCodesAsWhitespace="no"></okpsrx:options> -<okpsrx:sample language="sr" useMappedRules="yes">Поштовани господине одн. госпођо. Видео сам </okpsrx:sample> +<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes"></okpsrx:options> +<okpsrx:sample language="pl" useMappedRules="yes">Als een hoogleraar met emeritaat ('pensioen') is, mag hij de functieaanduiding prof. blijven gebruiken, maar hij heeft tevens het recht gekregen om het bijvoeglijk naamwoord emeritus (Latijn voor 'uitgediend') aan zijn functietitel toe te voegen: em. prof. dr. +Tussen de twee wereldoorlogen vestigde prof. ir. Messerschmitt zich in Augsburg waar hij met behulp van een oudere, rijke vriendin (met wie hij later trouwde) zijn eerste vliegtuigen bouwde, het waren passagierstoestellen. +250 p. n.e.</okpsrx:sample> <okpsrx:rangeRule></okpsrx:rangeRule> </header> <body> <languagerules> <languagerule languagerulename="Greek"> @@ -1082,10 +1084,15 @@ <!--This is a text ("with a small remark!") that continues.--> <rule break="no"> <beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u0002]*\p{Pe}\s</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> +<!--p. n.e. 
(incorrect line break)--> +<rule break="no"> +<beforebreak>p\.\s</beforebreak> +<afterbreak>n\.\s?e\.</afterbreak> +</rule> <rule break="yes"> <beforebreak>[\.!?…]['"\p{Pe}\u00BB\u2019\u201D\u203A\u0002¹²³]*\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> @@ -1104,108 +1111,108 @@ <languagerule languagerulename="English"> <rule break="no"> <beforebreak>[\u00A0\s]</beforebreak> <afterbreak>\n</afterbreak> </rule> -<rule break="no"><!-- Hello (Hi! ) my name is Chris --> +<rule break="no"> <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak> <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak> </rule> <rule break="no"> <beforebreak>Yahoo![\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> -<rule break="no"><!-- U.S.A (no dot at end) --> +<rule break="no"> <beforebreak>[A-Z]\.[A-Z]\.</beforebreak> <afterbreak>[A-Z]\b</afterbreak> </rule> -<rule break="no"><!-- A.I (no dot at end) --> +<rule break="no"> <beforebreak>\bA\.</beforebreak> <afterbreak>I\b</afterbreak> </rule> -<rule break="no"><!-- S.I (no dot at end) --> +<rule break="no"> <beforebreak>\bS\.</beforebreak> <afterbreak>I\b</afterbreak> </rule> -<rule break="no"><!-- L.A (no dot at end) --> +<rule break="no"> <beforebreak>\bL\.</beforebreak> <afterbreak>A\b</afterbreak> </rule> -<rule break="no"><!-- U.S (no dot at end) --> +<rule break="no"> <beforebreak>\bU\.</beforebreak> <afterbreak>[SK]\b</afterbreak> </rule> -<rule break="no"><!-- I.S (no dot at end) --> +<rule break="no"> <beforebreak>\bI\.</beforebreak> <afterbreak>S\b</afterbreak> </rule> -<rule break="no"><!-- M.Z (no dot at end) --> +<rule break="no"> <beforebreak>\bM\.</beforebreak> <afterbreak>Z\b</afterbreak> </rule> -<rule break="no"><!-- URLs without "www."--> +<rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Subdomains without "www." (e.g. 
foo.MyDomain.com)--> +<rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- No. 5 --> +<rule break="no"> <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak> <afterbreak>\p{N}</afterbreak> </rule> -<rule break="no"><!-- Ph.D. --> +<rule break="no"> <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak> <afterbreak>D\.?</afterbreak> </rule> -<rule break="no"><!-- min. --> +<rule break="no"> <beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vv]ol|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]|I</afterbreak> </rule> -<rule break="no"><!-- hr. --> +<rule break="no"> <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]|I</afterbreak> </rule> -<rule break="no"><!-- Fig. 8 --> +<rule break="no"> <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak> <afterbreak>\p{N}|[IXV]+</afterbreak> </rule> -<rule break="no"><!-- Fig. (8) --> +<rule break="no"> <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak> <afterbreak>\(\p{N}\)</afterbreak> </rule> -<rule break="no"><!-- I'm (...) great! --> +<rule break="no"> <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak> <afterbreak>[^\p{P}]</afterbreak> </rule> -<rule break="no"><!-- I will work with someone (Chris or ...?). --> +<rule break="no"> <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak> <afterbreak>[^\p{P}]</afterbreak> </rule> -<rule break="no"><!-- e.g. --> +<rule break="no"> <beforebreak>\be\.g\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- vs. 
--> +<rule break="no"> <beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- pp. --> +<rule break="no"> <beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- esp. --> +<rule break="no"> <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <!--"Etc." can end the sentence, so we check for the uppercase letter after it.--> -<rule break="no"><!-- Etc. --> +<rule break="no"> <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]</afterbreak> </rule> -<rule break="no"><!-- BTW (by the way) --> +<rule break="no"> <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\bJan\.[\s\u00A0]</beforebreak> @@ -1249,43 +1256,43 @@ </rule> <rule break="no"> <beforebreak>(?i)FRITZ!</beforebreak> <afterbreak>(?i)Box</afterbreak> </rule> -<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 --> +<rule break="no"> <beforebreak>ID.</beforebreak> <afterbreak>3|4|Buzz|Crozz</afterbreak> </rule> -<rule break="no"><!-- Ph.D. (see rule PH_D) --> +<rule break="no"> <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) --> +<rule break="no"> <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) --> +<rule break="no"> <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- B.Eng. (Bachelor of Engineering) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Eng\.?</afterbreak> </rule> -<rule break="no"><!-- LL.B. 
(Bachelor of Laws) --> +<rule break="no"> <beforebreak>\bLL\.[\s\u00A0]?</beforebreak> <afterbreak>[BM]\.?</afterbreak> </rule> -<rule break="no"><!-- B.Sc. (Bachelor of Science) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Sc\.?</afterbreak> </rule> -<rule break="no"><!-- B.Comp. (Bachelor of Computing) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Comp?\.?</afterbreak> </rule> -<rule break="no"><!-- B.Arch. (Bachelor of Architecture) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Arch\.?</afterbreak> </rule> <rule break="no"> <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak> @@ -1373,11 +1380,11 @@ </rule> <rule break="no"> <beforebreak>\b\p{L}\.</beforebreak> <afterbreak>\p{L}\.</afterbreak> </rule> -<rule break="no"><!-- Jones v. Smith --> +<rule break="no"> <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak> <afterbreak>\p{Lu}\p{L}+</afterbreak> </rule> <rule break="yes"> <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak> @@ -1386,11 +1393,11 @@ <rule break="yes"> <beforebreak>\bOK\.[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}+</afterbreak> </rule> <rule break="no"> -<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations--> +<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak> <afterbreak>[\p{N}\p{Ll}]</afterbreak> </rule> <rule break="no"> <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak> <afterbreak>[^\p{Lu}]</afterbreak> @@ -1417,12 +1424,12 @@ </rule> <rule break="no"> <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- i.e. --> -<beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." 
is never at end of sentence --> +<rule break="no"> +<beforebreak>i\.e\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak> <afterbreak></afterbreak> @@ -1530,113 +1537,124 @@ <afterbreak>\p{Lu}\p{Ll}</afterbreak> </rule> </languagerule> <languagerule languagerulename="Dutch"> <rule break="no"> -<!-- sp.a --> <beforebreak>\b(sp|SP)</beforebreak> <afterbreak>\.[aA]\b</afterbreak> </rule> <rule break="no"> -<!-- .Net --> <beforebreak>\s[.]</beforebreak> <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak> </rule> -<rule break="no"><!-- quoted sentence in sentence --> +<rule break="no"> <beforebreak>[.?!][’'"]</beforebreak> <afterbreak> [a-z]</afterbreak> </rule> -<rule break="no"><!-- URLs without "www."--> +<rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Subdomains without "www." (e.g. 
foo.MyDomain.com)--> +<rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Abbreviated books of the Bible and biblical apocrypha--> +<rule break="no"> +<beforebreak>\b(blz|pag|fig)\.\s</beforebreak> +<afterbreak>[0-9]</afterbreak> +</rule> +<!--Abbrevs that can happen in sentence and at end--> +<rule break="no"> +<beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak> +<afterbreak>\p{Ll}</afterbreak> +</rule> +<rule break="no"> <beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros|Stb)\.\s</beforebreak> -<afterbreak></afterbreak> +<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|(Prof\.)?[Dd]r|Em|Fa|Kon|Stb)\.\s</beforebreak> +<afterbreak>\p{Lu}</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak> +<beforebreak>\b([Mm]ej|[Mm]evr|[Mm]rs|[Mm]s|[Mm]gr|[Mm]w|Ndl|Ned|Nl|No|Prof|[Ss]ecr|Chr|Jac|[Ww]ed)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(Sr|St|Ued|Vz|aanh|aanw|aardew|aardr)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(abs|abstr|adj|adm|afb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|bv|afm|evt|exp)\.\s</beforebreak> +<beforebreak>\b(abs|abstr|adj|adm|[Aa]fb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|afm|evt|exp|vs)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule 
break="no"> -<beforebreak>\b(al|ald|alg|amb|ambt|anat|antrop|apoth)\.\s</beforebreak> +<beforebreak>\b(ald|alg|amb|anat|antrop|apoth)\.\s</beforebreak> <afterbreak></afterbreak> </rule> +<rule break="yes"> +<beforebreak>\seen\sprof\.\s</beforebreak> +<afterbreak>\p{Lu}</afterbreak> +</rule> <rule break="no"> <beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(arch|archeol|art|bc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak> +<beforebreak>\b(arch|archeol|bc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(bijz|blz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak> +<beforebreak>\b(bijz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(conf|ct|dal|derg|dhr|dir|div|dra|drs|ds)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(ed|em|enz|etc|ev|[Ee]xcl|fa|fam|fig|fin|fl|fr.)\.\s</beforebreak> +<beforebreak>\b([Ee]d|em|ev|[Ee]xcl|[Ff]a|[Ff]am|[fF]ig|fin|fl|fr)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|inz|ir|jhr|jkvr)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(jl|jr|kr|kt|lab|lic|ll|lt|lw|max|mevr|mi|[Mm]in|mld)\.\s</beforebreak> +<beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(mln|mr|mw|nl|no|nr|nrs|ob|obl|ong|onov|o.a)\.\s</beforebreak> +<beforebreak>\b(mln|[Mm]r|[Mm]w|nl|no|nr|nrs|ob|obl|ong|onov)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(opm|org|ov|pag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak> +<beforebreak>\b(opm|org|ov|[Pp]ag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule 
break="no"> -<beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|tk)\.\s</beforebreak> +<beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|prof|tk)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b([A-Z]|Adr|Chr|Fr|Fred|IJ|Jac|Joh|Ph|St|Th|Tj|v|v\.(\s)?d)\.(\s)?</beforebreak> -<afterbreak>[A-Z]</afterbreak> +<afterbreak>\p{Lu}</afterbreak> </rule> <rule break="no"> <beforebreak>\b[vn]\.\s</beforebreak> <afterbreak>Chr</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(uitsl|ver|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak> -<afterbreak></afterbreak> +<beforebreak>\b(uitsl|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak> +<afterbreak>\p{Ll}</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(mm|cm|km|mg|kg|h|kW|mW)\.\s</beforebreak> +<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak> <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak> </rule> <rule break="yes"> -<beforebreak>\b(mm|cm|km|ml|kg|kW|h|mg)\.\s</beforebreak> +<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>[\[\(]*…[\]\)]* </beforebreak> <afterbreak>\p{Ll}</afterbreak> @@ -1684,31 +1702,19 @@ <rule break="no"> <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b\p{Lu}\p{Ll}\.\s?</beforebreak> -<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> -</rule> -<rule break="no"> <beforebreak>\.\p{Lu}\p{Ll}\.\s?</beforebreak> <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> </rule> <!--a number with a dot before a lowercase char--> <rule break="no"> <beforebreak>\b\d+\.\s</beforebreak> <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak> </rule> -<rule break="yes"> -<beforebreak>\been\sprof\.\s</beforebreak> -<afterbreak>[^\p{Ll}]</afterbreak> -</rule> <rule break="no"> -<beforebreak>\bprof\.\s</beforebreak> -<afterbreak></afterbreak> -</rule> -<rule break="no"> 
<beforebreak>[.!?…][’'"]\s</beforebreak> <afterbreak>[a-z]</afterbreak> </rule> <rule break="no"> <beforebreak>[.][.]\s</beforebreak> @@ -1721,15 +1727,15 @@ <rule break="no"> <beforebreak>Warner Bros\.</beforebreak> <afterbreak>[a-z]</afterbreak> </rule> <rule break="yes"> -<beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak> +<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> -<beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak> +<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak> <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> </rule> <rule break="yes"> <beforebreak>\s\p{L}[\.!?…]\s</beforebreak> <afterbreak>\p{Lu}\p{Ll}</afterbreak> @@ -1766,35 +1772,33 @@ <rule break="yes"> <beforebreak>[?!.]\s</beforebreak> <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak> </rule> <rule break="no"> -<!-- "E. coli etc. --> <beforebreak>"[A-Z][.]\s</beforebreak> <afterbreak>[a-z]</afterbreak> </rule> <rule break="no"> -<!-- Cornelisz. --> <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak> <afterbreak>[a-z]</afterbreak> </rule> <rule break="no"> -<!-- De n. XIV/vagus (nervus) --> <beforebreak>De n[.]\s</beforebreak> <afterbreak>[a-z]|[XIV]</afterbreak> </rule> <rule break="no"> -<!-- MOL.E --> <beforebreak>[A-Z]{2,5}[.]</beforebreak> <afterbreak>[A-Z]</afterbreak> </rule> <rule break="no"> -<!-- ..." 
betekent --> <beforebreak>\.\.</beforebreak> <afterbreak>" [a-z]</afterbreak> </rule> -<!-- ##### end of Dutch #### --> +<rule break="no"> +<beforebreak>\sBTW\.</beforebreak> +<afterbreak>\p{Ll}</afterbreak> +</rule> </languagerule> <languagerule languagerulename="Slovak"> <rule break="no"> <beforebreak>\b(Bc|Mgr|RNDr|PharmDr|PhDr|JUDr|PaedDr|ThDr|Ing|MUDr|MDDr|MVDr|Dr|ThLic|PhD|ArtD|ThDr|Dr|DrSc|CSs|prof)\.\s</beforebreak> <afterbreak></afterbreak> @@ -4368,11 +4372,11 @@ <beforebreak>\b(н|наб|нач|неуд|нем|ном|о|обл|обр|общ|ок|ост|отл|п|пер|Пер|перераб|пл|пос|пр|пром|просп|Просп|проф|Проф)\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(р|ред|Рис|рус|с|сб|св|См|см|сов|соч|соц|спец|ср|ст|стр|т|тел|Тел|тех|тов|тт|туп)\.\s</beforebreak> -<afterbreak></afterbreak> +<afterbreak>\p{Ll}</afterbreak> </rule> <rule break="no"> <beforebreak>\b(руб|Руб|тыс|Тыс|трлн)\.\s</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> @@ -4652,11 +4656,11 @@ <rule break="no"> <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak> <afterbreak>[XIV\d]+\b</afterbreak> </rule> <rule break="no"> -<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> +<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak> </rule> <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. 
--> <rule break="no"> <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> @@ -4719,16 +4723,14 @@ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak> <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak> </rule> </languagerule> <languagerule languagerulename="Spanish"> - <rule break="no"> <beforebreak>¿[^?]+:[\s\u00A0]</beforebreak> <afterbreak>.</afterbreak> </rule> - <rule break="no"> <beforebreak>Yahoo![\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <rule break="no"> @@ -4740,11 +4742,11 @@ <afterbreak></afterbreak> </rule> <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] --> <rule break="no"> <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <!-- Ellipsis: ... lowercase --> <rule break="no"> <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> @@ -4770,43 +4772,41 @@ <rule break="no"> <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak> </rule> <rule break="no"> -<!-- URLs without "www."--> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> <rule break="no"> -<!-- Subdomains without "www." (e.g. 
foo.MyDomain.com)--> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> <!-- Abbreviations that cannot finish sentences--> <rule break="no"> <beforebreak>\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <rule 
break="no"> <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <!-- Abbreviations that can finish sentences --> <rule break="no"> <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VUuv]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak> @@ -4835,11 +4835,11 @@ <afterbreak>\p{Ll}</afterbreak> </rule> <!-- Composed abbrev. --> <rule break="no"> <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> -<afterbreak/> +<afterbreak></afterbreak> </rule> <!-- Units --> <rule break="no"> <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> @@ -4857,15 +4857,15 @@ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak> <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak> </rule> </languagerule> <languagerule languagerulename="German"> -<rule break="no"><!-- URLs without "www."--> +<rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)--> +<rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> <!--support simple lists in markdown style--> <rule break="yes"> @@ -4880,31 +4880,31 @@ <!-- Don't split at e.g. "d. h." --> <rule break="no"> <beforebreak>[^-\p{L}'’/°]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- special case: "Das 1. 
Internationale Filmfestival findet nächste Woche statt." --> +<rule break="no"> <beforebreak>([Dd](as|er|ie|iese[rsmn]?|en|em)|[kmsd]?ein(e[rsnm]?)?|am|fürs|ins|zum|im|am|zur) \d+\.[\u00A0\s]+</beforebreak> <afterbreak>[A-ZÄÖÜ].*</afterbreak> </rule> <rule break="no"> -<beforebreak>Ust.</beforebreak><!-- needed for German rule UST_ID --> +<beforebreak>Ust.</beforebreak> <afterbreak>Id</afterbreak> </rule> <rule break="no"> -<beforebreak>Prof.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN --> +<beforebreak>Prof.</beforebreak> <afterbreak>Dr</afterbreak> </rule> <rule break="no"> -<beforebreak>Dr.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN --> +<beforebreak>Dr.</beforebreak> <afterbreak>iur|med|oec|phil|rer|theol</afterbreak> </rule> <rule break="no"> <beforebreak>(?i)FRITZ!</beforebreak> <afterbreak>(?i)Box</afterbreak> </rule> -<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 --> +<rule break="no"> <beforebreak>ID.</beforebreak> <afterbreak>3|4|Buzz|Crozz</afterbreak> </rule> <rule break="no"> <beforebreak>[1-3]\.[\u00A0\s]{1,2}</beforebreak> @@ -4970,11 +4970,11 @@ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak> <afterbreak></afterbreak> </rule> <!-- don't split at cases like "Friedrich II. wird auch..." --> <rule break="no"> -<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!-- don't split at cases like "im 13. oder 14. 
Jahrhundert" --> <rule break="no"> <beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak> @@ -5010,15 +5010,15 @@ <beforebreak>\b(spp?)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <!-- German abbreviations --> <rule break="no"> -<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|exkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> @@ -5026,15 +5026,15 @@ <rule break="no"> <beforebreak>\b[BM]\.[\u00A0\s]Sc\.[\u00A0\s]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak> 
+<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> @@ -5046,11 +5046,15 @@ <rule break="no"> <beforebreak>([A-ZÖÄÜ][a-zöäüß]+str)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>\d+\.\d+\.[\u00A0\s]</beforebreak> +<afterbreak>[\-–][\u00A0\s]\d+</afterbreak> +</rule> +<rule break="no"> 
+<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak> <afterbreak></afterbreak> </rule> <!-- Break rules --> <rule break="yes"> <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak> @@ -5157,31 +5161,31 @@ <!-- English abbreviations - but these work globally for all languages --> <rule break="no"> <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sep|Sept|Oct|Okt|Nov|Dec|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co)\.\s</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- Ph.D. --> +<rule break="no"> <beforebreak>\bP[Hh]\.\s?</beforebreak> <afterbreak>D\.?</afterbreak> </rule> -<rule break="no"><!-- B.Eng. (Bachelor of Engineering) --> +<rule break="no"> <beforebreak>\b[BM]\.\s?</beforebreak> <afterbreak>Eng\.?</afterbreak> </rule> -<rule break="no"><!-- LL.B. (Bachelor of Laws) --> +<rule break="no"> <beforebreak>\bLL\.\s?</beforebreak> <afterbreak>[BM]\.?</afterbreak> </rule> -<rule break="no"><!-- B.Sc. (Bachelor of Science) --> +<rule break="no"> <beforebreak>\b[BM]\.\s?</beforebreak> <afterbreak>Sc\.?</afterbreak> </rule> -<rule break="no"><!-- B.Comp. (Bachelor of Computing) --> +<rule break="no"> <beforebreak>\b[BM]\.\s?</beforebreak> <afterbreak>Comp?\.?</afterbreak> </rule> -<rule break="no"><!-- B.Arch. (Bachelor of Architecture) --> +<rule break="no"> <beforebreak>\b[BM]\.\s?</beforebreak> <afterbreak>Arch\.?</afterbreak> </rule> <!-- Danish abbreviations - Word Boundary \b abbreviation dot \. 
--> <rule break="no"> @@ -5307,20 +5311,19 @@ </rule> <rule break="yes"> <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- URLs without "www."--> +<rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)--> +<rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> <rule break="no"> -<!-- gaffa.org --> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak> </rule> <!-- French abbreviations --> <rule break="no"> @@ -5361,19 +5364,19 @@ </rule> <rule break="no"> <beforebreak>\b\p{L}\.</beforebreak> <afterbreak>\p{L}\.</afterbreak> </rule> -<rule break="no"><!-- Je suis (...) Chris. --> +<rule break="no"> <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak> <afterbreak>[^\p{P}]</afterbreak> </rule> -<rule break="no"><!-- Je suis (...?) Chris. --> +<rule break="no"> <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak> <afterbreak>[^\p{P}]</afterbreak> </rule> -<rule break="no"><!-- Jones v. Smith --> +<rule break="no"> <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak> <afterbreak>\p{Lu}\p{L}+</afterbreak> </rule> <rule break="yes"> <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak> @@ -5409,88 +5412,88 @@ </rule> <rule break="no"> <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- i.e. --> -<beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." 
is never at end of sentence --> +<rule break="no"> +<beforebreak>i\.e\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- U.S.A (no dot at end) --> +<rule break="no"> <beforebreak>[A-Z]\.[A-Z]\.</beforebreak> <afterbreak>[A-Z]\b</afterbreak> </rule> -<rule break="no"><!-- L.A (no dot at end) --> +<rule break="no"> <beforebreak>\bL\.</beforebreak> <afterbreak>A\b</afterbreak> </rule> -<rule break="no"><!-- U.S (no dot at end) --> +<rule break="no"> <beforebreak>\bU\.</beforebreak> <afterbreak>[SK]\b</afterbreak> </rule> -<rule break="no"><!-- No. 5 --> +<rule break="no"> <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak> <afterbreak>\p{N}</afterbreak> </rule> -<rule break="no"><!-- Ph.D. --> +<rule break="no"> <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak> <afterbreak>D\.?</afterbreak> </rule> -<rule break="no"><!-- e.g. --> +<rule break="no"> <beforebreak>\be\.g\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- vs. --> +<rule break="no"> <beforebreak>\bvs\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <!--"Etc." can end the sentence, so we check for the uppercase letter after it.--> -<rule break="no"><!-- Etc. --> +<rule break="no"> <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]</afterbreak> </rule> -<rule break="no"><!-- BTW (by the way) --> +<rule break="no"> <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>(?i)FRITZ!</beforebreak> <afterbreak>(?i)Box</afterbreak> </rule> -<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 --> +<rule break="no"> <beforebreak>ID.</beforebreak> <afterbreak>3|4|Buzz|Crozz</afterbreak> </rule> -<rule break="no"><!-- Ph.D. (see rule PH_D) --> +<rule break="no"> <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- "I have a B. Eng. 
degree" (see rule BACHELOR_ABBR) --> +<rule break="no"> <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) --> +<rule break="no"> <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> -<rule break="no"><!-- B.Eng. (Bachelor of Engineering) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Eng\.?</afterbreak> </rule> -<rule break="no"><!-- LL.B. (Bachelor of Laws) --> +<rule break="no"> <beforebreak>\bLL\.[\s\u00A0]?</beforebreak> <afterbreak>[BM]\.?</afterbreak> </rule> -<rule break="no"><!-- B.Sc. (Bachelor of Science) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Sc\.?</afterbreak> </rule> -<rule break="no"><!-- B.Comp. (Bachelor of Computing) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Comp?\.?</afterbreak> </rule> -<rule break="no"><!-- B.Arch. (Bachelor of Architecture) --> +<rule break="no"> <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak> <afterbreak>Arch\.?</afterbreak> </rule> <rule break="no"> <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak> @@ -5532,11 +5535,10 @@ <rule break="yes"> <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak> <afterbreak>\p{Lu}\p{Ll}</afterbreak> </rule> </languagerule> - <languagerule languagerulename="Ukrainian"> <!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. 
--> <rule break="no"> <beforebreak>(^|[\h])(\.\.\.|…)</beforebreak> <afterbreak>\p{Lu}</afterbreak> @@ -5545,12 +5547,12 @@ <rule break="no"> <beforebreak>\b(в|у|на|за|з|із|зі|зо)(\.\.\.|…)[\h\v]*</beforebreak> <afterbreak>\p{Lu}</afterbreak> </rule> <rule break="no"> -<beforebreak>[.!?…][\h]+</beforebreak> -<afterbreak>[\h]*([«"„“(]|[&#x2010;-&#x2015;-][\h])\p{Ll}</afterbreak> +<beforebreak>[.!?…][»“]?[\h]+</beforebreak> +<afterbreak>[\h]*([«"„“(]|[‐-―-][\h])\p{Ll}</afterbreak> </rule> <rule break="yes"> <beforebreak>\v[\h]*</beforebreak> <afterbreak>(\.\.\.|…)</afterbreak> </rule> @@ -5560,11 +5562,11 @@ <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak> </rule> <!-- various punctuation between lowercase letters --> <rule break="no"> <beforebreak>\b\p{Ll}+[.!?][\h\v]*</beforebreak> -<afterbreak>\h*(([\(«]|[\[&#x2010;-&#x2015;-][\h\v]*)?\p{Ll})</afterbreak> +<afterbreak>\h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll})</afterbreak> </rule> <rule break="no"> <beforebreak>([\[\(]*[\]\)]*|\.\.\.|…)[\h\v]+</beforebreak> <afterbreak>[\h\v]*\p{Ll}</afterbreak> </rule> @@ -5581,11 +5583,10 @@ <!-- capital char abbreviations А. Б. В. --> <rule break="no"> <beforebreak>(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> -<!-- І. В. Коваль, Т. 2, C. 202 --> <!-- Іван Ч. (1914 р. н.) --> <rule break="no"> <beforebreak>[\h\v][А-ЯІЇЄҐ]\.[\h\v]*</beforebreak> <afterbreak>[А-ЯІЇЄҐ]\.|[0-9]|[\h\v]*,|[\h\v]*[:«]|\([0-9]{4}</afterbreak> </rule> @@ -5604,20 +5605,20 @@ від 26 квітня 2017 р. №35 а до лютого 2020 р. — затвердити --> <rule break="no"> <beforebreak>\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+</beforebreak> -<afterbreak>[\h\v]*[№0-9&#x2010;-&#x2015;-]</afterbreak> +<afterbreak>[\h\v]*[№0-9‐-―-]</afterbreak> </rule> <!-- річка - р. Дніпро --> <rule break="no"> <beforebreak>(?&lt;!\d[\h]*)\bр\.[\h\v]*</beforebreak> <afterbreak>[\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h]</afterbreak> </rule> <!-- У травні 1949 р. 
Грушківський район --> <rule break="no"> -<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[&#x2010;-&#x2015;-])*\d{4}[\h]*р\.[\h\v]*</beforebreak> +<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[‐-―-])*\d{4}[\h]*р\.[\h\v]*</beforebreak> <afterbreak>[\v\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h\v]</afterbreak> </rule> <!-- 15 вересня 1995 р. Україною було підписно --> <rule break="no"> <beforebreak>\d{1,2}[\h]+[а-яіїєґ]+[\h]\d{4}[\h]*р\.[\h\v]*</beforebreak> @@ -5633,26 +5634,31 @@ <beforebreak>\b(тис|млн|млрд|грн)\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*(\d|[КМ]Вт)</afterbreak> </rule> <!-- усталені скорочення, що не збігаються з нескороченими словами --> <rule break="no"> -<!-- unfortunately \b ignores \u0301 --> -<beforebreak>\b(укр|рос|англ|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк))\.[\h\v]*</beforebreak> +<beforebreak>\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<!-- unfortunately \b ignores \u0301 --> -<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak> +<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<!-- unfortunately \b ignores \u0301 --> <beforebreak>\b[сС]т\.[\h\v]</beforebreak> <afterbreak>[\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v])</afterbreak> </rule> +<!-- нар. 1945 р. | (1966 р. нар.) | 1975 — нар. Осипчук --> <rule break="no"> -<!-- no break only for дол. 
США --> +<beforebreak>([0-9]|[-–—])[\h\v]+нар\.[\h\v]*</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>\bнар\.[\h\v]*</beforebreak> +<afterbreak>([0-9]|бл\.|арт\.)</afterbreak> +</rule> +<rule break="no"> <beforebreak>\bдол\.[\h\v]*</beforebreak> <afterbreak>США</afterbreak> </rule> <!-- п. 10 від 11.10.1933, д. Василь --> <rule break="no"> @@ -5664,11 +5670,11 @@ <beforebreak>\b(див)\.[\h\v]</beforebreak> <afterbreak>[\h\v]*[^А-ЯІЇЄҐ]</afterbreak> </rule> <!-- Верховний орган, див. Африканський національний конгрес --> <rule break="no"> -<beforebreak>[,&#x2010;-&#x2015;-][\h\v]*(див)\.[\h\v]*</beforebreak> +<beforebreak>[,‐-―-][\h\v]*(див)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> <!-- скорочення в дужках: України (див. Зимові походи) --> @@ -5676,34 +5682,36 @@ <beforebreak>(\([^)]*|\[[^\]]*|,[\h\v]*)\b(див)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> <!-- abbreviation with proper noun: проф. Грицько, о. Лісове --> <rule break="no"> -<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak> +<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak> </rule> <rule break="no"> +<beforebreak>(?&lt;![іи]\s+)\bдр\.[\h\v]*</beforebreak> +<afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak> +</rule> +<rule break="no"> <beforebreak>\bМан\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak> </rule> <!-- смерть гр. Болтаровича, but not "9 гр." --> <rule break="no"> <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak> </rule> -<!-- арт. - артикул --> <!-- TODO: арт. - артист --> <rule break="no"> <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[0-9]</afterbreak> </rule> -<!-- ХІІ р., 3-6 арт. 
--> +<!-- ХІІ р., 3-6 арт., 2-3 тт. --> <rule break="no"> -<beforebreak>[0-9][\h\v]+арт\.[\h\v]*</beforebreak> +<beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> -<!-- місто, але принаймні з парою літер в назві бо є ще метри (м) --> <!-- але розбиваємо «всього 20 м. Почалося» --> <rule break="no"> <beforebreak>(?&lt;!\d[\h\v]*)\bм\.[\h\v]*</beforebreak> <afterbreak>[А-ЯІЇЄҐ][а-яіїєґ]</afterbreak> </rule> @@ -5723,14 +5731,12 @@ <afterbreak></afterbreak> </rule> <!-- статус правових держав. — Авт.). --> <rule break="no"> <beforebreak></beforebreak> -<afterbreak>[\h\v]*[&#x2010;-&#x2015;-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak> +<afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak> </rule> -<!-- force the break --> -<!-- часто зустрічається крапка+U+202F+пробіл, який srx чомусь не розбиває на речення --> <!-- але лишаємо ініціали: С.\u202F Шелухин --> <rule break="yes"> <beforebreak>(?&lt;!\h[А-ЯІЇЄҐ])[.!?…]{1,3}\u202F[\h\v]+</beforebreak> <afterbreak></afterbreak> </rule> @@ -5744,14 +5750,13 @@ <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> </rule> <!-- “Слон” (2008 р.) 
У минулому харків’янка --> <rule break="yes"> <beforebreak>[.!?…]['»"„“”)\]›]?[\h\v]+</beforebreak> -<afterbreak>([&#x2010;-&#x2015;-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak> +<afterbreak>([‐-―-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak> </rule> </languagerule> - <languagerule languagerulename="Belarusian"> <rule break="no"> <beforebreak>\b\d+\.\s</beforebreak> <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak> </rule> @@ -6014,15 +6019,15 @@ <beforebreak>\s\p{L}[\.!?…]\s</beforebreak> <afterbreak>\p{Lu}\p{Ll}</afterbreak> </rule> </languagerule> <languagerule languagerulename="Portuguese"> -<rule break="no"><!-- URLs without "www."--> +<rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> -<rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)--> +<rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> <!-- Abbreviations that cannot finish sentences--> <rule break="no"> @@ -6513,11 +6518,11 @@ <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak> <afterbreak></afterbreak> </rule> <!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."--> <rule break="no"> -<beforebreak>[\s ][IVX]+\s</beforebreak> +<beforebreak>[\s ][IVX]+\s</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!--Не раздвајај у случају као "од 13. до 14. 
века"--> <rule break="no"> <beforebreak>\d+\.\s</beforebreak> @@ -6652,86 +6657,86 @@ <beforebreak>[\.!?]</beforebreak> <afterbreak>\S*@</afterbreak> </rule> </languagerule> <languagerule languagerulename="Arabic"> - <rule break="no"> - <beforebreak>\bwww\.</beforebreak> - <afterbreak>\w</afterbreak> - </rule> - <rule break="no"> - <beforebreak>[\[\(]*…[\]\)]* </beforebreak> - <afterbreak>\p{Ll}</afterbreak> - </rule> - <rule break="no"> - <beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - <beforebreak>[\.!?؟…]+\p{Pe} </beforebreak> - <afterbreak>\p{Ll}</afterbreak> - </rule> - <rule break="no"> - <beforebreak>[«»"”']\s*</beforebreak> - <afterbreak>\s*\p{Ll}</afterbreak> - </rule> - <rule break="no"> - <beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - <beforebreak>\b\p{L}\.\s</beforebreak> - <afterbreak>\p{L}\.\s</afterbreak> - </rule> - <rule break="no"> - <beforebreak>\b\p{L}\.</beforebreak> - <afterbreak>\p{L}\.</afterbreak> - </rule> - <rule break="yes"> - <beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak> - <afterbreak>\p{N}+\)\s</afterbreak> - </rule> - <rule break="no"> - <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak> - <afterbreak>[\p{N}\p{Ll}]</afterbreak> - </rule> - <rule break="no"> - <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak> - <afterbreak>[^\p{Lu}]</afterbreak> - </rule> - <rule break="no"> - <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - <beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - <beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="no"> - 
<beforebreak>\(\p{Ll}+\.\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="yes"> - <beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak> - <afterbreak></afterbreak> - </rule> - <rule break="yes"> - <beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak> - <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> - </rule> - <rule break="yes"> - <beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak> - <afterbreak>\p{Lu}\p{Ll}</afterbreak> - </rule> - </languagerule> +<rule break="no"> +<beforebreak>\bwww\.</beforebreak> +<afterbreak>\w</afterbreak> +</rule> +<rule break="no"> +<beforebreak>[\[\(]*…[\]\)]* </beforebreak> +<afterbreak>\p{Ll}</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>[\.!?؟…]+\p{Pe} </beforebreak> +<afterbreak>\p{Ll}</afterbreak> +</rule> +<rule break="no"> +<beforebreak>[«»"”']\s*</beforebreak> +<afterbreak>\s*\p{Ll}</afterbreak> +</rule> +<rule break="no"> +<beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>\b\p{L}\.\s</beforebreak> +<afterbreak>\p{L}\.\s</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\b\p{L}\.</beforebreak> +<afterbreak>\p{L}\.</afterbreak> +</rule> +<rule break="yes"> +<beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak> +<afterbreak>\p{N}+\)\s</afterbreak> +</rule> +<rule break="no"> +<beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak> +<afterbreak>[\p{N}\p{Ll}]</afterbreak> +</rule> +<rule break="no"> +<beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak> +<afterbreak>[^\p{Lu}]</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak> 
+<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="no"> +<beforebreak>\(\p{Ll}+\.\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="yes"> +<beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak> +<afterbreak></afterbreak> +</rule> +<rule break="yes"> +<beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak> +<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak> +</rule> +<rule break="yes"> +<beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak> +<afterbreak>\p{Lu}\p{Ll}</afterbreak> +</rule> +</languagerule> </languagerules> <maprules> <languagemap languagepattern=".*" languagerulename="GeneralImportant"></languagemap> <languagemap languagepattern="[a-z]{2,3}_one" languagerulename="ByLineBreak"></languagemap> <languagemap languagepattern="[a-z]{2,3}_two" languagerulename="ByTwoLineBreaks"></languagemap>