lib/srx/segment.srx in srx-languagetool-0.12.0 vs lib/srx/segment.srx in srx-languagetool-0.13.0
- old
+ new
@@ -3,11 +3,11 @@
<header segmentsubflows="yes" cascade="yes">
<formathandle type="start" include="no"></formathandle>
<formathandle type="end" include="yes"></formathandle>
<formathandle type="isolated" include="no"></formathandle>
<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes" useIcu4JBreakRules="no" treatIsolatedCodesAsWhitespace="no"></okpsrx:options>
-<okpsrx:sample language="nl" useMappedRules="yes">Wat God buiten Christus is. 2.</okpsrx:sample>
+<okpsrx:sample language="nl" useMappedRules="yes"> ON! is een omroep.</okpsrx:sample>
<okpsrx:rangeRule></okpsrx:rangeRule>
</header>
<body>
<languagerules>
<languagerule languagerulename="Greek">
@@ -1162,11 +1162,11 @@
<rule break="no">
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
<afterbreak>D\.?</afterbreak>
</rule>
+<!-- No break after one of these abbreviations plus "." and a whitespace/NBSP when the next character is not an upper-case letter, or is "I". -->
<rule break="no">
-<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
+<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vv]ol|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Ee]xcl|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
<afterbreak>[^\p{Lu}]|I</afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
<afterbreak>[^\p{Lu}]|I</afterbreak>
@@ -1271,20 +1271,20 @@
<rule break="no">
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
+<beforebreak>\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
<afterbreak>Eng\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
-<afterbreak>[BM]\.?</afterbreak>
+<afterbreak>[BMD]\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
<afterbreak>Sc\.?</afterbreak>
</rule>
@@ -1327,14 +1327,18 @@
<rule break="no">
<beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
+<beforebreak>\b[cC]orp\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
+<beforebreak>\b[Rr]eg\.[\s\u00A0]</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<rule break="no">
<beforebreak>\bBros\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\bDist\.[\s\u00A0]</beforebreak>
@@ -1538,10 +1542,19 @@
<beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
<languagerule languagerulename="Dutch">
+<rule break="no">
+<beforebreak>\sart\.\s</beforebreak>
+<afterbreak>[IVX]+[ .]</afterbreak>
+</rule>
+<!--Do not break after a whitespace-delimited abbreviation made of 2 to 10 single lower-case letters, each followed by a period (e.g. a.b.c.)-->
+<rule break="no">
+<beforebreak>\s([a-z]\.){2,10}\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
<rule break="yes">
<beforebreak>[ ]is[.][ ]</beforebreak>
<afterbreak>[0-9]\.($|[ ])</afterbreak>
</rule>
<rule break="yes">
@@ -1580,11 +1593,11 @@
<rule break="no">
<beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<rule break="yes">
-<beforebreak>\s(la|do|del)\sMar\.\s</beforebreak>
+<beforebreak>\s(la|do|del?)\sMar\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
<afterbreak></afterbreak>
@@ -1639,10 +1652,18 @@
</rule>
<rule break="no">
<beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|[Ss]ec|inz|ir|jhr|jkvr)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
+<rule break="yes">
+<beforebreak>\s(tel|red|min)\.\s</beforebreak>
+<afterbreak>[A-Z]</afterbreak>
+</rule>
+<rule break="yes">
+<beforebreak>\.(nl|be|com)\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
<rule break="no">
<beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
@@ -1660,10 +1681,14 @@
<rule break="yes">
<beforebreak>\sgraden C\.\s</beforebreak>
<afterbreak>[A-Z]</afterbreak>
</rule>
<rule break="yes">
+<beforebreak>\svitamine [A-Z]\.\s</beforebreak>
+<afterbreak>[A-Z]</afterbreak>
+</rule>
+<rule break="yes">
<beforebreak>°C\.\s</beforebreak>
<afterbreak>[A-Z][a-z]</afterbreak>
</rule>
<rule break="yes">
<beforebreak>[A-Z]&amp;[A-Z]\.\s</beforebreak>
@@ -1712,10 +1737,38 @@
<rule break="no">
<beforebreak>\b\p{L}\.\s</beforebreak>
<afterbreak>\p{L}\.\s</afterbreak>
</rule>
<rule break="no">
+<beforebreak>\set al\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<!--Do not break after "pa." (an incorrect abbreviation used in place of "pag.") when a digit follows-->
+<rule break="no">
+<beforebreak>\spa\.\s</beforebreak>
+<afterbreak>[0-9]</afterbreak>
+</rule>
+<!--Do not break after "op." (abbreviation of opus) when a digit or "cit." follows-->
+<rule break="no">
+<beforebreak>\sop\.\s</beforebreak>
+<afterbreak>[0-9]|cit\.</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\soa\.\s</beforebreak>
+<afterbreak>[a-z]</afterbreak>
+</rule>
+<!--Do not break after "al." (abbreviation of alinea) when a digit follows-->
+<rule break="no">
+<beforebreak>\sal\.\s</beforebreak>
+<afterbreak>[0-9]</afterbreak>
+</rule>
+<!--Always break after one of these two-letter words or acronyms followed by a period, even when the next sentence does not start with a capital-->
+<rule break="yes">
+<beforebreak>\s(is|op|in|af|ik|ze|om|me|je|na|nu|al|ja|VS|EU|er|we|tv|he|ga|hè|hé|TV|as|ei|SP|pc|wc|PC|IS|NS|ok|AD|OK|at|OM|cd|VN|it|EK|In|pa|AZ|up|IT|FM|VI|ui|la|CD|CV|pr|ie|cv|WW|GB|Jo|Aa|UK|HD|oa|VU)\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<rule break="no">
<beforebreak>\b\p{L}\.</beforebreak>
<afterbreak>\p{L}\.</afterbreak>
</rule>
<rule break="yes">
<beforebreak>\sik\.\s</beforebreak>
@@ -1735,10 +1788,14 @@
</rule>
<rule break="no">
<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
+<rule break="yes">
+<beforebreak>\s(op)\sX\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
<rule break="no">
<beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
@@ -1769,14 +1826,22 @@
<rule break="yes">
<beforebreak>\s'[2-9][.]\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\s[A-Z].+!\s</beforebreak>
+<beforebreak>(^|\s)[A-Z].+!\s</beforebreak>
<afterbreak>[a-z]</afterbreak>
</rule>
<rule break="no">
+<beforebreak>\s[A-Z].+z\.\s</beforebreak>
+<afterbreak>[a-z]</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\sart\.\s</beforebreak>
+<afterbreak>[0-9]</afterbreak>
+</rule>
+<rule break="no">
<beforebreak>\b(jan|mrt|mar|jun|jul|aug|sept|okt|sep|spt|nov|dec|.*opp)\.\s</beforebreak>
<afterbreak>[a-z]</afterbreak>
</rule>
<rule break="no">
<beforebreak>Groen!\s</beforebreak>
@@ -5064,15 +5129,15 @@
<beforebreak>\b(spp?)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- German abbreviations -->
+<!-- No break after any of the listed abbreviations when its period is followed by one or two whitespace/NBSP characters. -->
<rule break="no">
-<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Dd]bzgl[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|\d+-tlg|tlg|ggfls|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2}</beforebreak>
+<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl|d|Übers|usw|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
+<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
@@ -5092,11 +5157,11 @@
<rule break="no">
<beforebreak>\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2}</beforebreak>
+<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|[Ss]td?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>([A-ZÖÄÜ][a-zöäüß]+str)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
@@ -5225,11 +5290,11 @@
<beforebreak>\b[BM]\.\s?</beforebreak>
<afterbreak>Eng\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\bLL\.\s?</beforebreak>
-<afterbreak>[BM]\.?</afterbreak>
+<afterbreak>[BMD]\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\b[BM]\.\s?</beforebreak>
<afterbreak>Sc\.?</afterbreak>
</rule>
@@ -5524,20 +5589,20 @@
<rule break="no">
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
+<beforebreak>\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
<afterbreak>Eng\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
-<afterbreak>[BM]\.?</afterbreak>
+<afterbreak>[BMD]\.?</afterbreak>
</rule>
<rule break="no">
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
<afterbreak>Sc\.?</afterbreak>
</rule>
@@ -5589,10 +5654,18 @@
<rule break="yes">
<beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
+<languagerule languagerulename="Crimean Tatar">
+<!-- No break after digits, a "." or ":", and exactly two more digits (e.g. 12.30 or 12:30) followed by a whitespace, NBSP or narrow NBSP. -->
+<rule break="no">
+<beforebreak>\b[0-9]+[.:][0-9]{2}[\s\u00A0\u202F]</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+</languagerule>
+
<languagerule languagerulename="Ukrainian">
<!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. -->
<rule break="no">
<beforebreak>(^|[\h])(\.\.\.|…)</beforebreak>
<afterbreak>\p{Lu}</afterbreak>
@@ -5789,12 +5862,16 @@
<afterbreak></afterbreak>
</rule>
<!-- статус правових держав. — Авт.). -->
<rule break="no">
<beforebreak></beforebreak>
-<afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
+<afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)\.[\h\v]*[\)\]]</afterbreak>
</rule>
+<rule break="no">
+<beforebreak>\b([Рр]ед)\.[\h\v]*</beforebreak>
+<afterbreak>[А-ЯІЇЄҐ]</afterbreak>
+</rule>
<!-- Цензор.НЕТ -->
<rule break="no">
<beforebreak>[а-яіїєґ]\.</beforebreak>
<afterbreak>НЕТ|Інфо|Info|City|Life|UA|Ру</afterbreak>
</rule>
@@ -6835,9 +6912,10 @@
<languagemap languagepattern="(SV|sv).*" languagerulename="Generic"></languagemap>
<languagemap languagepattern="(LT|lt).*" languagerulename="Generic"></languagemap>
<languagemap languagepattern="(ML|ml).*" languagerulename="Generic"></languagemap>
<languagemap languagepattern="(TL|tl).*" languagerulename="Generic"></languagemap>
<languagemap languagepattern="(AST|ast).*" languagerulename="Generic"></languagemap>
+<!-- Crimean Tatar: apply the language-specific "Crimean Tatar" rules first, then the shared "Generic" rules (the header sets cascade="yes"). -->
+<languagemap languagepattern="(CRH|crh).*" languagerulename="Crimean Tatar"></languagemap>
+<languagemap languagepattern="(CRH|crh).*" languagerulename="Generic"></languagemap>
<languagemap languagepattern=".*" languagerulename="Default"></languagemap>
</maprules>
</body>
</srx>