lib/srx/segment.srx in srx-languagetool-0.5.0 vs lib/srx/segment.srx in srx-languagetool-0.6.0
- old
+ new
@@ -1157,11 +1157,11 @@
<rule break="no"><!-- Ph.D.: do not break after "PH."/"Ph." (plus an optional space or no-break space) when "D", optionally followed by a period, comes next -->
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
<afterbreak>D\.?</afterbreak>
</rule>
<rule break="no"><!-- min. and similar abbreviations: no break after "<abbreviation>." plus a space or no-break space when the next character is not an uppercase letter, or is "I" -->
-<beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
+<beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vv]ol|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
<afterbreak>[^\p{Lu}]|I</afterbreak>
</rule>
<rule break="no"><!-- hr. -->
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
<afterbreak>[^\p{Lu}]|I</afterbreak>
@@ -1185,13 +1185,17 @@
<rule break="no"><!-- e.g.: never break after "e.g." followed by a space or no-break space; the empty afterbreak places no condition on the following text -->
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no"><!-- vs.: never break after "vs." or "Vs." followed by a space or no-break space, whatever text follows -->
-<beforebreak>\bvs\.[\s\u00A0]</beforebreak>
+<beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
+<rule break="no"><!-- pp.: never break after "pp." or "PP." followed by a space or no-break space, whatever text follows -->
+<beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
+<afterbreak></afterbreak>
+</rule>
<rule break="no"><!-- esp. (the pattern also accepts "exp."): never break after it when a space or no-break space follows -->
<beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
@@ -1247,11 +1251,11 @@
<beforebreak>(?i)FRITZ!</beforebreak>
<afterbreak>(?i)Box</afterbreak>
</rule>
<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 : keep "ID." together with a following "3", "4", "Buzz" or "Crozz"; the period is escaped so that only a literal "." matches -->
<beforebreak>ID\.</beforebreak>
-<afterbreak>3|Buzz|Crozz</afterbreak>
+<afterbreak>3|4|Buzz|Crozz</afterbreak>
</rule>
<rule break="no"><!-- Ph.D. (see rule PH_D): never break after a complete "Ph.D." followed by a space or no-break space; the first period and the inner space are optional and the "D" may be lower-case -->
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
@@ -1548,11 +1552,11 @@
<rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com): no break after "label." when another label, a period and one of the listed domain suffixes follow; the suffix must end at a period or a word boundary -->
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
</rule>
<rule break="no"><!-- never break after one of the listed abbreviations (including "Dr"/"dr" with an optional "Prof."/"prof." prefix) followed by a period and one whitespace character -->
-<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros)\.\s</beforebreak>
+<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros|Stb)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak>
<afterbreak></afterbreak>
@@ -1568,10 +1572,14 @@
<rule break="no"><!-- never break after one of the listed lower-case abbreviations followed by a period and one whitespace character -->
<beforebreak>\b(al|ald|alg|amb|ambt|anat|antrop|apoth)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no"><!-- never break after "alc.", "bro.", "opm." or "acc." followed by one whitespace character -->
+<beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<rule break="no"><!-- never break after one of the listed abbreviations ("arch." through "bijv.") followed by one whitespace character -->
<beforebreak>\b(arch|archeol|art|bc|betr|bez|bibl|bijl|bijv)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(bijz|blz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
@@ -4640,11 +4648,11 @@
<rule break="no"><!-- no break after "Cap.", "Art."/"Arts.", "pp." or "Vol." (first letter in either case, except "pp") plus a space or no-break space when a run of digits and/or the letters X, I, V follows -->
<beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
<afterbreak>[XIV\d]+\b</afterbreak>
</rule>
<rule break="no"><!-- no break after one of the listed abbreviations (or digits plus one of the listed suffixes) followed by a period, optional closing punctuation, dashes or quotes, and a space, when the following text starts, after optional opening punctuation, dashes or quotes, with a lower-case letter -->
-<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
+<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
</rule>
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
<rule break="no">
<beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
@@ -4711,10 +4719,14 @@
<languagerule languagerulename="Spanish">
<rule break="no"><!-- "Yahoo!" is a name: the exclamation mark does not end the sentence when a lower-case letter follows the space -->
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
+<rule break="no"><!-- no break after "40dB." plus a space or no-break space when a lower-case letter follows; the period is escaped so that only a literal "." matches -->
+<beforebreak>40dB\.[\s\u00A0]</beforebreak>
+<afterbreak>\p{Ll}</afterbreak>
+</rule>
<rule break="yes"><!-- force a break after a period that is followed by a bracketed number, e.g. ".[12]", and a space or no-break space -->
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
@@ -4872,11 +4884,11 @@
<beforebreak>(?i)FRITZ!</beforebreak>
<afterbreak>(?i)Box</afterbreak>
</rule>
<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 : keep "ID." together with a following "3", "4", "Buzz" or "Crozz"; the period is escaped so that only a literal "." matches -->
<beforebreak>ID\.</beforebreak>
-<afterbreak>3|Buzz|Crozz</afterbreak>
+<afterbreak>3|4|Buzz|Crozz</afterbreak>
</rule>
<rule break="no"><!-- no break after "1.", "2." or "3." plus whitespace when "Liga", "Bundesliga", "Fußball-Bundesliga" or "Fußballbundesliga" follows -->
<beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
<afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
</rule>
@@ -4976,11 +4988,11 @@
<beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- German abbreviations -->
<rule break="no"><!-- never break after one of the listed abbreviations followed by a period and a space or no-break space, whatever text follows -->
-<beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|[Ee]inschl|[Vv]mtl|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]</beforebreak>
+<beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|Evt|evt|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
<afterbreak></afterbreak>
@@ -5397,11 +5409,11 @@
<beforebreak>(?i)FRITZ!</beforebreak>
<afterbreak>(?i)Box</afterbreak>
</rule>
<rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 : keep "ID." together with a following "3", "4", "Buzz" or "Crozz"; the period is escaped so that only a literal "." matches -->
<beforebreak>ID\.</beforebreak>
-<afterbreak>3|Buzz|Crozz</afterbreak>
+<afterbreak>3|4|Buzz|Crozz</afterbreak>
</rule>
<rule break="no"><!-- Ph.D. (see rule PH_D): never break after a complete "Ph.D." followed by a space or no-break space; the first period and the inner space are optional and the "D" may be lower-case -->
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
@@ -5593,12 +5605,12 @@
<rule break="no">
<!-- no break only for дол. США: "дол." plus any amount of horizontal or vertical whitespace is kept together with a directly following "США" -->
<beforebreak>\bдол\.[\h\v]*</beforebreak>
<afterbreak>США</afterbreak>
</rule>
-<!-- п. 10 від 11.10.1933 -->
+<!-- no break after "п." or "д." and any following whitespace (e.g. "п. 10 від 11.10.1933", "д. Василь"), unless "т." with an optional whitespace character directly precedes it; the lookbehind's "<" is written as an entity to keep the XML well-formed -->
<rule break="no">
-<beforebreak>(?<!т\.[\h\v]?)\bп\.[\h\v]*</beforebreak>
+<beforebreak>(?&lt;!т\.[\h\v]?)\b[пд]\.[\h\v]*</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- усталені скорочення, що збігаються з нескороченими словами -->
<rule break="no">
<beforebreak>\b(див)\.[\h\v]</beforebreak>