lib/srx/segment.srx in srx-languagetool-0.13.0 vs lib/srx/segment.srx in srx-languagetool-0.14.0
- old
+ new
@@ -1105,11 +1105,19 @@
<beforebreak>: </beforebreak>
<afterbreak>[—\-–] \p{Lu}</afterbreak>
</rule>
</languagerule>
<languagerule languagerulename="English">
+<rule break="no"><!-- Keep the domain name "seven.one" / "Seven.One" in one segment (https://www.seven.one/); the initial letter may be upper or lower case -->
+<beforebreak>\b[Ss]even\.</beforebreak>
+<afterbreak>[Oo]ne\b</afterbreak>
+</rule>
<rule break="no">
+<beforebreak>\b[1-9]\.[\s\u00A0]</beforebreak>
+<afterbreak>[a-z]</afterbreak>
+</rule>
+<rule break="no">
<beforebreak>[\u00A0\s]</beforebreak>
<afterbreak>\n</afterbreak>
</rule>
<rule break="no">
<beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
@@ -1162,11 +1170,11 @@
<rule break="no">
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
<afterbreak>D\.?</afterbreak>
</rule>
<rule break="no">
-<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Ee]xcl|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
+<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vv](?:ol)?|[Rr]col|maj|Lt|[Ff]ig|[Ee]xt|[Ff]igs|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Ee]xcl|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak><!-- Abbreviations after which no break is made when the next character is not an uppercase letter, or is "I". "[Vv](?:ol)?" covers "v." and "vol."; "\d+-(?:oz|...)" covers hyphenated units such as "12-oz." -->
<afterbreak>[^\p{Lu}]|I</afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
<afterbreak>[^\p{Lu}]|I</afterbreak>
@@ -4751,18 +4759,31 @@
</rule>
<rule break="yes">
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
+<!-- unknown abbreviations inside parentheses, square brackets or curly braces -->
+<rule break="no">
+<beforebreak>\([^\)]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\)\r\n]*\)</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\[[^\]]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\]\r\n]*\]</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\{[^\}]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\}\r\n]*\}</afterbreak>
+</rule>
<!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
<rule break="no">
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Abbreviations that cannot finish sentences-->
<rule break="no">
-<beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
+<beforebreak>\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Abbreviations that can finish sentences -->
<rule break="no">
<beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
@@ -4813,16 +4834,16 @@
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- Ellipsis: ... lowercase -->
<rule break="no">
-<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
+<beforebreak>[^\s\u00A0](\.\.\.|…)[\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- (enum...) -->
<rule break="no">
-<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
+<beforebreak>\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- pero ¡ah! no estaba
<rule break="no">
<beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
@@ -4842,11 +4863,24 @@
<beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
<afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
</rule>
</languagerule>
<languagerule languagerulename="Spanish">
+<!-- unknown abbreviations inside parentheses, square brackets or curly braces -->
<rule break="no">
+<beforebreak>\([^\)]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\)\r\n]*\)</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\[[^\]]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\]\r\n]*\]</afterbreak>
+</rule>
+<rule break="no">
+<beforebreak>\{[^\}]*\.[\s\u00A0]</beforebreak>
+<afterbreak>[^\}\r\n]*\}</afterbreak>
+</rule>
+<rule break="no">
<beforebreak>¿[^?]+:[\s\u00A0]</beforebreak>
<afterbreak>.</afterbreak>
</rule>
<rule break="no">
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
@@ -4865,16 +4899,16 @@
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Ellipsis: ... lowercase -->
<rule break="no">
-<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
+<beforebreak>[^\s\u00A0](\.\.\.|…)[\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- (enum...) -->
<rule break="no">
-<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
+<beforebreak>\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- Abbreviations that can finish sentences -->
<rule break="no">
<beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
@@ -4980,10 +5014,14 @@
<languagerule languagerulename="German">
<rule break="no">
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
</rule>
+<rule break="no"><!-- Keep the domain name "seven.one" / "Seven.One" (optionally followed by "s") in one segment (https://www.seven.one/); the initial letter may be upper or lower case -->
+<beforebreak>\b[Ss]even\.</beforebreak>
+<afterbreak>[Oo]nes?\b</afterbreak>
+</rule>
<rule break="no">
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
</rule>
<!--support simple lists in markdown style-->
@@ -5089,11 +5127,11 @@
<beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- don't split at cases like "Friedrich II. wird auch..." -->
<rule break="no">
-<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak>
+<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak>[^\p{Lu}]+</afterbreak>
</rule>
<!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
<rule break="no">
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
@@ -5722,11 +5760,11 @@
<beforebreak>[\h\v.]([А-ЯІЇЄҐACEIHOPX]\.-)?(?<!°)[А-ЯІЇЄҐABCEIHOPX](?<!(Куан[\h]+Ю|(Петр|Олександр)([аоу]|ові|ом)?[\h]+[IІ]+))\.[\h\v]*</beforebreak>
<afterbreak>[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{3}</afterbreak>
</rule>
<!-- Ів. Франко (але Ів Бутільє) -->
<rule break="no">
-<beforebreak>(^|[\h\v])(Ів|Дж)\.[\h\v]+</beforebreak>
+<beforebreak>(^|[\h\v])(Ів|Дж|Ол)\.[\h\v]+</beforebreak>
<afterbreak>[А-ЯІЇЄҐA-Z]</afterbreak>
</rule>
<!-- Year: 2000 р.:
виробила у 2018 р. 8,1 млн декалітрів
від 26 квітня 2017 р. №35
@@ -5813,11 +5851,11 @@
<beforebreak>(\([^)]*|\[[^\]]*|,[\h\v]*)\b(див)\.[\h\v]*</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
<rule break="no">
-<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
+<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп|[сС]вт)\.[\h\v]*</beforebreak>
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
</rule>
<rule break="no">
<beforebreak>(?<![іи]\s+)\bдр\.[\h\v]*</beforebreak>
<afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak>
@@ -5831,12 +5869,12 @@
<beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
</rule>
<!-- TODO: арт. - артист -->
<rule break="no">
-<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
-<afterbreak>[\h\v]*[0-9]</afterbreak>
+<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис|[Сс]пр)\.[\h\v]*</beforebreak>
+<afterbreak>[\h\v]*(№[\h\v]*)?[0-9]</afterbreak>
</rule>
<!-- ХІІ р., 3-6 арт., 2-3 тт. -->
<rule break="no">
<beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak>
<afterbreak></afterbreak>
@@ -6357,11 +6395,11 @@
<beforebreak>['"“][\.!?…]['"”]\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Not break for ellipses (...) -->
<rule break="no">
-<beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
+<beforebreak>[^\s](\.\.\.|…)\s</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- z.B. "bla (...) blubb" -> without ending sentence -->
<rule break="no">
<beforebreak>[\(\)\[\]]\s</beforebreak>
@@ -6578,11 +6616,11 @@
<rule break="no">
<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak>
+<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\(\p{Ll}+\.\s</beforebreak>
<afterbreak></afterbreak>
@@ -6658,10 +6696,10 @@
<beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."-->
<rule break="no">
-<beforebreak>[\s ][IVX]+\s</beforebreak>
+<beforebreak>[\s ][IVX]+\s</beforebreak>
<afterbreak>[^\p{Lu}]+</afterbreak>
</rule>
<!--Не раздвајај у случају као "од 13. до 14. века"-->
<rule break="no">
<beforebreak>\d+\.\s</beforebreak>