lib/srx/segment.srx in srx-languagetool-0.13.0 vs lib/srx/segment.srx in srx-languagetool-0.14.0

- old
+ new

@@ -1105,11 +1105,19 @@ <beforebreak>: </beforebreak> <afterbreak>[—\-–] \p{Lu}</afterbreak> </rule> </languagerule> <languagerule languagerulename="English"> +<rule break="no"><!-- https://www.seven.one/ --> +<beforebreak>\b[Se]even\.</beforebreak> +<afterbreak>[Oo]ne\b</afterbreak> +</rule> <rule break="no"> +<beforebreak>\b[1-9]\.[\s\u00A0]</beforebreak> +<afterbreak>[a-z]</afterbreak> +</rule> +<rule break="no"> <beforebreak>[\u00A0\s]</beforebreak> <afterbreak>\n</afterbreak> </rule> <rule break="no"> <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak> @@ -1162,11 +1170,11 @@ <rule break="no"> <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak> <afterbreak>D\.?</afterbreak> </rule> <rule break="no"> -<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Ee]xcl|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak> +<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ee]xt|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Aa]cq|[Ii]ncl?|[Ee]xcl|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]|I</afterbreak> </rule> <rule break="no"> <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak> <afterbreak>[^\p{Lu}]|I</afterbreak> @@ -4751,18 +4759,31 @@ </rule> <rule break="yes"> <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> +<!-- unknown abbreviations inside parentheses --> +<rule break="no"> +<beforebreak>\([^\)]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\)\r\n]*\)</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\[[^\]]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\]\r\n]*\]</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\{[^\}]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\}\r\n]*\}</afterbreak> +</rule> <!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] --> <rule break="no"> <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <!-- Abbreviations that cannot finish sentences--> <rule break="no"> -<beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak> +<beforebreak>\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <!-- Abbreviations that can finish sentences --> <rule break="no"> <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak> @@ -4813,16 +4834,16 @@ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- Ellipsis: ... lowercase --> <rule break="no"> -<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak> +<beforebreak>[^\s\u00A0](\.\.\.|…)[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- (enum...) --> <rule break="no"> -<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak> +<beforebreak>\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- pero ¡ah! no estaba <rule break="no"> <beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak> @@ -4842,11 +4863,24 @@ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak> <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak> </rule> </languagerule> <languagerule languagerulename="Spanish"> +<!-- unknown abbreviations inside parentheses --> <rule break="no"> +<beforebreak>\([^\)]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\)\r\n]*\)</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\[[^\]]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\]\r\n]*\]</afterbreak> +</rule> +<rule break="no"> +<beforebreak>\{[^\}]*\.[\s\u00A0]</beforebreak> +<afterbreak>[^\}\r\n]*\}</afterbreak> +</rule> +<rule break="no"> <beforebreak>¿[^?]+:[\s\u00A0]</beforebreak> <afterbreak>.</afterbreak> </rule> <rule break="no"> <beforebreak>Yahoo![\s\u00A0]</beforebreak> @@ -4865,16 +4899,16 @@ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak> <afterbreak></afterbreak> </rule> <!-- Ellipsis: ... lowercase --> <rule break="no"> -<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak> +<beforebreak>[^\s\u00A0](\.\.\.|…)[\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- (enum...) --> <rule break="no"> -<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak> +<beforebreak>\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- Abbreviations that can finish sentences --> <rule break="no"> <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak> @@ -4980,10 +5014,14 @@ <languagerule languagerulename="German"> <rule break="no"> <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak> </rule> +<rule break="no"><!-- https://www.seven.one/ --> +<beforebreak>\b[Se]even\.</beforebreak> +<afterbreak>[Oo]nes?\b</afterbreak> +</rule> <rule break="no"> <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak> <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak> </rule> <!--support simple lists in markdown style--> @@ -5089,11 +5127,11 @@ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak> <afterbreak></afterbreak> </rule> <!-- don't split at cases like "Friedrich II. wird auch..." --> <rule break="no"> -<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak> +<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!-- don't split at cases like "im 13. oder 14. Jahrhundert" --> <rule break="no"> <beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak> @@ -5722,11 +5760,11 @@ <beforebreak>[\h\v.]([А-ЯІЇЄҐACEIHOPX]\.-)?(?&lt;!°)[А-ЯІЇЄҐABCEIHOPX](?&lt;!(Куан[\h]+Ю|(Петр|Олександр)([аоу]|ові|ом)?[\h]+[IІ]+))\.[\h\v]*</beforebreak> <afterbreak>[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{3}</afterbreak> </rule> <!-- Ів. Франко (але Ів Бутільє) --> <rule break="no"> -<beforebreak>(^|[\h\v])(Ів|Дж)\.[\h\v]+</beforebreak> +<beforebreak>(^|[\h\v])(Ів|Дж|Ол)\.[\h\v]+</beforebreak> <afterbreak>[А-ЯІЇЄҐA-Z]</afterbreak> </rule> <!-- Year: 2000 р.: виробила у 2018 р. 8,1 млн декалітрів від 26 квітня 2017 р. №35 @@ -5813,11 +5851,11 @@ <beforebreak>(\([^)]*|\[[^\]]*|,[\h\v]*)\b(див)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> </rule> <!-- abbreviation with proper noun: проф. Грицько, о. Лісове --> <rule break="no"> -<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak> +<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп|[сС]вт)\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak> </rule> <rule break="no"> <beforebreak>(?&lt;![іи]\s+)\bдр\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak> @@ -5831,12 +5869,12 @@ <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak> <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak> </rule> <!-- TODO: арт. - артист --> <rule break="no"> -<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak> -<afterbreak>[\h\v]*[0-9]</afterbreak> +<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис|[Сс]пр)\.[\h\v]*</beforebreak> +<afterbreak>[\h\v]*(№[\h\v]*)?[0-9]</afterbreak> </rule> <!-- ХІІ р., 3-6 арт., 2-3 тт. --> <rule break="no"> <beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak> <afterbreak></afterbreak> @@ -6357,11 +6395,11 @@ <beforebreak>['"“][\.!?…]['"”]\s</beforebreak> <afterbreak></afterbreak> </rule> <!-- Not break for ellipses (...) --> <rule break="no"> -<beforebreak>[^\s](\Q...\E|…)\s</beforebreak> +<beforebreak>[^\s](\.\.\.|…)\s</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- z.B. "bla (...) blubb" -> without ending sentence --> <rule break="no"> <beforebreak>[\(\)\[\]]\s</beforebreak> @@ -6578,11 +6616,11 @@ <rule break="no"> <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak> +<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\(\p{Ll}+\.\s</beforebreak> <afterbreak></afterbreak> @@ -6658,10 +6696,10 @@ <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak> <afterbreak></afterbreak> </rule> <!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."--> <rule break="no"> -<beforebreak>[\s ][IVX]+\s</beforebreak> +<beforebreak>[\s ][IVX]+\s</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!--Не раздвајај у случају као "од 13. до 14. века"--> <rule break="no"> <beforebreak>\d+\.\s</beforebreak>