lib/srx/segment.srx in srx-languagetool-0.1.0 vs lib/srx/segment.srx in srx-languagetool-0.2.0

- old
+ new

@@ -1366,11 +1366,11 @@ <rule break="yes"> <beforebreak>\bOK\.\s</beforebreak> <afterbreak>\p{Ll}+</afterbreak> </rule> <rule break="no"> -<beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak> +<beforebreak>[\.\s](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.\s</beforebreak><!-- not 'no'/'in', these could be abbreviations--> <afterbreak>[\p{N}\p{Ll}]</afterbreak> </rule> <rule break="no"> <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak> <afterbreak>[^\p{Lu}]</afterbreak> @@ -1703,10 +1703,18 @@ </rule> <rule break="no"> <beforebreak>\bmax\.\s</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> +<rule break="yes"> +<beforebreak>[?!.]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]\s</beforebreak> +<afterbreak>[A-Z][a-z]</afterbreak> +</rule> +<rule break="yes"> +<beforebreak>[?!.]\s</beforebreak> +<afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak> +</rule> </languagerule> <languagerule languagerulename="Slovak"> <rule break="no"> <beforebreak>\b(Bc|Mgr|RNDr|PharmDr|PhDr|JUDr|PaedDr|ThDr|Ing|MUDr|MDDr|MVDr|Dr|ThLic|PhD|ArtD|ThDr|Dr|DrSc|CSs|prof)\.\s</beforebreak> <afterbreak></afterbreak> @@ -4552,15 +4560,15 @@ <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak> <afterbreak></afterbreak> </rule> <!-- Abbreviations that can finish sentences --> <rule break="no"> -<beforebreak>\bs\.\s</beforebreak> +<beforebreak>\b(s|ca)\.\s</beforebreak> <afterbreak>[XIV]+\b</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(min|m)\.\s</beforebreak> +<beforebreak>\b(min|m|ca)\.\s</beforebreak> <afterbreak>[0-9]+\b</afterbreak> </rule> <rule break="no"> <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak> <afterbreak>[XIV\d]+\b</afterbreak> @@ -4656,15 +4664,15 @@ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak> <afterbreak>\p{Ll}</afterbreak> </rule> <!-- Abbreviations that can finish sentences --> <rule break="no"> -<beforebreak>\bs\.\s</beforebreak> +<beforebreak>\b(s|ca)\.\s</beforebreak> <afterbreak>[XIV]+\b</afterbreak> </rule> <rule break="no"> -<beforebreak>\b(min|m)\.\s</beforebreak> +<beforebreak>\b(min|m|ca)\.\s</beforebreak> <afterbreak>[0-9]+\b</afterbreak> </rule> <rule break="no"> <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak> <afterbreak>[XIV\d]+\b</afterbreak> @@ -4860,11 +4868,11 @@ <beforebreak>[\(\)\[\]]\s</beforebreak> <afterbreak></afterbreak> </rule> <!-- don't split at cases like "Friedrich II. wird auch..." --> <rule break="no"> -<beforebreak>[\s ][IVX]+\.\s</beforebreak> +<beforebreak>[\s ][IVX]+\.\s</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!-- don't split at cases like "im 13. oder 14. Jahrhundert" --> <rule break="no"> <beforebreak>\d+\.\s</beforebreak> @@ -6277,11 +6285,11 @@ <rule break="no"> <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> -<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak> +<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak> <afterbreak></afterbreak> </rule> <rule break="no"> <beforebreak>\(\p{Ll}+\.\s</beforebreak> <afterbreak></afterbreak> @@ -6357,10 +6365,10 @@ <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak> <afterbreak></afterbreak> </rule> <!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."--> <rule break="no"> -<beforebreak>[\s ][IVX]+\s</beforebreak> +<beforebreak>[\s ][IVX]+\s</beforebreak> <afterbreak>[^\p{Lu}]+</afterbreak> </rule> <!--Не раздвајај у случају као "од 13. до 14. века"--> <rule break="no"> <beforebreak>\d+\.\s</beforebreak>