require 'spec_helper' RSpec.describe PragmaticSegmenter::Segmenter do context "Golden Rules (English)" do it "Simple period to end sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas.", language: "en") expect(ps.segment).to eq(["Hello World.", "My name is Jonas."]) end it "Question mark to end sentence #002" do ps = PragmaticSegmenter::Segmenter.new(text: "What is your name? My name is Jonas.", language: "en") expect(ps.segment).to eq(["What is your name?", "My name is Jonas."]) end it "Exclamation point to end sentence #003" do ps = PragmaticSegmenter::Segmenter.new(text: "There it is! I found it.", language: "en") expect(ps.segment).to eq(["There it is!", "I found it."]) end it "One letter upper case abbreviations #004" do ps = PragmaticSegmenter::Segmenter.new(text: "My name is Jonas E. Smith.", language: "en") expect(ps.segment).to eq(["My name is Jonas E. Smith."]) end it "One letter lower case abbreviations #005" do ps = PragmaticSegmenter::Segmenter.new(text: "Please turn to p. 55.", language: "en") expect(ps.segment).to eq(["Please turn to p. 55."]) end it "Two letter lower case abbreviations in the middle of a sentence #006" do ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: "en") expect(ps.segment).to eq(["Were Jane and co. at the party?"]) end it "Two letter upper case abbreviations in the middle of a sentence #007" do ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. at noon.", language: "en") expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co. at noon."]) end it "Two letter lower case abbreviations at the end of a sentence #008" do ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. 
They should know.", language: "en") expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."]) end it "Two letter upper case abbreviations at the end of a sentence #009" do ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", language: "en") expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."]) end it "Two letter (prepositive) abbreviations #010" do ps = PragmaticSegmenter::Segmenter.new(text: "I can see Mt. Fuji from here.", language: "en") expect(ps.segment).to eq(["I can see Mt. Fuji from here."]) end it "Two letter (prepositive & postpositive) abbreviations #011" do ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: "en") expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."]) end it "Possesive two letter abbreviations #012" do ps = PragmaticSegmenter::Segmenter.new(text: "That is JFK Jr.'s book.", language: "en") expect(ps.segment).to eq(["That is JFK Jr.'s book."]) end it "Multi-period abbreviations in the middle of a sentence #013" do ps = PragmaticSegmenter::Segmenter.new(text: "I visited the U.S.A. last year.", language: "en") expect(ps.segment).to eq(["I visited the U.S.A. last year."]) end it "Multi-period abbreviations at the end of a sentence #014" do ps = PragmaticSegmenter::Segmenter.new(text: "I live in the E.U. How about you?", language: "en") expect(ps.segment).to eq(["I live in the E.U.", "How about you?"]) end it "U.S. as sentence boundary #015" do ps = PragmaticSegmenter::Segmenter.new(text: "I live in the U.S. How about you?", language: "en") expect(ps.segment).to eq(["I live in the U.S.", "How about you?"]) end it "U.S. as non sentence boundary with next word capitalized #016" do ps = PragmaticSegmenter::Segmenter.new(text: "I work for the U.S. Government in Virginia.", language: "en") expect(ps.segment).to eq(["I work for the U.S. 
Government in Virginia."]) end it "U.S. as non sentence boundary #017" do ps = PragmaticSegmenter::Segmenter.new(text: "I have lived in the U.S. for 20 years.", language: "en") expect(ps.segment).to eq(["I have lived in the U.S. for 20 years."]) end xdescribe "not yet implemented" do it "A.M. / P.M. as non sentence boundary and sentence boundary #018" do ps = PragmaticSegmenter::Segmenter.new(text: "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", language: "en") expect(ps.segment).to eq(["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."]) end end it "Number as non sentence boundary #019" do ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00 in her bag.", language: "en") expect(ps.segment).to eq(["She has $100.00 in her bag."]) end it "Number as sentence boundary #020" do ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00. It is in her bag.", language: "en") expect(ps.segment).to eq(["She has $100.00.", "It is in her bag."]) end it "Parenthetical inside sentence #021" do ps = PragmaticSegmenter::Segmenter.new(text: "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", language: "en") expect(ps.segment).to eq(["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]) end it "Email addresses #022" do ps = PragmaticSegmenter::Segmenter.new(text: "Her email is Jane.Doe@example.com. I sent her an email.", language: "en") expect(ps.segment).to eq(["Her email is Jane.Doe@example.com.", "I sent her an email."]) end it "Web addresses #023" do ps = PragmaticSegmenter::Segmenter.new(text: "The site is: https://www.example.50.com/new-site/awesome_content.html. 
Please check it out.", language: "en") expect(ps.segment).to eq(["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]) end it "Single quotations inside sentence #024" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, 'This is great.' she said.", language: "en") expect(ps.segment).to eq(["She turned to him, 'This is great.' she said."]) end it "Double quotations inside sentence #025" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" she said.", language: "en") expect(ps.segment).to eq(["She turned to him, \"This is great.\" she said."]) end it "Double quotations at the end of a sentence #026" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" She held the book out to show him.", language: "en") expect(ps.segment).to eq(["She turned to him, \"This is great.\"", "She held the book out to show him."]) end it "Double punctuation (exclamation point) #027" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello!! Long time no see.", language: "en") expect(ps.segment).to eq(["Hello!!", "Long time no see."]) end it "Double punctuation (question mark) #028" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello?? Who is there?", language: "en") expect(ps.segment).to eq(["Hello??", "Who is there?"]) end it "Double punctuation (exclamation point / question mark) #029" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello!? Is that you?", language: "en") expect(ps.segment).to eq(["Hello!?", "Is that you?"]) end it "Double punctuation (question mark / exclamation point) #030" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello?! Is that you?", language: "en") expect(ps.segment).to eq(["Hello?!", "Is that you?"]) end it "List (period followed by parens and no period to end item) #031" do ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item 2.) The second item", language: "en") expect(ps.segment).to eq(["1.) The first item", "2.) 
The second item"]) end it "List (period followed by parens and period to end item) #032" do ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item. 2.) The second item.", language: "en") expect(ps.segment).to eq(["1.) The first item.", "2.) The second item."]) end it "List (parens and no period to end item) #033" do ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item 2) The second item", language: "en") expect(ps.segment).to eq(["1) The first item", "2) The second item"]) end it "List (parens and period to end item) #034" do ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item. 2) The second item.", language: "en") expect(ps.segment).to eq(["1) The first item.", "2) The second item."]) end it "List (period to mark list and no period to end item) #035" do ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item 2. The second item", language: "en") expect(ps.segment).to eq(["1. The first item", "2. The second item"]) end it "List (period to mark list and period to end item) #036" do ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item. 2. The second item.", language: "en") expect(ps.segment).to eq(["1. The first item.", "2. The second item."]) end it "List with bullet #037" do ps = PragmaticSegmenter::Segmenter.new(text: "• 9. The first item • 10. The second item", language: "en") expect(ps.segment).to eq(["• 9. The first item", "• 10. The second item"]) end it "List with hypthen #038" do ps = PragmaticSegmenter::Segmenter.new(text: "⁃9. The first item ⁃10. The second item", language: "en") expect(ps.segment).to eq(["⁃9. The first item", "⁃10. The second item"]) end it "Alphabetical list #039" do ps = PragmaticSegmenter::Segmenter.new(text: "a. The first item b. The second item c. The third list item", language: "en") expect(ps.segment).to eq(["a. The first item", "b. The second item", "c. 
The third list item"]) end it "Errant newlines in the middle of sentences (PDF) #040" do ps = PragmaticSegmenter::Segmenter.new(text: "This is a sentence\ncut off in the middle because pdf.", language: "en", doc_type: "pdf") expect(ps.segment).to eq(["This is a sentence cut off in the middle because pdf."]) end it "Errant newlines in the middle of sentences #041" do ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en") expect(ps.segment).to eq(["It was a cold night in the city."]) end it "Lower case list separated by newline #042" do ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: "en") expect(ps.segment).to eq(["features", "contact manager", "events, activities"]) end it "Geo Coordinates #043" do ps = PragmaticSegmenter::Segmenter.new(text: "You can find it at N°. 1026.253.553. That is where the treasure is.", language: "en") expect(ps.segment).to eq(["You can find it at N°. 1026.253.553.", "That is where the treasure is."]) end it "Named entities with an exclamation point #044" do ps = PragmaticSegmenter::Segmenter.new(text: "She works at Yahoo! in the accounting department.", language: "en") expect(ps.segment).to eq(["She works at Yahoo! in the accounting department."]) end it "I as a sentence boundary and I as an abbreviation #045" do ps = PragmaticSegmenter::Segmenter.new(text: "We make a good team, you and I. Did you see Albert I. Jones yesterday?", language: "en") expect(ps.segment).to eq(["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"]) end it "Ellipsis at end of quotation #046" do ps = PragmaticSegmenter::Segmenter.new(text: "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", language: "en") expect(ps.segment).to eq(["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . 
.”"]) end it "Ellipsis with square brackets #047" do ps = PragmaticSegmenter::Segmenter.new(text: "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", language: "en") expect(ps.segment).to eq(["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]) end it "Ellipsis as sentence boundary (standard ellipsis rules) #048" do ps = PragmaticSegmenter::Segmenter.new(text: "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", language: "en") expect(ps.segment).to eq(["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]) end it "Ellipsis as sentence boundary (non-standard ellipsis rules) #049" do ps = PragmaticSegmenter::Segmenter.new(text: "I never meant that.... She left the store.", language: "en") expect(ps.segment).to eq(["I never meant that....", "She left the store."]) end it "Ellipsis as non sentence boundary #050" do ps = PragmaticSegmenter::Segmenter.new(text: "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", language: "en") expect(ps.segment).to eq(["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]) end it "4-dot ellipsis #051" do ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "en") expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . 
."]) end it "No whitespace in between sentences #052" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", language: "en") expect(ps.segment).to eq(["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]) end end context "Golden Rules (languages other than English)" do context "Golden Rules (German)" do it "Quotation at end of sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“ Wir haben 1.000.000 Euro.", language: "de") expect(ps.segment).to eq(["„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“", "Wir haben 1.000.000 Euro."]) end it "Abbreviations #002" do ps = PragmaticSegmenter::Segmenter.new(text: "Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist.", language: "de") expect(ps.segment).to eq(["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist."]) end it "Numbers #003" do ps = PragmaticSegmenter::Segmenter.new(text: "Was sind die Konsequenzen der Abstimmung vom 12. Juni?", language: "de") expect(ps.segment).to eq(["Was sind die Konsequenzen der Abstimmung vom 12. 
Juni?"]) end end context "Golden Rules (Japanese)" do it "Simple period to end sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "これはペンです。それはマーカーです。", language: "ja") expect(ps.segment).to eq(["これはペンです。", "それはマーカーです。"]) end it "Question mark to end sentence #002" do ps = PragmaticSegmenter::Segmenter.new(text: "それは何ですか?ペンですか?", language: "ja") expect(ps.segment).to eq(["それは何ですか?", "ペンですか?"]) end it "Exclamation point to end sentence #003" do ps = PragmaticSegmenter::Segmenter.new(text: "良かったね!すごい!", language: "ja") expect(ps.segment).to eq(["良かったね!", "すごい!"]) end it "Quotation #004" do ps = PragmaticSegmenter::Segmenter.new(text: "自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", language: "ja") expect(ps.segment).to eq(["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"]) end it "Errant newlines in the middle of sentences #005" do ps = PragmaticSegmenter::Segmenter.new(text: "これは父の\n家です。", language: "ja") expect(ps.segment).to eq(["これは父の家です。"]) end end context "Golden Rules (Arabic)" do it "Regular punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. 
يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: "ar") expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."]) end it "Abbreviations #002" do ps = PragmaticSegmenter::Segmenter.new(text: "وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: "ar") expect(ps.segment).to eq(["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."]) end it "Numbers and Dates #003" do ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: "ar") expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."]) end it "Time #004" do ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. 
العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: "ar") expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."]) end it "Comma #005" do ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: "ar") expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"]) end end context "Golden Rules (Italian)" do it "Abbreviations #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! 
Come sta oggi?", language: "it") expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"]) end it "Quotations #002" do ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", language: "it") expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."]) end it "Numbers #003" do ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: "it") expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"]) end end context "Golden Rules (Russian)" do it "Abbreviations #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: "ru") expect(ps.segment).to eq(["Объем составляет 5 куб.м."]) end it "Quotations #002" do ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: "ru") expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."]) end it "Numbers #003" do ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: "ru") expect(ps.segment).to eq(["Сегодня 27.10.14"]) end end context "Golden Rules (Spanish)" do it "Question mark to end sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "¿Cómo está hoy? Espero que muy bien.", language: "es") expect(ps.segment).to eq(["¿Cómo está hoy?", "Espero que muy bien."]) end it "Exclamation point to end sentence #002" do ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola señorita! Espero que muy bien.", language: "es") expect(ps.segment).to eq(["¡Hola señorita!", "Espero que muy bien."]) end it "Abbreviations #003" do ps = PragmaticSegmenter::Segmenter.new(text: "Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: "es") expect(ps.segment).to eq(["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. 
Naser."]) end it "Numbers #004" do ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: "es") expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."]) end it "Quotations #005" do ps = PragmaticSegmenter::Segmenter.new(text: "«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", language: "es") expect(ps.segment).to eq(["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."]) end end context "Golden Rules (Greek)" do it "Question mark to end sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: "el") expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."]) end end context "Golden Rules (Hindi)" do it "Full stop #001" do ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: "hi") expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"]) end end context "Golden Rules (Armenian)" do it "Sentence ending punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: "hy") expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"]) end it "Ellipsis #002" do ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: "hy") expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]) end it "Period is not a sentence boundary #003" do ps = PragmaticSegmenter::Segmenter.new(text: 
"Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: "hy") expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"]) end end context "Golden Rules (Burmese)" do it "Sentence ending punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my') expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"]) end end context "Golden Rules (Amharic)" do it "Sentence ending punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am') expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"]) end end context "Golden Rules (Persian)" do it "Sentence ending punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa') expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."]) end end context "Golden Rules (Urdu)" do it "Sentence ending punctuation #001" do ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur') expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]) end end end context 'Language: English (en)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'\nSo she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up 
and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.", language: 'en') expect(ps.segment).to eq(["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'", "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."]) end it 'correctly segments text #002' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' 
(Which was very likely true.)", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #003' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! 
How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #004' do ps = PragmaticSegmenter::Segmenter.new(text: "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en') expect(ps.segment).to eq(["'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #005' do ps = PragmaticSegmenter::Segmenter.new(text: "Down, down, down. Would the fall NEVER come to an end! 
'I wonder how many miles I've fallen by this time?' she said aloud.", language: 'en') expect(ps.segment).to eq(["Down, down, down.", "Would the fall NEVER come to an end!", "'I wonder how many miles I've fallen by this time?' she said aloud."]) end it 'correctly segments text #006' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. 'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' 
(Which was very likely true.)", language: 'en') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #007' do ps = PragmaticSegmenter::Segmenter.new(text: 'A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. 
may also refer to an indefinite amount of time substantially longer than the standardized length.', language: 'en') expect(ps.segment).to eq(["A minute is a unit of measurement of time or of angle.", "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", "The minute is not an SI unit; however, it is accepted for use with SI units.", "The symbol for minute or minutes is min.", "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."]) end it 'correctly segments text #008' do text = <<-EOF About Me...............................................................................................5 Chapter 2 ...................................................................... 6 Three Weeks Later............................................................................ 7 Better Eating........................................................................................ 8 What's the Score?.............................................................. 9 How To Calculate the Score................... 16-17 EOF ps = PragmaticSegmenter::Segmenter.new(text: text, language: 'en') expect(ps.segment).to eq(["About Me", "Chapter 2", "Three Weeks Later", "Better Eating", "What's the Score?", "How To Calculate the Score"]) end it 'correctly segments text #009' do ps = PragmaticSegmenter::Segmenter.new(text: 'I think Jun. is a great month, said Mr. Suzuki.', language: 'en') expect(ps.segment).to eq(["I think Jun. is a great month, said Mr. Suzuki."]) end it 'correctly segments text #010' do ps = PragmaticSegmenter::Segmenter.new(text: 'Jun. is a great month, said Mr. Suzuki.', language: 'en') expect(ps.segment).to eq(["Jun. is a great month, said Mr. 
Suzuki."]) end it 'correctly segments text #011' do ps = PragmaticSegmenter::Segmenter.new(text: "I have 1.000.00. Yay $.50 and .50! That's 600.", language: 'en') expect(ps.segment).to eq(["I have 1.000.00.", "Yay $.50 and .50!", "That's 600."]) end it 'correctly segments text #012' do ps = PragmaticSegmenter::Segmenter.new(text: '1.) This is a list item with a parens.', language: 'en') expect(ps.segment).to eq(["1.) This is a list item with a parens."]) end it 'correctly segments text #013' do ps = PragmaticSegmenter::Segmenter.new(text: '1. This is a list item.', language: 'en') expect(ps.segment).to eq(['1. This is a list item.']) end it 'correctly segments text #014' do ps = PragmaticSegmenter::Segmenter.new(text: 'I live in the U.S.A. I went to J.C. Penney.', language: 'en') expect(ps.segment).to eq(["I live in the U.S.A.", "I went to J.C. Penney."]) end it 'correctly segments text #015' do ps = PragmaticSegmenter::Segmenter.new(text: 'His name is Alfred E. Sloan.', language: 'en') expect(ps.segment).to eq(['His name is Alfred E. Sloan.']) end it 'correctly segments text #016' do ps = PragmaticSegmenter::Segmenter.new(text: 'Q. What is his name? A. His name is Alfred E. Sloan.', language: 'en') expect(ps.segment).to eq(['Q. What is his name?', 'A. His name is Alfred E. Sloan.']) end it 'correctly segments text #017' do ps = PragmaticSegmenter::Segmenter.new(text: 'Today is 11.18.2014.', language: 'en') expect(ps.segment).to eq(['Today is 11.18.2014.']) end it 'correctly segments text #018' do ps = PragmaticSegmenter::Segmenter.new(text: 'I need you to find 3 items, e.g. a hat, a coat, and a bag.', language: 'en') expect(ps.segment).to eq(['I need you to find 3 items, e.g. a hat, a coat, and a bag.']) end it 'correctly segments text #019' do ps = PragmaticSegmenter::Segmenter.new(text: "The game is the Giants vs. the Tigers at 10 p.m. I'm going are you?", language: 'en') expect(ps.segment).to eq(["The game is the Giants vs. 
the Tigers at 10 p.m.", "I'm going are you?"]) end it 'correctly segments text #020' do ps = PragmaticSegmenter::Segmenter.new(text: 'He is no. 5, the shortstop.', language: 'en') expect(ps.segment).to eq(['He is no. 5, the shortstop.']) end it 'correctly segments text #021' do ps = PragmaticSegmenter::Segmenter.new(text: "Remove long strings of dots........please.", language: 'en') expect(ps.segment).to eq(["Remove long strings of dots please."]) end it 'correctly segments text #022' do ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n.\n\n\nPricing Additionl Info\n", language: 'en') expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"]) end it 'correctly segments text #023' do ps = PragmaticSegmenter::Segmenter.new(text: "As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid.", language: 'en') expect(ps.segment).to eq(["As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid."]) end it 'correctly segments text #024' do ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: 'en') expect(ps.segment).to eq(['features', 'contact manager', 'events, activities']) end it 'correctly segments text #025' do ps = PragmaticSegmenter::Segmenter.new(text: "Git rid of unnecessary white space.", language: 'en') expect(ps.segment).to eq(["Git rid of unnecessary white space."]) end it 'correctly segments text #026' do ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n. 
Pricing Additionl Info", language: 'en') expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"]) end it 'correctly segments text #027' do ps = PragmaticSegmenter::Segmenter.new(text: "Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."]) end it 'correctly segments text #028' do ps = PragmaticSegmenter::Segmenter.new(text: "10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"]) end it 'correctly segments text #029' do ps = PragmaticSegmenter::Segmenter.new(text: "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"]) end it 'correctly segments text #030' do ps = PragmaticSegmenter::Segmenter.new(text: "I have 600. 
How many do you have?", language: 'en') expect(ps.segment).to eq(["I have 600.", "How many do you have?"]) end it 'correctly segments text #031' do ps = PragmaticSegmenter::Segmenter.new(text: "\n3\n\nIntroduction\n\n", language: 'en') expect(ps.segment).to eq(["Introduction"]) end it 'correctly segments text #032' do ps = PragmaticSegmenter::Segmenter.new(text: "\nW\nA\nRN\nI\nNG\n", language: 'en') expect(ps.segment).to eq(["WARNING"]) end it 'correctly segments text #033' do ps = PragmaticSegmenter::Segmenter.new(text: "\n\n\nW\nA\nRN\nI\nNG\n \n/\n \nA\nV\nE\nR\nT\nI\nS\nE\nM\nE\nNT\n", language: 'en') expect(ps.segment).to eq(["WARNING", "AVERTISEMENT"]) end it 'correctly segments text #034' do ps = PragmaticSegmenter::Segmenter.new(text: '"Help yourself, sweetie," shouted Candy and gave her the cookie.', language: 'en') expect(ps.segment).to eq(["\"Help yourself, sweetie,\" shouted Candy and gave her the cookie."]) end it 'correctly segments text #035' do ps = PragmaticSegmenter::Segmenter.new(text: "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en') expect(ps.segment).to eq(["Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."]) end it 'correctly segments text #036' do ps = PragmaticSegmenter::Segmenter.new(text: "This is a test. 
Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en') expect(ps.segment).to eq(["This is a test.", "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."]) end it 'correctly segments text #037' do ps = PragmaticSegmenter::Segmenter.new(text: "This was because it was an offensive weapon, designed to fight at a distance up to 400 yd \n( 365.8 m ).", language: 'en') expect(ps.segment).to eq(["This was because it was an offensive weapon, designed to fight at a distance up to 400 yd ( 365.8 m )."]) end it 'correctly segments text #038' do ps = PragmaticSegmenter::Segmenter.new(text: "“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?” Others yet say: \"Should we be scared about these 'protests'?\"", language: 'en') expect(ps.segment).to eq(["“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?”", "Others yet say: \"Should we be scared about these 'protests'?\""]) end it 'correctly segments text #039' do ps = PragmaticSegmenter::Segmenter.new(text: "www.testurl.Awesome.com", language: 'en') expect(ps.segment).to eq(["www.testurl.Awesome.com"]) end it 'correctly segments text #040' do ps = PragmaticSegmenter::Segmenter.new(text: "http://testurl.Awesome.com", language: 'en') expect(ps.segment).to eq(["http://testurl.Awesome.com"]) end it 'correctly segments text #041' do ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church in is a church.", language: 'en') expect(ps.segment).to eq(["St. 
Michael's Church in is a church."]) end it 'correctly segments text #042' do ps = PragmaticSegmenter::Segmenter.new(text: "JFK Jr.'s book is on sale.", language: 'en') expect(ps.segment).to eq(["JFK Jr.'s book is on sale."]) end it 'correctly segments text #043' do ps = PragmaticSegmenter::Segmenter.new(text: "This is e.g. Mr. Smith, who talks slowly... And this is another sentence.", language: 'en') expect(ps.segment).to eq(["This is e.g. Mr. Smith, who talks slowly...", "And this is another sentence."]) end it 'correctly segments text #044' do ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en') expect(ps.segment).to eq(["Leave me alone!, he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."]) end it 'correctly segments text #045' do ps = PragmaticSegmenter::Segmenter.new(text: "This is the U.S. Senate my friends. Yes. It is!", language: 'en') expect(ps.segment).to eq(["This is the U.S. Senate my friends.", "Yes.", "It is!"]) end it 'correctly segments text #046' do ps = PragmaticSegmenter::Segmenter.new(text: "Send it to P.O. box 6554", language: 'en') expect(ps.segment).to eq(["Send it to P.O. box 6554"]) end it 'correctly segments text #047' do ps = PragmaticSegmenter::Segmenter.new(text: "There were 500 cases in the U.S. The U.S. Commission asked the U.S. Government to give their opinion on the issue.", language: 'en') expect(ps.segment).to eq(["There were 500 cases in the U.S.", "The U.S. Commission asked the U.S. Government to give their opinion on the issue."]) end it 'correctly segments text #048' do ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en') expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. 
sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"]) end it 'correctly segments text #049' do ps = PragmaticSegmenter::Segmenter.new(text: "Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990. `So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna. Later, he recalls the words of his Marxist mentor: `The people! Theft! The holy fire!'", language: 'en') expect(ps.segment).to eq(["Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.", "'So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna.", "Later, he recalls the words of his Marxist mentor: 'The people! Theft! The holy fire!'"]) end it 'correctly segments text #050' do ps = PragmaticSegmenter::Segmenter.new(text: "He climbed Mt. Fuji.", language: 'en') expect(ps.segment).to eq(["He climbed Mt. Fuji."]) end it 'correctly segments text #051' do ps = PragmaticSegmenter::Segmenter.new(text: "He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun.", language: 'en') expect(ps.segment).to eq(["He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun."]) end it 'correctly segments text #052' do ps = PragmaticSegmenter::Segmenter.new(text: "Test strange period.Does it segment correctly.", language: 'en') expect(ps.segment).to eq(["Test strange period.", "Does it segment correctly."]) end it 'correctly segments text #053' do ps = PragmaticSegmenter::Segmenter.new(text: "

Hello

\n

This is a test. Another test.

\n

\n\n

", language: 'en') expect(ps.segment).to eq(["Hello", "This is a test.", "Another test."]) end it 'correctly segments text #054' do ps = PragmaticSegmenter::Segmenter.new(text: "This sentence ends with the psuedo-number x10. This one with the psuedo-number %3.00. One last sentence.", language: 'en') expect(ps.segment).to eq(["This sentence ends with the psuedo-number x10.", "This one with the psuedo-number %3.00.", "One last sentence."]) end it 'correctly segments text #055' do ps = PragmaticSegmenter::Segmenter.new(text: "Testing mixed numbers Jahr10. And another 0.3 %11. That's weird.", language: 'en') expect(ps.segment).to eq(["Testing mixed numbers Jahr10.", "And another 0.3 %11.", "That's weird."]) end it 'correctly segments text #056' do ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: 'en') expect(ps.segment).to eq(["Were Jane and co. at the party?"]) end it 'correctly segments text #057' do ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: 'en') expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."]) end it 'correctly segments text #058' do ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. They should know.", language: 'en') expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."]) end it 'correctly segments text #059' do ps = PragmaticSegmenter::Segmenter.new(text: "He works at Yahoo! and Y!J.", language: 'en') expect(ps.segment).to eq(["He works at Yahoo! and Y!J."]) end it 'correctly segments text #060' do ps = PragmaticSegmenter::Segmenter.new(text: 'The Scavenger Hunt ends on Dec. 31st, 2011.', language: 'en') expect(ps.segment).to eq(['The Scavenger Hunt ends on Dec. 
31st, 2011.']) end it 'correctly segments text #061' do ps = PragmaticSegmenter::Segmenter.new(text: "Putter King Scavenger Hunt Trophy\n(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)\nThe Putter King team will judge the scavenger hunt and all decisions will be final. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on Dec. 31st, 2011.", language: 'en') expect(ps.segment).to eq(["Putter King Scavenger Hunt Trophy", "(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)", "The Putter King team will judge the scavenger hunt and all decisions will be final.", "The scavenger hunt is open to anyone and everyone.", "The scavenger hunt ends on Dec. 31st, 2011."]) end it 'correctly segments text #062' do ps = PragmaticSegmenter::Segmenter.new(text: "Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10. Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment.", language: 'en') expect(ps.segment).to eq(["Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10.", "Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment."]) end it 'correctly segments text #063' do ps = PragmaticSegmenter::Segmenter.new(text: "Header 1.2; Attachment Z\n\n\td. Compliance Log – Volume 12 \n\tAttachment A\n\n\te. Additional Logistics Data\n\tSection 10", language: 'en') expect(ps.segment).to eq(["Header 1.2; Attachment Z", "d. Compliance Log – Volume 12", "Attachment A", "e. Additional Logistics Data", "Section 10"]) end it 'correctly segments text #064' do ps = PragmaticSegmenter::Segmenter.new(text: "a.) The first item b.) The second item c.) The third list item", language: 'en') expect(ps.segment).to eq(["a.) The first item", "b.) The second item", "c.) 
The third list item"]) end it 'correctly segments text #065' do ps = PragmaticSegmenter::Segmenter.new(text: "a) The first item b) The second item c) The third list item", language: 'en') expect(ps.segment).to eq(["a) The first item", "b) The second item", "c) The third list item"]) end it 'correctly segments text #066' do ps = PragmaticSegmenter::Segmenter.new(text: "Hello Wolrd. Here is a secret code AS750-10. Another sentence. Finally, this. 1. The first item 2. The second item 3. The third list item 4. Hello 5. Hello 6. Hello 7. Hello 8. Hello 9. Hello 10. Hello 11. Hello", language: 'en') expect(ps.segment).to eq(["Hello Wolrd.", "Here is a secret code AS750-10.", "Another sentence.", "Finally, this.", "1. The first item", "2. The second item", "3. The third list item", "4. Hello", "5. Hello", "6. Hello", "7. Hello", "8. Hello", "9. Hello", "10. Hello", "11. Hello"]) end it 'correctly segments text #067' do ps = PragmaticSegmenter::Segmenter.new(text: "He works for ABC Ltd. and sometimes for BCD Ltd. She works for ABC Co. and BCD Co. They work for ABC Corp. and BCD Corp.", language: 'en') expect(ps.segment).to eq(["He works for ABC Ltd. and sometimes for BCD Ltd.", "She works for ABC Co. and BCD Co.", "They work for ABC Corp. and BCD Corp."]) end it 'correctly segments text #068' do ps = PragmaticSegmenter::Segmenter.new(text: "<b>J1.txt</b>", language: 'en') expect(ps.segment).to eq(["J1.txt"]) end it 'correctly segments text #069' do ps = PragmaticSegmenter::Segmenter.new(text: "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.", language: 'en') expect(ps.segment).to eq(["On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.", "Millions attended the Inauguration."]) end it 'correctly segments text #070' do ps = PragmaticSegmenter::Segmenter.new(text: "The U.K. Panel on enivronmental issues said it was true. Finally he left the U.K. 
He went to a new location.", language: 'en') expect(ps.segment).to eq(["The U.K. Panel on enivronmental issues said it was true.", "Finally he left the U.K.", "He went to a new location."]) end it 'correctly segments text #071' do ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 P.M. Travelers who didn't get the warning at 5 P.M. left later.", language: 'en') expect(ps.segment).to eq(["He left at 6 P.M.", "Travelers who didn't get the warning at 5 P.M. left later."]) end it 'correctly segments text #072' do ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 a.m. Travelers who didn't get the warning at 5 a.m. left later.", language: 'en') expect(ps.segment).to eq(["He left at 6 a.m.", "Travelers who didn't get the warning at 5 a.m. left later."]) end it 'correctly segments text #073' do ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 A.M. Travelers who didn't get the warning at 5 A.M. left later.", language: 'en') expect(ps.segment).to eq(["He left at 6 A.M.", "Travelers who didn't get the warning at 5 A.M. left later."]) end it 'correctly segments text #074' do ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. 
The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item \rIt was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”. \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: nil) expect(ps.segment).to eq(["Hello World.", "My name is Jonas.", "What is your name?", "My name is Jonas.", "There it is!", "I found it.", "My name is Jonas E. Smith.", "Please turn to p. 55.", "Were Jane and co. 
at the party?", "They closed the deal with Pitt, Briggs & Co. at noon.", "Let's ask Jane and co.", "They should know.", "They closed the deal with Pitt, Briggs & Co.", "It closed yesterday.", "I can see Mt. Fuji from here.", "St. Michael's Church is on 5th st. near the light.", "That is JFK Jr.'s book.", "I visited the U.S.A. last year.", "I live in the E.U.", "How about you?", "I live in the U.S.", "How about you?", "I work for the U.S. Government in Virginia.", "I have lived in the U.S. for 20 years.", "She has $100.00 in her bag.", "She has $100.00.", "It is in her bag.", "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", "Her email is Jane.Doe@example.com.", "I sent her an email.", "The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out.", "She turned to him, 'This is great.' she said.", "She turned to him, \"This is great.\" she said.", "She turned to him, \"This is great.\"", "She held the book out to show him.", "Hello!!", "Long time no see.", "Hello??", "Who is there?", "Hello!?", "Is that you?", "Hello?!", "Is that you?", "1.) The first item", "2.) The second item", "1.) The first item.", "2.) The second item.", "1) The first item", "2) The second item", "1) The first item.", "2) The second item.", "1. The first item", "2. The second item", "1. The first item.", "2. The second item.", "• 9. The first item", "• 10. The second item", "⁃9. The first item", "⁃10. The second item", "a. The first item", "b. The second item", "c. The third list item", "It was a cold night in the city.", "features", "contact manager", "events, activities", "You can find it at N°. 1026.253.553.", "That is where the treasure is.", "She works at Yahoo! in the accounting department.", "We make a good team, you and I.", "Did you see Albert I. Jones yesterday?", "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”.", "\"Bohr [...] 
used the analogy of parallel stairways [...]\" (Smith 55).", "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence.", "I never meant that....", "She left the store.", "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]) end it 'correctly segments text #075' do ps = PragmaticSegmenter::Segmenter.new(text: "His name is Mark E. Smith. a. here it is b. another c. one more\n They went to the store. It was John A. Smith. She was Jane B. Smith.", language: "en") expect(ps.segment).to eq(["His name is Mark E. Smith.", "a. here it is", "b. another", "c. one more", "They went to the store.", "It was John A. Smith.", "She was Jane B. Smith."]) end it 'correctly segments text #076' do ps = PragmaticSegmenter::Segmenter.new(text: "a) here it is b) another c) one more\n They went to the store. w) hello x) hello y) hello", language: "en") expect(ps.segment).to eq(["a) here it is", "b) another", "c) one more", "They went to the store.", "w) hello", "x) hello", "y) hello"]) end it 'correctly segments text #077' do ps = PragmaticSegmenter::Segmenter.new(text: "Hello{b^>1<b^} hello{b^>1 ["Hello world.", "My name is Armine."] ps = PragmaticSegmenter::Segmenter.new(text: "Բարև Ձեզ: Իմ անունն էԱրմինե:", language: 'hy') expect(ps.segment).to eq(["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"]) end it 'correctly segments text #005' do # "Today is Monday. I am going to work." 
==> ["Today is Monday.", "I am going to work."] ps = PragmaticSegmenter::Segmenter.new(text: "Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:", language: 'hy') expect(ps.segment).to eq(["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"]) end it 'correctly segments text #006' do # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."] ps = PragmaticSegmenter::Segmenter.new(text: "Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:", language: 'hy') expect(ps.segment).to eq(["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"]) end it 'correctly segments text #007' do # "Yes, I understood. I really love you." ==> ["Yes, I understood.", "I really love you."] ps = PragmaticSegmenter::Segmenter.new(text: "Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:", language: 'hy') expect(ps.segment).to eq(["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"]) end it 'correctly segments text #008' do # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."] ps = PragmaticSegmenter::Segmenter.new(text: "Փակիր պատուհանները: Երեկոյան անձրևում է:", language: 'hy') expect(ps.segment).to eq(["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"]) end it 'correctly segments text #009' do # "It is dark. I should go home." ==> ["It is dark.", "I should go home."] ps = PragmaticSegmenter::Segmenter.new(text: "Մութ է: Ես պետք է տուն վերադառնամ:", language: 'hy') expect(ps.segment).to eq(["Մութ է:", "Ես պետք է տուն վերադառնամ:"]) end it 'correctly segments text #010' do # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."] ps = PragmaticSegmenter::Segmenter.new(text: "Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:", language: 'hy') expect(ps.segment).to eq(["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"]) end it 'correctly segments text #011' do # "It is a new Christmas tree. We should decorate it." 
==> ["It is a new Christmas tree.", "We should decorate it."] ps = PragmaticSegmenter::Segmenter.new(text: "Տոնածառը նոր է: Պետք է այն զարդարել:", language: 'hy') expect(ps.segment).to eq(["Տոնածառը նոր է:", "Պետք է այն զարդարել:"]) end it 'correctly segments text #012' do # "I am in a hurry. I cannot wait for you." ==> ["I am in a hurry.", "I cannot wait for you."] ps = PragmaticSegmenter::Segmenter.new(text: "Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:", language: 'hy') expect(ps.segment).to eq(["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"]) end it 'correctly segments text #013' do # "Wait, we love each other. I want us to live together." ==> ["Wait, we love each other.", "I want us to live together."] ps = PragmaticSegmenter::Segmenter.new(text: "Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:", language: 'hy') expect(ps.segment).to eq(["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"]) end it 'correctly segments text #014' do # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."] ps = PragmaticSegmenter::Segmenter.new(text: "Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:", language: 'hy') expect(ps.segment).to eq(["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"]) end it 'correctly segments text #015' do # "April 24 it has started to rain... I was thinking about." ==> ["April 24 it has started to rain... I was thinking about."] ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: 'hy') expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]) end it 'correctly segments text #016' do # "It was 1960...it was winter...it was night. It was cold...emptiness." 
==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."] ps = PragmaticSegmenter::Segmenter.new(text: "1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:", language: 'hy') expect(ps.segment).to eq(["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"]) end it 'correctly segments text #017' do # "Why a computer could not do what a man could do? Simply it doesn't have a human brain." ==> ["Why a computer could not do what a man could do?", "Simply it doesn't have a human brain."] ps = PragmaticSegmenter::Segmenter.new(text: "Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:", language: 'hy') expect(ps.segment).to eq(["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"]) end it 'correctly segments text #018' do # "Numerate for me 3 things that are important for you - I answer love, knowledge, sincerity." ==> ["Numerate for me 3 things that are important for you - I answer love, knowledge, sincerity."] ps = PragmaticSegmenter::Segmenter.new(text: "Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:", language: 'hy') expect(ps.segment).to eq(["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"]) end it 'correctly segments text #019' do # "So, we are coming to the end. The logic is...simplicity and work" ==> ["So, we are coming to the end.", "Simplicity and work."] ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: 'hy') expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"]) end it 'correctly segments text #020' do # "What are you thinking? Nothing!" 
==> ["What are you thinking?", "Nothing!"] ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: 'hy') expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"]) end it 'correctly segments text #021' do # "Can we work together ?. May be what you are thinking, is possible." ==> ["Can we work together?.", "May be what you are thinking is possible."] ps = PragmaticSegmenter::Segmenter.new(text: "Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:", language: 'hy') expect(ps.segment).to eq(["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"]) end it 'correctly segments text #022' do # "Now what we have started, comes to the end. However the questions are numerous... ." ==> ["Now what we have started, comes to the end.", "However the questions are numerous... ."] ps = PragmaticSegmenter::Segmenter.new(text: "Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:", language: 'hy') expect(ps.segment).to eq(["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"]) end it 'correctly segments text #023' do # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... 
or?"] ps = PragmaticSegmenter::Segmenter.new(text: "Սիրելիս...սպասում եմ: Գնամ թ՟ե …:", language: 'hy') expect(ps.segment).to eq(["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"]) end end end context 'Language: Burmese (my)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my') expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"]) end end end context 'Language: Amharic (am)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am') expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"]) end end end context 'Language: Persian (fa)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa') expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."]) end end end context 'Language: Urdu (ur)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur') expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]) end end end context 'Language: Chinese (zh)' do describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh') expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"]) end end end context 'miscellaneous tests' do describe '#segment' do it 'handles nil' do ps = PragmaticSegmenter::Segmenter.new(text: nil) expect(ps.segment).to eq([]) end it 'handles no language' do ps = 
PragmaticSegmenter::Segmenter.new(text: 'Hello world. Hello.') expect(ps.segment).to eq(["Hello world.", "Hello."]) end it 'handles empty strings' do ps = PragmaticSegmenter::Segmenter.new(text: "\n") expect(ps.segment).to eq([]) end it 'handles empty strings' do ps = PragmaticSegmenter::Segmenter.new(text: "") expect(ps.segment).to eq([]) end it 'handles empty strings' do ps = PragmaticSegmenter::Segmenter.new(text: '') expect(ps.segment).to eq([]) end it 'has an option to not use the cleaner' do ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false) expect(ps.segment).to eq(["It was a cold", "night in the city."]) end it 'does not mutate the input string' do text = "It was a cold \nnight in the city." PragmaticSegmenter::Segmenter.new(text: text, language: "en").segment expect(text).to eq("It was a cold \nnight in the city.") end end describe '#clean' do it 'cleans the text' do ps = PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en") expect(ps.clean).to eq("It was a cold night in the city.") end it 'does not mutate the input string (cleaner)' do text = "It was a cold \nnight in the city." PragmaticSegmenter::Cleaner.new(text: text, language: "en").clean expect(text).to eq("It was a cold \nnight in the city.") end end end end