require 'spec_helper' RSpec.describe PragmaticSegmenter::Languages::Danish, "(da)" do context "Golden Rules" do it "Simple period to end sentence #001" do ps = PragmaticSegmenter::Segmenter.new(text: "Hej Verden. Mit navn er Jonas.", language: "da") expect(ps.segment).to eq(["Hej Verden.", "Mit navn er Jonas."]) end it "Question mark to end sentence #002" do ps = PragmaticSegmenter::Segmenter.new(text: "Hvad er dit navn? Mit nav er Jonas.", language: "da") expect(ps.segment).to eq(["Hvad er dit navn?", "Mit nav er Jonas."]) end it "Exclamation point to end sentence #003" do ps = PragmaticSegmenter::Segmenter.new(text: "There it is! I found it.", language: "da") expect(ps.segment).to eq(["There it is!", "I found it."]) end it "One letter upper case abbreviations #004" do ps = PragmaticSegmenter::Segmenter.new(text: "My name is Jonas E. Smith.", language: "da") expect(ps.segment).to eq(["My name is Jonas E. Smith."]) end it "One letter lower case abbreviations #005" do ps = PragmaticSegmenter::Segmenter.new(text: "Please turn to p. 55.", language: "da") expect(ps.segment).to eq(["Please turn to p. 55."]) end it "Two letter lower case abbreviations in the middle of a sentence #006" do ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: "da") expect(ps.segment).to eq(["Were Jane and co. at the party?"]) end it "Two letter upper case abbreviations in the middle of a sentence #007" do ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. at noon.", language: "da") expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co. at noon."]) end it "Two letter lower case abbreviations at the end of a sentence #008" do ps = PragmaticSegmenter::Segmenter.new(text: "Lad os spørge Jane og co. De burde vide det.", language: "da") expect(ps.segment).to eq(["Lad os spørge Jane og co.", "De burde vide det."]) end it "Two letter upper case abbreviations at the end of a sentence #009" do ps = PragmaticSegmenter::Segmenter.new(text: "De lukkede aftalen med Pitt, Briggs & Co. Det lukkede i går.", language: "da") expect(ps.segment).to eq(["De lukkede aftalen med Pitt, Briggs & Co.", "Det lukkede i går."]) end it "Two letter (prepositive) abbreviations #010" do ps = PragmaticSegmenter::Segmenter.new(text: "De holdt Skt. Hans i byen.", language: "da") expect(ps.segment).to eq(["De holdt Skt. Hans i byen."]) end it "Two letter (prepositive & postpositive) abbreviations #011" do ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Kirke er på 5. gade nær ved lyset.", language: "da") expect(ps.segment).to eq(["St. Michael's Kirke er på 5. gade nær ved lyset."]) end it "Possesive two letter abbreviations #012" do ps = PragmaticSegmenter::Segmenter.new(text: "That is JFK Jr.'s book.", language: "da") expect(ps.segment).to eq(["That is JFK Jr.'s book."]) end it "Multi-period abbreviations in the middle of a sentence #013" do ps = PragmaticSegmenter::Segmenter.new(text: "I visited the U.S.A. last year.", language: "da") expect(ps.segment).to eq(["I visited the U.S.A. last year."]) end it "Multi-period abbreviations at the end of a sentence #014" do ps = PragmaticSegmenter::Segmenter.new(text: "Jeg bor i E.U. Hvad med dig?", language: "da") expect(ps.segment).to eq(["Jeg bor i E.U.", "Hvad med dig?"]) end it "U.S. as sentence boundary #015" do ps = PragmaticSegmenter::Segmenter.new(text: "I live in the U.S. Hvad med dig?", language: "da") expect(ps.segment).to eq(["I live in the U.S.", "Hvad med dig?"]) end it "U.S. as non sentence boundary with next word capitalized #016" do ps = PragmaticSegmenter::Segmenter.new(text: "I work for the U.S. Government in Virginia.", language: "da") expect(ps.segment).to eq(["I work for the U.S. Government in Virginia."]) end it "U.S. as non sentence boundary #017" do ps = PragmaticSegmenter::Segmenter.new(text: "I have lived in the U.S. for 20 years.", language: "da") expect(ps.segment).to eq(["I have lived in the U.S. for 20 years."]) end it "A.M. / P.M. as non sentence boundary and sentence boundary #018" do skip "NOT IMPLEMENTED" ps = PragmaticSegmenter::Segmenter.new(text: "At 5 a.m. Hr. Smith went to the bank. He left the bank at 6 P.M. Hr. Smith then went to the store.", language: "da") expect(ps.segment).to eq(["At 5 a.m. Hr. Smith went to the bank.", "He left the bank at 6 P.M.", "Hr. Smith then went to the store."]) end it "Number as non sentence boundary #019" do ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00 in her bag.", language: "da") expect(ps.segment).to eq(["She has $100.00 in her bag."]) end it "Number as sentence boundary #020" do ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00. It is in her bag.", language: "da") expect(ps.segment).to eq(["She has $100.00.", "It is in her bag."]) end it "Parenthetical inside sentence #021" do ps = PragmaticSegmenter::Segmenter.new(text: "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", language: "da") expect(ps.segment).to eq(["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]) end it "Email addresses #022" do ps = PragmaticSegmenter::Segmenter.new(text: "Her email is Jane.Doe@example.com. I sent her an email.", language: "da") expect(ps.segment).to eq(["Her email is Jane.Doe@example.com.", "I sent her an email."]) end it "Web addresses #023" do ps = PragmaticSegmenter::Segmenter.new(text: "The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", language: "da") expect(ps.segment).to eq(["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]) end it "Single quotations inside sentence #024" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, 'This is great.' she said.", language: "da") expect(ps.segment).to eq(["She turned to him, 'This is great.' she said."]) end it "Double quotations inside sentence #025" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" she said.", language: "da") expect(ps.segment).to eq(["She turned to him, \"This is great.\" she said."]) end it "Double quotations at the end of a sentence #026" do ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" Hun held the book out to show him.", language: "da") expect(ps.segment).to eq(["She turned to him, \"This is great.\"", "Hun held the book out to show him."]) end it "Double punctuation (exclamation point) #027" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello!! Long time no see.", language: "da") expect(ps.segment).to eq(["Hello!!", "Long time no see."]) end it "Double punctuation (question mark) #028" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello?? Who is there?", language: "da") expect(ps.segment).to eq(["Hello??", "Who is there?"]) end it "Double punctuation (exclamation point / question mark) #029" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello!? Is that you?", language: "da") expect(ps.segment).to eq(["Hello!?", "Is that you?"]) end it "Double punctuation (question mark / exclamation point) #030" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello?! Is that you?", language: "da") expect(ps.segment).to eq(["Hello?!", "Is that you?"]) end it "List (period followed by parens and no period to end item) #031" do ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item 2.) The second item", language: "da") expect(ps.segment).to eq(["1.) The first item", "2.) The second item"]) end it "List (period followed by parens and period to end item) #032" do ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item. 2.) The second item.", language: "da") expect(ps.segment).to eq(["1.) The first item.", "2.) The second item."]) end it "List (parens and no period to end item) #033" do ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item 2) The second item", language: "da") expect(ps.segment).to eq(["1) The first item", "2) The second item"]) end it "List (parens and period to end item) #034" do ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item. 2) The second item.", language: "da") expect(ps.segment).to eq(["1) The first item.", "2) The second item."]) end it "List (period to mark list and no period to end item) #035" do ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item 2. The second item", language: "da") expect(ps.segment).to eq(["1. The first item", "2. The second item"]) end it "List (period to mark list and period to end item) #036" do ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item. 2. The second item.", language: "da") expect(ps.segment).to eq(["1. The first item.", "2. The second item."]) end it "List with bullet #037" do ps = PragmaticSegmenter::Segmenter.new(text: "• 9. The first item • 10. The second item", language: "da") expect(ps.segment).to eq(["• 9. The first item", "• 10. The second item"]) end it "List with hypthen #038" do ps = PragmaticSegmenter::Segmenter.new(text: "⁃9. The first item ⁃10. The second item", language: "da") expect(ps.segment).to eq(["⁃9. The first item", "⁃10. The second item"]) end it "Alphabetical list #039" do ps = PragmaticSegmenter::Segmenter.new(text: "a. The first item b. The second item c. The third list item", language: "da") expect(ps.segment).to eq(["a. The first item", "b. The second item", "c. The third list item"]) end it "Errant newlines in the middle of sentences (PDF) #040" do ps = PragmaticSegmenter::Segmenter.new(text: "This is a sentence\ncut off in the middle because pdf.", language: "da", doc_type: "pdf") expect(ps.segment).to eq(["This is a sentence cut off in the middle because pdf."]) end it "Errant newlines in the middle of sentences #041" do ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "da") expect(ps.segment).to eq(["It was a cold night in the city."]) end it "Lower case list separated by newline #042" do ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: "da") expect(ps.segment).to eq(["features", "contact manager", "events, activities"]) end it "Geo Coordinates #043" do ps = PragmaticSegmenter::Segmenter.new(text: "You can find it at N°. 1026.253.553. That is where the treasure is.", language: "da") expect(ps.segment).to eq(["You can find it at N°. 1026.253.553.", "That is where the treasure is."]) end it "Named entities with an exclamation point #044" do ps = PragmaticSegmenter::Segmenter.new(text: "She works at Yahoo! in the accounting department.", language: "da") expect(ps.segment).to eq(["She works at Yahoo! in the accounting department."]) end it "Ellipsis at end of quotation #046" do ps = PragmaticSegmenter::Segmenter.new(text: "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", language: "da") expect(ps.segment).to eq(["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]) end it "Ellipsis with square brackets #047" do ps = PragmaticSegmenter::Segmenter.new(text: "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", language: "da") expect(ps.segment).to eq(["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]) end it "Ellipsis as sentence boundary (standard ellipsis rules) #048" do ps = PragmaticSegmenter::Segmenter.new(text: "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", language: "da") expect(ps.segment).to eq(["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]) end it "Ellipsis as sentence boundary (non-standard ellipsis rules) #049" do ps = PragmaticSegmenter::Segmenter.new(text: "I never meant that.... She left the store.", language: "da") expect(ps.segment).to eq(["I never meant that....", "She left the store."]) end it "Ellipsis as non sentence boundary #050" do ps = PragmaticSegmenter::Segmenter.new(text: "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", language: "da") expect(ps.segment).to eq(["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]) end it "4-dot ellipsis #051" do ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "da") expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]) end it "No whitespace in between sentences #052" do ps = PragmaticSegmenter::Segmenter.new(text: "Hello world.I dag is Tuesday.Hr. Smith went to the store and bought 1,000.That is a lot.", language: "da") expect(ps.segment).to eq(["Hello world.", "I dag is Tuesday.", "Hr. Smith went to the store and bought 1,000.", "That is a lot."]) end end describe '#segment' do it 'correctly segments text #001' do ps = PragmaticSegmenter::Segmenter.new(text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'\nSo she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.", language: 'en') expect(ps.segment).to eq(["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'", "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."]) end it 'correctly segments text #002' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #003' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #004' do ps = PragmaticSegmenter::Segmenter.new(text: "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en') expect(ps.segment).to eq(["'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #005' do ps = PragmaticSegmenter::Segmenter.new(text: "Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud.", language: 'en') expect(ps.segment).to eq(["Down, down, down.", "Would the fall NEVER come to an end!", "'I wonder how many miles I've fallen by this time?' she said aloud."]) end it 'correctly segments text #006' do ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. 'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en') expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) end it 'correctly segments text #007' do ps = PragmaticSegmenter::Segmenter.new(text: 'A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length.', language: 'en') expect(ps.segment).to eq(["A minute is a unit of measurement of time or of angle.", "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", "The minute is not an SI unit; however, it is accepted for use with SI units.", "The symbol for minute or minutes is min.", "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."]) end it 'correctly segments text #008' do text = <<-EOF About Me...............................................................................................5 Chapter 2 ...................................................................... 6 Three Weeks Later............................................................................ 7 Better Eating........................................................................................ 8 What's the Score?.............................................................. 9 How To Calculate the Score................... 16-17 EOF ps = PragmaticSegmenter::Segmenter.new(text: text, language: 'en') expect(ps.segment).to eq(["About Me", "Chapter 2", "Three Weeks Later", "Better Eating", "What's the Score?", "How To Calculate the Score"]) end it 'correctly segments text #009' do ps = PragmaticSegmenter::Segmenter.new(text: 'I think Jun. is a great month, said Mr. Suzuki.', language: 'en') expect(ps.segment).to eq(["I think Jun. is a great month, said Mr. Suzuki."]) end it 'correctly segments text #010' do ps = PragmaticSegmenter::Segmenter.new(text: 'Jun. is a great month, said Mr. Suzuki.', language: 'en') expect(ps.segment).to eq(["Jun. is a great month, said Mr. Suzuki."]) end it 'correctly segments text #011' do ps = PragmaticSegmenter::Segmenter.new(text: "I have 1.000.00. Yay $.50 and .50! That's 600.", language: 'en') expect(ps.segment).to eq(["I have 1.000.00.", "Yay $.50 and .50!", "That's 600."]) end it 'correctly segments text #012' do ps = PragmaticSegmenter::Segmenter.new(text: '1.) This is a list item with a parens.', language: 'en') expect(ps.segment).to eq(["1.) This is a list item with a parens."]) end it 'correctly segments text #013' do ps = PragmaticSegmenter::Segmenter.new(text: '1. This is a list item.', language: 'en') expect(ps.segment).to eq(['1. This is a list item.']) end it 'correctly segments text #014' do ps = PragmaticSegmenter::Segmenter.new(text: 'I live in the U.S.A. I went to J.C. Penney.', language: 'en') expect(ps.segment).to eq(["I live in the U.S.A.", "I went to J.C. Penney."]) end it 'correctly segments text #015' do ps = PragmaticSegmenter::Segmenter.new(text: 'His name is Alfred E. Sloan.', language: 'en') expect(ps.segment).to eq(['His name is Alfred E. Sloan.']) end it 'correctly segments text #016' do ps = PragmaticSegmenter::Segmenter.new(text: 'Q. What is his name? A. His name is Alfred E. Sloan.', language: 'en') expect(ps.segment).to eq(['Q. What is his name?', 'A. His name is Alfred E. Sloan.']) end it 'correctly segments text #017' do ps = PragmaticSegmenter::Segmenter.new(text: 'Today is 11.18.2014.', language: 'en') expect(ps.segment).to eq(['Today is 11.18.2014.']) end it 'correctly segments text #018' do ps = PragmaticSegmenter::Segmenter.new(text: 'I need you to find 3 items, e.g. a hat, a coat, and a bag.', language: 'en') expect(ps.segment).to eq(['I need you to find 3 items, e.g. a hat, a coat, and a bag.']) end it 'correctly segments text #019' do ps = PragmaticSegmenter::Segmenter.new(text: "The game is the Giants vs. the Tigers at 10 p.m. I'm going are you?", language: 'en') expect(ps.segment).to eq(["The game is the Giants vs. the Tigers at 10 p.m.", "I'm going are you?"]) end it 'correctly segments text #020' do ps = PragmaticSegmenter::Segmenter.new(text: 'He is no. 5, the shortstop.', language: 'en') expect(ps.segment).to eq(['He is no. 5, the shortstop.']) end it 'correctly segments text #021' do ps = PragmaticSegmenter::Segmenter.new(text: "Remove long strings of dots........please.", language: 'en') expect(ps.segment).to eq(["Remove long strings of dots please."]) end it 'correctly segments text #022' do ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n.\n\n\nPricing Additionl Info\n", language: 'en') expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"]) end it 'correctly segments text #023' do ps = PragmaticSegmenter::Segmenter.new(text: "As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid.", language: 'en') expect(ps.segment).to eq(["As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid."]) end it 'correctly segments text #024' do ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: 'en') expect(ps.segment).to eq(['features', 'contact manager', 'events, activities']) end it 'correctly segments text #025' do ps = PragmaticSegmenter::Segmenter.new(text: "Git rid of unnecessary white space.", language: 'en') expect(ps.segment).to eq(["Git rid of unnecessary white space."]) end it 'correctly segments text #026' do ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n. Pricing Additionl Info", language: 'en') expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"]) end it 'correctly segments text #027' do ps = PragmaticSegmenter::Segmenter.new(text: "Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."]) end it 'correctly segments text #028' do ps = PragmaticSegmenter::Segmenter.new(text: "10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"]) end it 'correctly segments text #029' do ps = PragmaticSegmenter::Segmenter.new(text: "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", language: 'en', doc_type: 'pdf') expect(ps.segment).to eq(["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"]) end it 'correctly segments text #030' do ps = PragmaticSegmenter::Segmenter.new(text: "I have 600. How many do you have?", language: 'en') expect(ps.segment).to eq(["I have 600.", "How many do you have?"]) end it 'correctly segments text #031' do ps = PragmaticSegmenter::Segmenter.new(text: "\n3\n\nIntroduction\n\n", language: 'en') expect(ps.segment).to eq(["Introduction"]) end it 'correctly segments text #032' do ps = PragmaticSegmenter::Segmenter.new(text: "\nW\nA\nRN\nI\nNG\n", language: 'en') expect(ps.segment).to eq(["WARNING"]) end it 'correctly segments text #033' do ps = PragmaticSegmenter::Segmenter.new(text: "\n\n\nW\nA\nRN\nI\nNG\n \n/\n \nA\nV\nE\nR\nT\nI\nS\nE\nM\nE\nNT\n", language: 'en') expect(ps.segment).to eq(["WARNING", "AVERTISEMENT"]) end it 'correctly segments text #034' do ps = PragmaticSegmenter::Segmenter.new(text: '"Help yourself, sweetie," shouted Candy and gave her the cookie.', language: 'en') expect(ps.segment).to eq(["\"Help yourself, sweetie,\" shouted Candy and gave her the cookie."]) end it 'correctly segments text #035' do ps = PragmaticSegmenter::Segmenter.new(text: "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en') expect(ps.segment).to eq(["Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."]) end it 'correctly segments text #036' do ps = PragmaticSegmenter::Segmenter.new(text: "This is a test. Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en') expect(ps.segment).to eq(["This is a test.", "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."]) end it 'correctly segments text #037' do ps = PragmaticSegmenter::Segmenter.new(text: "This was because it was an offensive weapon, designed to fight at a distance up to 400 yd \n( 365.8 m ).", language: 'en') expect(ps.segment).to eq(["This was because it was an offensive weapon, designed to fight at a distance up to 400 yd ( 365.8 m )."]) end it 'correctly segments text #038' do ps = PragmaticSegmenter::Segmenter.new(text: "“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?” Others yet say: \"Should we be scared about these 'protests'?\"", language: 'en') expect(ps.segment).to eq(["“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?”", "Others yet say: \"Should we be scared about these 'protests'?\""]) end it 'correctly segments text #039' do ps = PragmaticSegmenter::Segmenter.new(text: "www.testurl.Awesome.com", language: 'en') expect(ps.segment).to eq(["www.testurl.Awesome.com"]) end it 'correctly segments text #040' do ps = PragmaticSegmenter::Segmenter.new(text: "http://testurl.Awesome.com", language: 'en') expect(ps.segment).to eq(["http://testurl.Awesome.com"]) end it 'correctly segments text #041' do ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church in is a church.", language: 'en') expect(ps.segment).to eq(["St. Michael's Church in is a church."]) end it 'correctly segments text #042' do ps = PragmaticSegmenter::Segmenter.new(text: "JFK Jr.'s book is on sale.", language: 'en') expect(ps.segment).to eq(["JFK Jr.'s book is on sale."]) end it 'correctly segments text #043' do ps = PragmaticSegmenter::Segmenter.new(text: "This is e.g. Mr. Smith, who talks slowly... And this is another sentence.", language: 'en') expect(ps.segment).to eq(["This is e.g. Mr. Smith, who talks slowly...", "And this is another sentence."]) end it 'correctly segments text #044' do ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en') expect(ps.segment).to eq(["Leave me alone!, he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."]) end it 'correctly segments text #045' do ps = PragmaticSegmenter::Segmenter.new(text: "This is the U.S. Senate my friends. Yes. It is!", language: 'en') expect(ps.segment).to eq(["This is the U.S. Senate my friends.", "Yes.", "It is!"]) end it 'correctly segments text #046' do ps = PragmaticSegmenter::Segmenter.new(text: "Send it to P.O. box 6554", language: 'en') expect(ps.segment).to eq(["Send it to P.O. box 6554"]) end it 'correctly segments text #047' do ps = PragmaticSegmenter::Segmenter.new(text: "There were 500 cases in the U.S. The U.S. Commission asked the U.S. Government to give their opinion on the issue.", language: 'en') expect(ps.segment).to eq(["There were 500 cases in the U.S.", "The U.S. Commission asked the U.S. Government to give their opinion on the issue."]) end it 'correctly segments text #048' do ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en') expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"]) end it 'correctly segments text #049' do ps = PragmaticSegmenter::Segmenter.new(text: "Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990. `So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna. Later, he recalls the words of his Marxist mentor: `The people! Theft! The holy fire!'", language: 'en') expect(ps.segment).to eq(["Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.", "'So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna.", "Later, he recalls the words of his Marxist mentor: 'The people! Theft! The holy fire!'"]) end it 'correctly segments text #050' do ps = PragmaticSegmenter::Segmenter.new(text: "He climbed Mt. Fuji.", language: 'en') expect(ps.segment).to eq(["He climbed Mt. Fuji."]) end it 'correctly segments text #051' do ps = PragmaticSegmenter::Segmenter.new(text: "He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun.", language: 'en') expect(ps.segment).to eq(["He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun."]) end it 'correctly segments text #052' do ps = PragmaticSegmenter::Segmenter.new(text: "Test strange period.Does it segment correctly.", language: 'en') expect(ps.segment).to eq(["Test strange period.", "Does it segment correctly."]) end it 'correctly segments text #053' do ps = PragmaticSegmenter::Segmenter.new(text: "
This is a test. Another test.
\n\n\n