# -*- encoding : utf-8 -*-

module PragmaticSegmenter
  module Rules
    # Rules that strip HTML markup from a text.
    module HtmlRules
      # Matches a literal tag such as <b> or </span>.
      # Rubular: http://rubular.com/r/ENrVFMdJ8v
      HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')

      # Matches an entity-escaped tag such as &lt;b&gt;.
      # The pattern must start with the literal entity "&lt;"; a bare "<"
      # here would make this rule match unescaped markup instead.
      # Rubular: http://rubular.com/r/XZVqMPJhea
      EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')

      All = [HTMLTagRule, EscapedHTMLTagRule]
    end
  end

  # This is an opinionated class that removes errant newlines,
  # xhtml, inline formatting, etc.
  class Cleaner
    include Rules

    # Drops a newline that is followed by a one- or two-letter line.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')

    # A newline preceded by whitespace and followed by a lowercase letter
    # or an opening parenthesis.
    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/

    # Rubular: http://rubular.com/r/UZAVcwqck8
    PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')

    # Rubular: http://rubular.com/r/eaNwGavmdo
    PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')

    # Matches inline formatting markers in both their entity-escaped and
    # literal forms.
    # NOTE(review): the tail of this pattern and the replacement argument
    # were lost when the file was damaged; the second alternative was
    # completed by symmetry with the first and '' was assumed as the
    # replacement. Confirm against version control.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')

    # NOTE(review): a span of this file was lost at this point (everything
    # between the end of InlineFormattingRule and the example below). The
    # methods in this class reference names that are defined nowhere in the
    # visible file; they were presumably declared in the lost span or are
    # supplied through `include Rules`. Restore them from version control:
    #
    #   rules:   DoubleNewLineWithSpaceRule, DoubleNewLineRule,
    #            NewLineFollowedByBulletRule, NewLineFollowedByPeriodRule,
    #            ReplaceNewlineWithCarriageReturnRule, EscapedNewLineRule,
    #            EscapedCarriageReturnRule, QuotationsFirstRule,
    #            QuotationsSecondRule, TableOfContentsRule,
    #            ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
    #   readers: text, doc_type (and the constructor that sets them)

    # Cleans the text of unwanted formatting.
    #
    # Example:
    #   >> text = "This is a sentence\ncut off in the middle because pdf."
    #   >> PragmaticSegmenter::Cleaner.new(text: text).clean
    #   => "This is a sentence cut off in the middle because pdf."
    #
    # Arguments (of the constructor, which is not visible here - confirm):
    #    text:       (String)  *required
    #    language:   (String)  *optional
    #                (two-digit ISO 639-1 code e.g. 'en')
    #    doc_type:   (String)  *optional
    #                (e.g. 'pdf')
    #
    # Returns nil when there is no text; otherwise returns the result of the
    # final rule application.
    def clean
      return unless text
      @clean_text = remove_all_newlines(text)
      # The return values of the following steps are discarded, so they rely
      # on `apply` modifying @clean_text in place (presumably - `apply` is
      # defined outside this file).
      replace_double_newlines(@clean_text)
      replace_newlines(@clean_text)
      replace_escaped_newlines(@clean_text)
      @clean_text.apply(HtmlRules::All)
      @clean_text.apply(InlineFormattingRule)
      clean_quotations(@clean_text)
      clean_table_of_contents(@clean_text)
      clean_consecutive_characters(@clean_text)
    end

    private

    # Removes newlines that fall mid-sentence, then newlines that fall
    # mid-word.
    def remove_all_newlines(txt)
      clean_text = remove_newline_in_middle_of_sentence(txt)
      remove_newline_in_middle_of_word(clean_text)
    end

    # Removes, in place, every newline matching
    # NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX inside each period-free run of
    # characters. Returns txt.
    def remove_newline_in_middle_of_sentence(txt)
      # `scan` without a block collects every segment before anything is
      # mutated, so changing txt inside the loop is safe.
      segments = txt.scan(/(?:[^\.])*/)
      segments.each do |segment|
        next unless segment.include?("\n")
        joined = segment.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
        next if joined == segment
        # Block form on purpose: a replacement *string* would treat
        # backslash sequences in the text (\\, \1, \0 ...) as
        # back-references and corrupt the output.
        txt.gsub!(segment) { joined }
      end
      txt
    end

    def remove_newline_in_middle_of_word(txt)
      txt.apply(NewLineInMiddleOfWordRule)
    end

    def replace_escaped_newlines(txt)
      txt.apply(EscapedNewLineRule).
          apply(EscapedCarriageReturnRule)
    end

    def replace_double_newlines(txt)
      txt.apply(DoubleNewLineWithSpaceRule).
          apply(DoubleNewLineRule)
    end

    # Uses the PDF-specific line-break handling when doc_type is 'pdf',
    # and the generic newline rules otherwise.
    def replace_newlines(txt)
      if doc_type.eql?('pdf')
        remove_pdf_line_breaks(txt)
      else
        txt.apply(NewLineFollowedByPeriodRule).
            apply(ReplaceNewlineWithCarriageReturnRule)
      end
    end

    def remove_pdf_line_breaks(txt)
      txt.apply(NewLineFollowedByBulletRule).
          apply(PDF_NewLineInMiddleOfSentenceRule).
          apply(PDF_NewLineInMiddleOfSentenceNoSpacesRule)
    end

    def clean_quotations(txt)
      txt.apply(QuotationsFirstRule).
          apply(QuotationsSecondRule)
    end

    # Applies the table-of-contents rule and then the same two rules that
    # clean_consecutive_characters applies.
    def clean_table_of_contents(txt)
      txt.apply(TableOfContentsRule).
          apply(ConsecutivePeriodsRule).
          apply(ConsecutiveForwardSlashRule)
    end

    def clean_consecutive_characters(txt)
      txt.apply(ConsecutivePeriodsRule).
          apply(ConsecutiveForwardSlashRule)
    end
  end
end