module PragmaticSegmenter
  # This is an opinionated class that removes errant newlines,
  # xhtml, inline formatting, etc.
  class Cleaner
    # Search-and-replace rules used for cleaning. Each constant is built as
    # Rule.new(pattern, replacement); Rule is defined elsewhere in the project.
    module Rules
      # Removes a newline that is immediately followed by one or two ASCII
      # letters and another newline.
      # Rubular: http://rubular.com/r/V57WnM9Zut
      NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')

      # Replaces "newline, space, newline" with a single carriage return.
      # Rubular: http://rubular.com/r/dMxp5MixFS
      DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")

      # Replaces two consecutive newlines with a single carriage return.
      # Rubular: http://rubular.com/r/H6HOJeA8bq
      DoubleNewLineRule = Rule.new(/\n\n/, "\r")

      # Removes a newline that is followed by a period and then whitespace.
      # Rubular: http://rubular.com/r/FseyMiiYFT
      NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')

      # Replaces a newline with a carriage return.
      ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")

      # Turn the two-character sequences backslash-n / backslash-r into the
      # real control characters.
      EscapedNewLineRule = Rule.new(/\\n/, "\n")
      EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")

      # Same as above, but for sequences with a stray space between the
      # backslash and the letter ("\ n", "\ r").
      TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
      TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")

      # Removes inline formatting markers of the form {b^>123<b^}.
      # NOTE(review): the pattern was truncated in the damaged source (only
      # `\{b\^>\d*<b\^\}|\{b\^>\d*` survived). The first alternative is
      # assumed to have been the HTML-entity-escaped form (&gt; / &lt;),
      # since two identical alternatives would be redundant -- confirm
      # against the upstream file.
      # Rubular: http://rubular.com/r/bAJrhyLNeZ
      InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')

      # NOTE(review): text between InlineFormattingRule and the HTML tag rule
      # was lost in the damaged source; any further rules that lived here
      # must be restored from the upstream file.

      # Rules that strip HTML markup.
      # NOTE(review): the `module HTML` header is reconstructed -- the source
      # had one more `end` than visible openers; confirm the module name.
      module HTML
        # Removes literal opening/closing/self-closing HTML tags, with or
        # without attributes.
        # NOTE(review): only the tail `\s]+))?)+\s*|\s*)\/?>/` of this
        # pattern survived; the head is reconstructed -- confirm upstream.
        HTMLTagRule = Rule.new(/<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[\^'">\s]+))?)+\s*|\s*)\/?>/, '')

        # Removes entity-escaped HTML tags (text of the form &lt;...gt;).
        # The leading `&lt;` had been decoded to a bare `<` in the damaged
        # source, which contradicted the trailing `gt;`.
        # Rubular: http://rubular.com/r/XZVqMPJhea
        EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')

        All = [HTMLTagRule, EscapedHTMLTagRule]
      end

      # Newline rules grouped under the PDF namespace.
      module PDF
        # Removes a newline that is preceded by a non-newline character plus
        # one whitespace character and followed by a non-whitespace character.
        # Rubular: http://rubular.com/r/UZAVcwqck8
        NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')

        # Replaces a newline that is followed by a lowercase ASCII letter
        # with a single space.
        # Rubular: http://rubular.com/r/eaNwGavmdo
        NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
      end
    end
  end
end