# -*- encoding : utf-8 -*-
require 'pragmatic_segmenter/punctuation_replacer'

module PragmaticSegmenter
  # Finds quoted or parenthesised spans in a text and hands each set of
  # spans to PragmaticSegmenter::PunctuationReplacer so that punctuation
  # inside them can be replaced.
  #
  # NOTE(review): every step receives the same string object and the
  # intermediate return values are discarded, so this class presumably
  # relies on PunctuationReplacer#replace modifying the string in place --
  # confirm against PunctuationReplacer before changing how text is passed.
  class BetweenPunctuation
    # A single-quoted span whose opening quote follows whitespace; an
    # apostrophe directly followed by an ASCII letter does not end the span.
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    # A double-quoted span; backslash escapes are allowed inside it.
    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # A span opened by « and closed by »; backslash escapes are allowed.
    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # A span opened by “ and closed by ”; backslash escapes are allowed.
    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # A parenthesised span containing no unescaped parentheses of its own.
    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    attr_reader :text

    # @param text [String] the text to scan for quoted/parenthesised spans
    def initialize(text:)
      @text = text
    end

    # Runs every between-punctuation replacement against #text.
    #
    # @return whatever the final replacement step (slanted quotes) returns
    def replace
      sub_punctuation_between_quotes_and_parens(text)
    end

    private

    # Applies the replacement steps in a fixed order: single quotes, double
    # quotes, parentheses, arrow quotes, slanted quotes. Only the result of
    # the last step is returned.
    def sub_punctuation_between_quotes_and_parens(txt)
      sub_punctuation_between_single_quotes(txt)
      sub_punctuation_between_double_quotes(txt)
      sub_punctuation_between_parens(txt)
      sub_punctuation_between_quotes_arrow(txt)
      sub_punctuation_between_quotes_slanted(txt)
    end

    # Replaces punctuation inside (...) spans.
    def sub_punctuation_between_parens(txt)
      replace_punctuation_in_matches(txt, BETWEEN_PARENS_REGEX)
    end

    # Replaces punctuation inside '...' spans.
    def sub_punctuation_between_single_quotes(txt)
      replace_punctuation_in_matches(txt, BETWEEN_SINGLE_QUOTES_REGEX)
    end

    # Replaces punctuation inside "..." spans.
    def sub_punctuation_between_double_quotes(txt)
      replace_punctuation_in_matches(txt, BETWEEN_DOUBLE_QUOTES_REGEX)
    end

    # Replaces punctuation inside «...» spans.
    def sub_punctuation_between_quotes_arrow(txt)
      replace_punctuation_in_matches(txt, BETWEEN_QUOTE_ARROW_REGEX)
    end

    # Replaces punctuation inside “...” spans.
    def sub_punctuation_between_quotes_slanted(txt)
      replace_punctuation_in_matches(txt, BETWEEN_QUOTE_SLANTED_REGEX)
    end

    # Shared step: collects every substring of +txt+ matching +pattern+ and
    # passes the matches, together with +txt+, to PunctuationReplacer.
    #
    # @param txt [String] the text being processed
    # @param pattern [Regexp] one of the BETWEEN_*_REGEX constants
    # @return the value returned by PunctuationReplacer#replace
    def replace_punctuation_in_matches(txt, pattern)
      matches = txt.scan(pattern)
      replacer = PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: matches,
        text: txt
      )
      replacer.replace
    end
  end
end