Sha256: 901032375dfcf4cb67b5f3f35729785da3c486c6189fe1e5e68db4907d457cf3
Contents?: true
Size: 570 Bytes
Versions: 6
Compression:
Stored size: 570 Bytes
Contents
# -*- encoding : utf-8 -*-

module PragmaticSegmenter
  # Splits text at sentence boundary punctuation marks.
  #
  # The text is scanned left to right with SENTENCE_BOUNDARY_REGEX and every
  # match becomes one segment. Text that matches none of the alternatives
  # (for example trailing words with no terminating mark) is not returned.
  class SentenceBoundaryPunctuation
    # Alternatives are tried in the order listed. The first six keep a
    # bracketed or quoted span together as one segment when it is followed
    # by whitespace and an ASCII capital letter. The last one is the general
    # case: from the first non-space character up to the nearest terminator.
    #
    # The fullwidth stop, exclamation and question marks are written as
    # \u escapes. The character class previously listed the ASCII marks
    # twice (a duplicated-range warning), which is what is left when the
    # fullwidth forms get folded to ASCII; escapes cannot be folded that way.
    #
    # NOTE(review): the trailing symbols in the terminator class are presumably
    # placeholder markers inserted by other parts of the library - confirm.
    SENTENCE_BOUNDARY_REGEX = /
        \u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])   # fullwidth parentheses, whitespace optional
      | \u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])    # CJK corner brackets
      | \((?:[^\)]){2,}\)(?=\s[A-Z])                   # ASCII parentheses, two or more characters inside
      | '(?:[^'])*'(?=\s[A-Z])                         # single-quoted span
      | "(?:[^"])*"(?=\s[A-Z])                         # double-quoted span
      | “(?:[^”])*”(?=\s[A-Z])                         # curly double-quoted span
      | \S.*?[。.\u{ff0e}!\u{ff01}?\u{ff1f}ȸȹ☉☈☇☄]     # shortest run ending in a terminator
    /x

    # @return [String] the text given to the constructor
    attr_reader :text

    # @param text [String] the text to split into segments
    def initialize(text:)
      @text = text
    end

    # Scans the text with SENTENCE_BOUNDARY_REGEX.
    #
    # @return [Array<String>] the matched segments in order of appearance;
    #   empty when nothing in the text matches
    def split
      text.scan(SENTENCE_BOUNDARY_REGEX)
    end
  end
end
Version data entries
6 entries across 6 versions & 1 rubygems