Sha256: 8d7226d4bcf220b60db6a3409155c28c3b72deea148f3630a5f8aa50fb8676dc
Contents?: true
Size: 582 Bytes
Versions: 7
Compression:
Stored size: 582 Bytes
Contents
# -*- encoding : utf-8 -*-

module PragmaticSegmenter
  # This class splits text at sentence boundary punctuation marks.
  #
  # Usage:
  #   SentenceBoundaryPunctuation.new(text: "Hello world. How are you?").split
  #   # => ["Hello world.", "How are you?"]
  class SentenceBoundaryPunctuation
    # Alternatives, tried in order at each scan position:
    #   1. a fullwidth-parenthesised span (U+FF08 ... U+FF09) followed by an
    #      optional whitespace character and an ASCII capital letter
    #   2. a corner-bracketed span (U+300C ... U+300D) followed by whitespace
    #      and an ASCII capital letter
    #   3. an ASCII-parenthesised span of at least two characters followed by
    #      whitespace and an ASCII capital letter
    #   4-6. a span in single, double or curly double quotes whose last inner
    #      character is not a comma, followed by whitespace and an ASCII
    #      capital letter
    #   7. otherwise, the shortest run starting at a non-whitespace character
    #      and ending at a terminator from the final character class
    #
    # The final character class lists every terminator exactly once, in both
    # ASCII and fullwidth forms: ideographic full stop, fullwidth full stop
    # (U+FF0E), ".", fullwidth exclamation mark (U+FF01), "!", "?", and
    # fullwidth question mark (U+FF1F). The fullwidth forms are written as
    # \u{...} escapes so they cannot be silently folded to their ASCII
    # look-alikes, which would leave duplicated entries in the class and stop
    # text ending in fullwidth punctuation from being split.
    #
    # NOTE(review): the trailing symbols (ȸ ȹ ☉ ☈ ☇ ☄) are presumably
    # placeholder characters substituted for punctuation by an earlier
    # processing step outside this file; confirm against the callers.
    SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。\u{ff0e}.\u{ff01}!?\u{ff1f}ȸȹ☉☈☇☄]/

    # The text handed to the constructor.
    attr_reader :text

    # text - the String to be segmented.
    def initialize(text:)
      @text = text
    end

    # Returns an Array of Strings, one per match of SENTENCE_BOUNDARY_REGEX,
    # in order of appearance. The pattern has no capture groups, so each
    # element is the full matched substring. Text after the last terminator
    # that does not match any alternative is not included in the result.
    def split
      text.scan(SENTENCE_BOUNDARY_REGEX)
    end
  end
end
Version data entries
7 entries across 7 versions & 1 rubygems