Sha256: 936b61839d79530cce2168966be0a0dc631e0e94a79abd2b2e29c5d9d4e3d3f6
Contents?: true
Size: 1.31 KB
Versions: 1
Compression:
Stored size: 1.31 KB
Contents
# -*- coding: UTF-8 -*-

# Persian module
module Persian
  # Persian tokenize class
  #
  # Stateless helpers for breaking text into word/punctuation tokens,
  # word n-grams and paragraphs. All methods are class methods.
  class Tokenizer
    # Stand-alone punctuation marks.
    SYMBOLS = ['!', '﷼', ':', '؛', '؟', '،', '-', '.'].freeze
    # Opening halves of paired delimiters.
    PAIR_PRE = ['(', '{', '«', '<', '['].freeze
    # Closing halves of paired delimiters.
    PAIR_POST = [')', '}', '»', '>', ']'].freeze
    # Quote characters that may open or close a quotation.
    PREPOST = ["'", '"'].freeze

    # Every delimiter character, escaped so it can be interpolated into a
    # regexp character class (Regexp.escape takes care of '-', ']', '[' ...).
    DELIMITER_CLASS =
      Regexp.escape((SYMBOLS + PAIR_PRE + PAIR_POST + PREPOST).join).freeze

    # Matches either a run of "word" characters (anything that is neither
    # whitespace nor a delimiter) or a run of consecutive delimiters.
    # Built once here instead of on every call to tokenize.
    TOKEN_PATTERN = /[^\s#{DELIMITER_CLASS}]+|[#{DELIMITER_CLASS}]+/.freeze

    private_constant :DELIMITER_CLASS, :TOKEN_PATTERN

    # Basic persian word tokenizer
    #
    # Whitespace separates tokens and is discarded. Inside a
    # whitespace-delimited chunk, words and runs of delimiter characters
    # become separate tokens, e.g. "«سلام»" => ["«", "سلام", "»"].
    # Consecutive delimiters stay together as one token ("))" => ["))"]).
    #
    # Unlike a single partition around the first word, the whole chunk is
    # scanned, so "سلام،دنیا" yields ["سلام", "،", "دنیا"] rather than
    # leaving "،دنیا" glued together.
    #
    # text - String to tokenize.
    #
    # Return an array of words. For empty or whitespace-only input the
    # result is [''] (kept for backward compatibility), never [].
    def self.tokenize(text)
      tokens = text.scan(TOKEN_PATTERN)
      tokens.empty? ? [''] : tokens
    end

    # Build word n-grams: every window of +num+ consecutive tokens of
    # +text+ (as produced by tokenize), joined with a single space.
    #
    # text - String to tokenize.
    # num  - Integer window size.
    #
    # Return an array of strings. The array is empty when num is larger
    # than the number of tokens, or when num is not positive (a zero or
    # negative window has no meaningful n-grams).
    def self.tokenize_more(text, num)
      return [] if num < 1

      tokenize(text).each_cons(num).map { |window| window.join(' ').strip }
    end

    # Split paragraphs
    #
    # Paragraphs are separated by line breaks; both "\n" and "\r\n" are
    # recognised so text with Windows line endings does not keep a
    # trailing "\r" on each paragraph. Empty lines are dropped.
    #
    # text - String to split.
    #
    # Return an array of paragraphs
    def self.split_paragraphs(text)
      text.split(/\r?\n/).reject(&:empty?)
    end
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
persian-0.2.2 | lib/persian/tokenizer.rb |