Sha256: 9b5bca4eec04f42fde86792cfabf7ad424eaa9dc01df5840d165b6cb352df28a
Contents?: true
Size: 842 Bytes
Versions: 1
Compression:
Stored size: 842 Bytes
Contents
# frozen_string_literal: true

# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License::   LGPL

require_relative 'token'

module ClassifierReborn
  module Tokenizer
    # Whitespace-based tokenizer: splits input on whitespace, yielding
    # stemmable word tokens plus one non-stemmable token per symbol
    # character. Intended for languages written with spaces between
    # words, such as English and French.
    module Whitespace
      module_function

      # Tokenizes +str+ and returns an Array of Token objects:
      # first the lower-cased words (stemmable, possible stopwords),
      # then each individual non-word, non-space character
      # (neither stemmable nor a stopword candidate).
      def call(str)
        word_tokens = str.gsub(/[^\p{WORD}\s]/, '').downcase.split.map do |term|
          Token.new(term, stemmable: true, maybe_stopword: true)
        end
        symbol_chars = str.scan(/[^\s\p{WORD}]/)
        punct_tokens = symbol_chars.map do |mark|
          Token.new(mark, stemmable: false, maybe_stopword: false)
        end
        word_tokens + punct_tokens
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
classifier-reborn-2.3.0 | lib/classifier-reborn/extensions/tokenizer/whitespace.rb |