# encoding: utf-8
require 'set'
require 'linguistics'

Linguistics.use(:en, classes: [String])

module Ebooks
  module NLP
    # We don't necessarily want to use all of this stuff all the time.
    # Only load it when it is needed.

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    def self.tokenizer
      # This tokenizer is used for dividing sentences into words.
      # It's too slow for finding sentences in paragraphs, hence tactful.
      require 'tokenizer'
      @tokenizer ||= Tokenizer::Tokenizer.new(:en)
    end

    def self.tactful
      require 'tactful_tokenizer'
      @tactful ||= TactfulTokenizer::Model.new
    end

    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    def self.stemmer
      require 'lingua/stemmer'
      @stemmer ||= Lingua::Stemmer.new
    end

    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions which wrap the above

    def self.sentences(text)
      tactful.tokenize_text(text)
    end

    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    def self.tokenize(sentence)
      # This is hacky, but an ad hoc approach seems to be
      # most reliable for now. Tokenization libraries have oddities
      # that are hard to correct.
      sentence.split(/\s/).map do |token|
        exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^http/]
        if exceptions.find { |r| r.match(token) }
          token
        else
          token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
        end
      end.flatten
    end

    def self.tokenset(sentence)
      tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
      tokens.map(&:downcase)
            .reject { |token| stopwords.include?(token) }
            .to_set
    end

    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)

      if p1 && p2      # "foo?!"
        false
      elsif !p1 && p2  # "foo."
        false
      elsif p1 && !p2  # "foo. rah"
        true
      else             # "foo rah"
        true
      end
    end

    def self.reconstruct(tokens)
      # Put tokens back together into a nice-looking sentence
      text = ""
      last_token = nil
      tokens.each do |token|
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Deliberately limit our punctuation handling to stuff we can do consistently.
    # It'll just be a part of a token if we don't split it out, and that's fine.
    PUNCTUATION = ".?!,"

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.unmatched_enclosers?(text)
      # Weird quotes are an instant giveaway. Let's do paren-matching.
      enclosers = ['**', '""', '()', '[]', '``']

      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end
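
# Illustrative usage sketch (not part of the library). Assumes the DATA_PATH
# constant defined elsewhere in the gem points at the bundled word lists, and
# that the lazily-required gems above (tactful_tokenizer, htmlentities, ...)
# are installed. The input string is made up for the example.
#
#   text     = Ebooks::NLP.normalize("He said “hello”… And then he left.")
#   sentence = Ebooks::NLP.sentences(text).first   # first detected sentence
#   tokens   = Ebooks::NLP.tokenize(sentence)      # words with trailing .?!, split off
#   Ebooks::NLP.tokenset(tokens)                   # lowercased Set minus stopwords
#   Ebooks::NLP.reconstruct(tokens)                # rejoins tokens with sensible spacing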