require 'ffi/aspell'
require 'open3'
require 'set'
require 'tmpdir'
require 'yaml'
require 'damerau-levenshtein'
require 'clausewitz/spelling/results'

module Clausewitz; module Spelling
  # Spell-checks the text entries of YAML-like localisation files against an
  # English and a Spanish Aspell dictionary plus an optional custom word list.
  class Checker
    # Marker character; the marker plus the single character following it is
    # stripped from both ends of a word before the word is checked.
    SQUIGGLE = '§'.freeze

    # Runs of markers/punctuation at the start and at the end of a word.
    LEADING_DECORATION  = /^(#{SQUIGGLE}.|[[:punct:]])+/
    TRAILING_DECORATION = /(#{SQUIGGLE}.|[[:punct:]])+$/
    private_constant :LEADING_DECORATION, :TRAILING_DECORATION

    # Set of custom words that are always accepted as correctly spelled.
    attr_accessor :dict_words

    # Builds the spellers.
    #
    # Recognised keys of +opts+:
    # * :suggestion_count - maximum suggestions per misspelling (default 3).
    # * :english_dialect  - suffix for the English dictionary (default 'GB',
    #                       giving "en_GB").
    # * :spanish_dialect  - suffix for the Spanish dictionary; when absent the
    #                       plain "es" dictionary is used.
    # * :custom_wordlist  - path to a file with one custom word per line.
    #
    # Raises RuntimeError when the custom word list file does not exist or the
    # Aspell dictionary for it cannot be generated.
    def initialize(opts = {})
      @suggestion_count = opts[:suggestion_count] || 3

      @english_dialect = "en_#{opts[:english_dialect] || 'GB'}"
      @en_speller = FFI::Aspell::Speller.new(
        @english_dialect, encoding: 'UTF-8'
      )

      @spanish_dialect =
        opts[:spanish_dialect] ? "es_#{opts[:spanish_dialect]}" : 'es'
      @es_speller = FFI::Aspell::Speller.new(
        @spanish_dialect, encoding: 'UTF-8'
      )

      @dict_words = Set.new
      dict_path = opts[:custom_wordlist]
      if dict_path
        raise "No such file #{dict_path}!" unless File.exist?(dict_path)
        @dict_words = Set.new(File.read(dict_path).lines.map(&:chomp))
        @custom_words_filepath = generate_word_list
        @en_speller.set('extra-dicts', @custom_words_filepath)
      end
      @en_speller.set('ignore-accents', true)
    end

    # Checks a single file and returns one of:
    # * BadFormatFileResult    - the file is not parseable YAML,
    # * UnknownLangsFileResult - some top-level key is not a language key,
    # * FileResults            - per-language results otherwise.
    def check_file(file_path)
      $stderr.puts "Checking #{file_path}..."
      # An empty (or comment-only) document parses to nil/false rather than a
      # mapping; treat it as a file without any languages.
      loc = load_file(file_path) || {}

      # Poorly formatted YAML files often lack proper indentation; you can
      # easily discover this by checking to make sure all top level keys are
      # actual language names.
      bad_keys = loc.keys.select { |key| bad_lang_key(key) }
      unless bad_keys.empty?
        return UnknownLangsFileResult.new(
          file_path, bad_keys.map { |key| unsmudge_key(key) }
        )
      end

      results = loc.map { |lang, entries| check_entries(lang, entries) }
      FileResults.new(file_path, results)
    rescue Psych::SyntaxError => e
      # If we fail to load the file it's probably busted.
      BadFormatFileResult.new(file_path, e)
    end

    # Checks every path in +file_paths+ (a single path or a list), skipping
    # directories, and wraps the per-file results in an OverallResults.
    def check_files(file_paths)
      files = Array(file_paths).reject { |path| File.directory?(path) }
      OverallResults.new(files.map { |path| check_file(path) })
    end

    # Checks all key/text pairs of one language and returns a LangResults
    # holding only the keys that have at least one misspelling. A nil
    # +entries+ yields an empty result.
    def check_entries(lang, entries)
      key_results = (entries || {}).map { |key, text| check_entry(key, text) }
      flagged = key_results.reject { |result| result.check_results.empty? }
      LangResults.new(lang, flagged)
    end

    # Checks the text of one entry. Each distinct misspelled word is reported
    # once, with leading/trailing markers and punctuation removed.
    def check_entry(key, text)
      misspellings = []
      preprocess_entry(text).split(' ').each do |raw_word|
        next if check_word(raw_word)

        word = strip_decoration(raw_word)
        next if misspellings.any? { |ms| ms.misspelled_word == word }

        misspellings << CheckResult.new(word, suggest_words(word))
      end
      KeyResults.new(unsmudge_key(key), misspellings)
    end

    # Returns a truthy value when +word+ needs no report: it contains more
    # than one '.', is not a real word (see #not_word?), is in the custom
    # word list, or is accepted by the English or the Spanish speller.
    #
    # +word+ is not modified, so frozen strings are accepted.
    def check_word(word)
      return true if word.count('.') > 1

      bare = strip_decoration(word)
      not_word?(bare) ||
        @dict_words.include?(bare) ||
        @en_speller.correct?(bare) ||
        @es_speller.correct?(bare)
    end

    # Reads +file_path+ as UTF-8, rewrites versioned keys so they are valid
    # YAML (see #smudge_key) and parses the result.
    #
    # Raises Psych::SyntaxError when the contents are not valid YAML.
    def load_file(file_path)
      contents = File.read(file_path, encoding: 'UTF-8')
      # String#lines keeps the line terminators, so a plain join restores the
      # original layout and keeps parser line numbers accurate.
      smudged = contents.lines.map { |line| smudge_key(line) }.join
      # NOTE(review): on Psych versions before 4.0 YAML.load can instantiate
      # arbitrary objects; consider YAML.safe_load if files may be untrusted.
      YAML.load(smudged)
    end

    # True when +word+ is a percentage, a number, an ordinal or starts with
    # a pound sign, i.e. something a dictionary lookup makes no sense for.
    def not_word?(word)
      is_percentage?(word) ||
        is_number?(word) ||
        is_ordinal?(word) ||
        /^£/.match?(word)
    end

    private

    # A top-level key is only acceptable when it looks like "l_<something>".
    def bad_lang_key(key)
      key.to_s !~ /^l_.+/
    end

    # Digits followed by an English ordinal suffix.
    # NOTE(review): the pattern is unanchored, so it matches anywhere in the
    # word (e.g. inside "x1sty"); confirm whether that is intended.
    def is_ordinal?(word)
      /[0-9]+(th|st|nd|rd)/.match?(word)
    end

    # True when Kernel#Float can parse +word+.
    def is_number?(word)
      Float(word)
      true
    rescue ArgumentError, TypeError
      false
    end

    # Tries to detect if a word is a percentage and can be skipped.
    def is_percentage?(word)
      /(-|\+)?[0-9]+(\.[0-9]+)?%/.match?(word) ||
        /%(-|\+)?[0-9]+(\.[0-9]+)?/.match?(word)
    end

    # Loads our custom wordlist into a temporary Aspell dictionary.
    # This way Aspell won't yell at us for custom words and will also
    # potentially select from this list as suggestions for misspelled words.
    #
    # Returns the path of the generated dictionary. The temporary directory
    # holding it is not removed here.
    # Raises RuntimeError when aspell exits unsuccessfully.
    def generate_word_list
      dir = Dir.mktmpdir('custom-wordlist-')
      output = File.join(dir, 'fallout_words.wlst')
      cmd = %W[
        aspell --lang=en --encoding=UTF-8 create master #{output}
      ]
      word_list = @dict_words.map { |word| "#{word}\n" }.join
      # capture2e feeds stdin and drains stdout/stderr concurrently, so a
      # chatty aspell cannot block on a full pipe.
      aspell_output, status = Open3.capture2e(*cmd, stdin_data: word_list)
      unless status.success?
        raise "Could not generate custom word list! #{aspell_output}".strip
      end
      output
    end

    # Removes every "[...]" token from the entry text. The match is
    # non-greedy so text between two bracketed tokens is still checked.
    # nil (an entry without text) is treated as an empty string.
    def preprocess_entry(entry)
      entry.to_s.gsub(/\[.+?\]/, '')
    end

    # Rewrites the first "KEY:<n> " of a line to "KEY!!!<n>: " so the line
    # becomes a plain YAML mapping entry; #unsmudge_key reverses this.
    def smudge_key(key)
      key.sub(/\:([0-9]+) /, '!!!\1: ')
    end

    # Strips leading and trailing marker codes and punctuation from +word+
    # and returns the result as a new string.
    def strip_decoration(word)
      word.gsub(LEADING_DECORATION, '').gsub(TRAILING_DECORATION, '')
    end

    # Returns up to @suggestion_count suggestions for +word+, taken from both
    # spellers and the custom word list and ordered by Damerau-Levenshtein
    # distance to +word+. Words shorter than five characters get none.
    def suggest_words(word)
      return [] if word.size < 5

      candidates = Set.new(@en_speller.suggestions(word))
      candidates.merge(@es_speller.suggestions(word))
      candidates.merge(
        @dict_words.select do |dict_word|
          DamerauLevenshtein.distance(word, dict_word) < word.size
        end
      )

      ranked = candidates.sort_by do |candidate|
        DamerauLevenshtein.distance(candidate, word)
      end
      ranked.first(@suggestion_count)
    end

    # Turns a smudged "KEY!!!<n>" back into the original "KEY:<n>".
    def unsmudge_key(key)
      key.to_s.gsub(/!!!([0-9]+)$/, ':\1')
    end
  end
end; end # Clausewitz::Spelling