require 'ffi/hunspell'
require 'open3'
require 'pathname'
require 'set'
require 'tmpdir'
require 'yaml'

require 'damerau-levenshtein'
require 'pragmatic_tokenizer'

require 'clausewitz/localisation'
require 'clausewitz/spelling/results'

module Clausewitz
  module Spelling
    # Spell-checks parsed localisation files with Hunspell.
    #
    # One Hunspell dictionary is loaded per entry of Localisation::LANG_MAP
    # when the checker is constructed; optional custom dictionaries are added
    # on top of each of them.
    class Checker
      DEFAULT_SUGGESTION_COUNT = 3

      # Scripted localisation such as "[Root.GetName]". The match is lazy so
      # that ordinary text sitting between two scripted segments is kept and
      # still gets spell-checked.
      # TODO: Look into supporting escaped square brackets as part of the
      #       string.
      SCRIPTED_LOCALISATION = /\[.+?\]/

      # "$...$" tokens made of capitals, digits, '|' and '='.
      DOLLAR_TOKEN = /\$([A-Z]|\||\d|=)+\$/

      # "§X" formatting codes.
      FORMATTING_CODE = /§(%|\*|=|\d|W|G|R|B|Y|b|M|g|T|l|H|\+|-|!)/

      # @param opts [Hash] supported keys:
      #   :custom_dict_root  directory holding the custom ".dic" files
      #   :custom_dicts      names of custom dictionaries to add (default [])
      #   :dialect_map       language name => dialect to select (default {})
      #   :suggestion_count  maximum suggestions reported per misspelled word
      # @raise [ArgumentError] if custom dictionaries are requested without a
      #   :custom_dict_root to look them up in
      def initialize(opts = {})
        @custom_dict_root = opts[:custom_dict_root]
        @custom_dict_root = Pathname.new(@custom_dict_root) if @custom_dict_root
        @custom_dicts     = opts[:custom_dicts] || []
        @dialect_map      = opts[:dialect_map] || {}
        @suggestion_count = opts[:suggestion_count] || DEFAULT_SUGGESTION_COUNT

        # Without a root the dictionary paths cannot be built; fail with a
        # clear message instead of a NoMethodError on nil further down.
        if @custom_dict_root.nil? && !@custom_dicts.empty?
          raise ArgumentError, 'custom_dicts given without a custom_dict_root'
        end

        load_dictionaries!
      end

      # (Re)builds @loaded_dicts: one Hunspell dictionary per configured
      # language, keyed by the language config's name. A custom dictionary
      # file that does not exist is reported on stderr and skipped.
      def load_dictionaries!
        @loaded_dicts = {}
        Localisation::LANG_MAP.each do |_, config|
          if @dialect_map.key?(config.name)
            config.select_dialect(@dialect_map[config.name])
          end

          dict = FFI::Hunspell.dict(config.full_name)
          @custom_dicts.each do |custom_dict|
            path = @custom_dict_root.join("#{config.full_name}_#{custom_dict}.dic")
            if path.exist?
              dict.add_dic(path.to_s)
            else
              $stderr.puts("Could not load dictionary '#{path}', skipping...")
            end
          end

          @loaded_dicts[config.name] = dict
        end
      end

      # Spell-checks a single localisation file.
      #
      # @param filepath [String, Pathname] file to check
      # @return [InvalidFilepathResult] if the path is missing or unreadable
      # @return [UnparseableFileResult] if the file cannot be parsed
      # @return [FileResults] per-language results otherwise
      def check_file(filepath)
        begin
          filepath = Pathname.new(filepath)
          validate_filepath!(filepath)
        rescue => e
          return InvalidFilepathResult.new(filepath, e)
        end

        # NOTE(review): this only prints a message; the directory is still
        # handed to the parser below. Confirm whether an early return with a
        # dedicated result type is wanted here.
        $stderr.puts "Skipping directory '#{filepath}'..." if filepath.directory?

        begin
          contents = Clausewitz::Localisation.parse_file(filepath)
        rescue => e
          return UnparseableFileResult.new(filepath, e)
        end

        checks = contents.map do |lang_name, entries|
          check_entries(entries, language_config(lang_name))
        end
        FileResults.new(filepath, checks)
      end

      private

      # Checks every entry of one language section.
      #
      # The optional 'spellcheck_ignore' entry is removed from +entries+ and
      # read as a comma-separated list of keys to skip; the value 'all' skips
      # the whole language.
      def check_entries(entries, lc)
        spellcheck_ignore = entries&.delete('spellcheck_ignore')
        # Strip each key so that "a, b" ignores both 'a' and 'b'.
        ignored_keys =
          spellcheck_ignore ? spellcheck_ignore.split(',').map(&:strip) : []
        ignored_keys << 'spellcheck_ignore'

        return IgnoredLangResult.new(lc.clausewitz_name) if ignored_keys.include?('all')
        return LangResults.new(lc.clausewitz_name, []) unless entries

        checks = entries.map do |key, entry|
          if ignored_keys.include?(key)
            IgnoredEntryResult.new(key)
          else
            check_entry(key, entry, lc)
          end
        end
        LangResults.new(lc.clausewitz_name, checks)
      end

      # Spell-checks the text of a single entry.
      #
      # @return [NullEntryResult] when the entry has no value
      # @return [EntryResults] holding one result per misspelled word
      def check_entry(key, entry, lc)
        return NullEntryResult.new(key) unless entry

        words  = tokenize(strip_markup(entry), lc)
        checks = words.map { |word| check_word(word, lc) }.compact
        EntryResults.new(key, checks)
      end

      # Returns a copy of +entry+ without scripted localisation, "$...$"
      # tokens and "§" formatting codes. The caller's string is left
      # untouched, so frozen strings are fine and parsed data is not altered.
      def strip_markup(entry)
        entry
          .gsub(SCRIPTED_LOCALISATION, '')
          .gsub(DOLLAR_TOKEN, '')
          .gsub(FORMATTING_CODE, '')
      end

      # Splits +text+ into the words to be checked.
      def tokenize(text, lc)
        opts = {
          language:    lc.base.to_sym,
          punctuation: :none,
          downcase:    false
        }
        words = PragmaticTokenizer::Tokenizer.new(opts).tokenize(text)

        # Em dashes join separate words, so split on them as well.
        words = words.flat_map { |word| word.split('—') }

        # A word whose only full stop is the trailing one most likely ends a
        # sentence, so that full stop is dropped. Words containing further
        # full stops (e.g. "U.S.A.") are probably acronyms or initialisms and
        # are left as they are.
        words.map do |word|
          if word =~ /[[:alpha:]]\.$/ && word.count('.') == 1
            word.sub(/\.$/, '')
          else
            word
          end
        end
      end

      # Checks one word against the dictionary of +lc+.
      #
      # @return [MisspelledWordResult, nil] nil when the word is skipped or
      #   spelled correctly
      def check_word(word, lc)
        return if skippable?(word)

        lang_dict = @loaded_dicts[lc.name]
        return if lang_dict.check?(word)

        suggestions = lang_dict.suggest(word).take(@suggestion_count)
        MisspelledWordResult.new(word, suggestions)
      end

      # True for tokens that are never sent to the dictionary.
      def skippable?(word)
        is_number?(word) ||
          is_plural_number?(word) ||
          is_ordinal?(word) ||
          is_percentage?(word) ||
          is_icon?(word) ||
          is_initial?(word) ||
          is_psalm?(word)
      end

      # Digits followed by "'s" anywhere in the word (the pattern is not
      # anchored).
      def is_plural_number?(word)
        word =~ /\d+'s/
      end

      # "<digits>:<digits>", e.g. "23:4".
      def is_psalm?(word)
        word =~ /^\d+:\d+$/
      end

      # A single capital letter followed by a full stop, e.g. "J.".
      def is_initial?(word)
        word =~ /^[A-Z]\.$/
      end

      # A word starting with "£" followed by word characters.
      def is_icon?(word)
        word =~ /^£\w+/
      end

      # True when Kernel#Float accepts the word.
      def is_number?(word)
        Float(word)
        true
      rescue ArgumentError, TypeError
        false
      end

      # Digits followed by an ordinal suffix anywhere in the word (the
      # pattern is not anchored).
      def is_ordinal?(word)
        word =~ /[0-9]+(th|st|nd|rd)/
      end

      # An optionally signed number with a "%" before or after it, anywhere
      # in the word (the patterns are not anchored).
      def is_percentage?(word)
        word =~ /(-|\+)?[0-9]+(\.[0-9]+)?%/ ||
          word =~ /%(-|\+)?[0-9]+(\.[0-9]+)?/
      end

      # Looks up the language configuration for +language_name+, adding the
      # "l_" prefix when it is missing.
      #
      # @raise [RuntimeError] if the language is not in Localisation::LANG_MAP
      def language_config(language_name)
        language_name = "l_#{language_name}" if language_name !~ /^l_/
        lang_config = Localisation::LANG_MAP.find do |config_key, _|
          language_name == config_key
        end
        raise "Unknown language '#{language_name}'!" unless lang_config

        lang_config.last
      end

      # Make sure a file to be checked is actually present and readable.
      #
      # @raise [RuntimeError] if the path does not exist or cannot be read
      def validate_filepath!(filepath)
        raise "No such file '#{filepath}'!" unless filepath.exist?
        raise "Cannot read '#{filepath}'!" unless filepath.readable?
      end
    end
  end
end