require 'linguist/tokenizer'

module Linguist
  # Language bayesian classifier.
  #
  # Scores are sums of natural-log probabilities, so they are typically
  # negative; a higher (less negative) score means a more likely language.
  class Classifier
    # Upper bound on how much of a blob's data .call hands to the classifier.
    # NOTE(review): .call applies this with `data[0...N]`; if blob.data is a
    # String that indexes characters rather than bytes - confirm intent.
    CLASSIFIER_CONSIDER_BYTES = 50 * 1024

    # Public: Use the classifier to detect language of the blob.
    #
    # blob               - An object that quacks like a blob.
    # possible_languages - Array of Language objects
    #
    # Examples
    #
    #   Classifier.call(FileBlob.new("path/to/file"), [
    #     Language["Ruby"], Language["Python"]
    #   ])
    #
    # Returns an Array of Language objects, most probable first.
    def self.call(blob, possible_languages)
      language_names = possible_languages.map(&:name)
      data = blob.data[0...CLASSIFIER_CONSIDER_BYTES]
      classify(Samples.cache, data, language_names).map do |name, _|
        Language[name] # Return the actual Language objects
      end
    end

    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object (mutated in place)
    # language - String language of data
    # data     - String contents of file, or an Array of tokens
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = data
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      # Create the per-language entries up front so that a sample yielding no
      # tokens still leaves the language with a (possibly empty) token table,
      # keeping the db consistent with the db['languages'] entry added below.
      language_counts = (db['tokens'][language] ||= {})
      db['language_tokens'][language] ||= 0

      counts.each do |token, count|
        language_counts[token] ||= 0
        language_counts[token] += count
        db['language_tokens'][language] += count
        db['tokens_total'] += count
      end

      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (default: every language present in db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -10.2], ['Python', -25.7], ...]   (illustrative values)
    #
    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-language or
    # per-token. See also #debug_dump_all_tokens, below.
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database as built by .train!.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
      # Log-probability assigned to any token a language has never seen.
      @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, highest score first. Each pair
    # contains the String language name and a Float score. Returns an empty
    # Array when tokens is nil or languages is empty.
    def classify(tokens, languages)
      return [] if tokens.nil? || languages.empty?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
      scores = {}

      debug_dump_all_tokens(tokens, languages) if verbosity >= 2

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      languages.each do |language|
        scores[language] = tokens_probability(counts, language) + language_probability(language)
        debug_dump_probabilities(counts, language, scores[language]) if verbosity >= 1
      end

      # Hash#sort already yields [name, score] pairs; order by descending score.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log-probability of set of tokens in a language occurring -
    # log P(D | C)
    #
    # counts   - Hash of String token => Integer occurrence count.
    # language - Language to check.
    #
    # Returns the Numeric sum of count-weighted token log-probabilities
    # (0 when counts is empty).
    def tokens_probability(counts, language)
      sum = 0
      counts.each do |token, count|
        sum += count * token_probability(token, language)
      end
      sum
    end

    # Internal: Log-probability of token in language occurring - log P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float.
    def token_probability(token, language)
      # A language may have no token table at all (never trained, or trained
      # only on token-less data); treat every token as unseen in that case.
      language_counts = @tokens[language]
      count = language_counts && language_counts[token]
      if count.nil? || count == 0
        # This is usually the most common case, so we cache the result.
        @unknown_logprob
      else
        Math.log(count.to_f / @language_tokens[language].to_f)
      end
    end

    # Internal: Log-probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns Float.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: Debug verbosity read once from the LINGUIST_DEBUG environment
    # variable (0 when unset).
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: print one line with the token score, the language prior and
    # the total score for a language.
    def debug_dump_probabilities(tokens, language, score)
      printf("%10s = %10.3f + %7.3f = %10.3f\n",
             language, tokens_probability(tokens, language), language_probability(language), score)
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    def debug_dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      # Header row: the "#" heading is padded to the width of the %5d count
      # column used in the rows below so the language columns line up.
      printf "%#{maxlen}s", ""
      puts sprintf("%5s", "#") + languages.map { |lang| sprintf("%10s", lang) }.join

      token_map = Hash.new(0)
      tokens.each { |tok| token_map[tok] += 1 }

      token_map.sort.each do |tok, count|
        logprobs = languages.map { |lang| token_probability(tok, lang) }
        # Tokens that are equally likely in every language carry no
        # information, so they are left out of the table.
        next if logprobs.all? { |logprob| logprob == logprobs.first }

        min = logprobs.min
        printf "%#{maxlen}s%5d", tok, count
        puts logprobs.map { |logprob|
          logprob == min ? sprintf("%10s", "-") : sprintf("%10.3f", count * (logprob - min))
        }.join
      end
    end
  end
end