module Linguist
  # A collection of simple heuristics that can be used to better analyze languages.
  #
  # Each heuristic is registered with `disambiguate` and is keyed either by
  # file extensions (strings starting with ".") or by a set of language names.
  # `Heuristics.call` runs the first registered heuristic that matches.
  #
  # FIXME: parts of this file were damaged before it reached review (text
  # between a "<" and a later ">" was stripped). The affected spans are kept
  # verbatim in FIXME comments below and must be restored from a clean copy.
  class Heuristics
    # Upper bound on how much of the blob's data is handed to a heuristic.
    # NOTE(review): applied with String#[] in `call`, which indexes characters
    # when data is a String, so this is not strictly a byte count - confirm.
    HEURISTICS_CONSIDER_BYTES = 50 * 1024

    # Public: Use heuristics to detect language of the blob.
    #
    # blob       - An object that quacks like a blob (responds to `name` and
    #              `data`).
    # candidates - Array of Language objects (nil entries are ignored).
    #
    # Examples
    #
    #   Heuristics.call(FileBlob.new("path/to/file"), [
    #     Language["Ruby"], Language["Python"]
    #   ])
    #
    # Returns an Array of languages, or empty if none matched or were inconclusive.
    def self.call(blob, candidates)
      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]

      # Only the first matching heuristic is consulted; if it is inconclusive
      # (returns nil) the result is an empty Array, not a fall-through.
      heuristic = @heuristics.find { |h| h.matches?(blob.name, candidates) }
      heuristic ? Array(heuristic.call(data)) : []
    end

    # Internal: Define a new heuristic.
    #
    # exts_and_langs - String names of file extensions and languages to
    #                  disambiguate.
    # heuristic      - Block which takes data as an argument and returns a
    #                  Language or nil.
    #
    # Examples
    #
    #   disambiguate ".pm" do |data|
    #     if data.include?("use strict")
    #       Language["Perl"]
    #     elsif /^[^#]+:-/.match(data)
    #       Language["Prolog"]
    #     end
    #   end
    #
    def self.disambiguate(*exts_and_langs, &heuristic)
      @heuristics << new(exts_and_langs, &heuristic)
    end

    # Internal: Array of defined heuristics, in registration order.
    @heuristics = []

    # Internal: Build a heuristic.
    #
    # exts_and_langs - Array of Strings; entries starting with "." are treated
    #                  as file extensions, everything else as language names.
    # heuristic      - Block called with the data to inspect.
    def initialize(exts_and_langs, &heuristic)
      # After the partition, @exts_and_langs holds only the extensions and
      # @candidates holds only the language names.
      @exts_and_langs, @candidates = exts_and_langs.partition { |e| e =~ /\A\./ }
      @heuristic = heuristic
    end

    # Internal: Check if this heuristic matches the candidate filenames or
    # languages.
    #
    # filename   - String file name; compared case-insensitively by downcasing.
    # candidates - Array of objects responding to `name` (nils are dropped).
    #
    # Returns true when the file name ends with one of the registered
    # extensions, or when the candidate names are non-empty and contain exactly
    # the registered language names (order-insensitive).
    def matches?(filename, candidates)
      filename = filename.downcase
      candidates = candidates.compact.map(&:name)

      @exts_and_langs.any? { |ext| filename.end_with?(ext) } ||
        (candidates.any? &&
         (@candidates - candidates).empty? &&
         (candidates - @candidates).empty?)
    end

    # Internal: Perform the heuristic
    #
    # data - The content to inspect.
    #
    # Returns whatever the registered block returns (a Language or nil).
    def call(data)
      @heuristic.call(data)
    end

    # Common heuristics

    # The Objective-C directive is spelled "@synchronized"; the "s" spelling is
    # still accepted so existing matches are unaffected.
    ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchroni[sz]ed|selector|implementation)\b|#import\s+.+\.h[">])/

    # FIXME: only the first alternative of this union survived intact; the
    # remaining alternatives and the heuristics that followed were lost. The
    # damaged text that directly followed the first alternative is preserved
    # verbatim on the next comment line - restore from a clean copy:
    #   /^\s*template\s*)/.match(data) Language["Erlang"] elsif /(?:\/\/|("|')use strict\1|export\s+default\s|\/\*.*?\*\/)/m.match(data) Language["JavaScript"] end end
    CPlusPlusRegex = Regexp.union(
      /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/
    )

    # NOTE(review): whitespace in this file was collapsed at some point, so the
    # single literal space before "(subroutine|..." may originally have been a
    # longer run of spaces - confirm against a clean copy.
    fortran_rx = /^([c*][^abd-z]| (subroutine|program|end|data)\s|\s*!)/i

    disambiguate ".f" do |data|
      if /^: /.match(data)
        Language["Forth"]
      elsif data.include?("flowop")
        Language["Filebench WML"]
      elsif fortran_rx.match(data)
        Language["Fortran"]
      end
    end

    disambiguate ".for" do |data|
      if /^: /.match(data)
        Language["Forth"]
      elsif fortran_rx.match(data)
        Language["Fortran"]
      end
    end

    disambiguate ".fr" do |data|
      if /^(: |also |new-device|previous )/.match(data)
        Language["Forth"]
      elsif /^\s*(import|module|package|data|type) /.match(data)
        Language["Frege"]
      else
        Language["Text"]
      end
    end

    disambiguate ".fs" do |data|
      if /^(: |new-device)/.match(data)
        Language["Forth"]
      elsif /^\s*(#light|import|let|module|namespace|open|type)/.match(data)
        Language["F#"]
      elsif /^\s*(#version|precision|uniform|varying|vec[234])/.match(data)
        Language["GLSL"]
      elsif /#include|#pragma\s+(rs|version)|__attribute__/.match(data)
        Language["Filterscript"]
      end
    end

    disambiguate ".gs" do |data|
      Language["Gosu"] if /^uses java\./.match(data)
    end

    disambiguate ".h" do |data|
      if ObjectiveCRegex.match(data)
        Language["Objective-C"]
      elsif CPlusPlusRegex.match(data)
        Language["C++"]
      end
    end

    disambiguate ".inc" do |data|
      if /^<\?(?:php)?/.match(data)
        Language["PHP"]
      elsif /^\s*#(declare|local|macro|while)\s/.match(data)
        Language["POV-Ray SDL"]
      end
    end

    disambiguate ".l" do |data|
      if /\(def(un|macro)\s/.match(data)
        Language["Common Lisp"]
      elsif /^(%[%{}]xs|<.*>)/.match(data)
        Language["Lex"]
      elsif /^\.[a-z][a-z](\s|$)/i.match(data)
        Language["Roff"]
      elsif /^\((de|class|rel|code|data|must)\s/.match(data)
        Language["PicoLisp"]
      end
    end

    disambiguate ".ls" do |data|
      if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
        Language["LoomScript"]
      else
        Language["LiveScript"]
      end
    end

    disambiguate ".lsp", ".lisp" do |data|
      if /^\s*\((defun|in-package|defpackage) /i.match(data)
        Language["Common Lisp"]
      elsif /^\s*\(define /.match(data)
        Language["NewLisp"]
      end
    end

    disambiguate ".m" do |data|
      if ObjectiveCRegex.match(data)
        Language["Objective-C"]
      elsif data.include?(":- module")
        Language["Mercury"]
      elsif /^: /.match(data)
        Language["MUF"]
      elsif /^\s*;/.match(data)
        Language["M"]
      elsif /\*\)$/.match(data)
        Language["Mathematica"]
      elsif /^\s*%/.match(data)
        Language["Matlab"]
      elsif /^\w+\s*:\s*module\s*{/.match(data)
        Language["Limbo"]
      end
    end

    disambiguate ".md" do |data|
      # Markdown is both the first check and the fallback; the GCC check only
      # applies to non-empty data that the first pattern does not match.
      if /(^[-a-z0-9=#!\*\[|>])|<\//i.match(data) || data.empty?
        Language["Markdown"]
      elsif /^(;;|\(define_)/.match(data)
        Language["GCC Machine Description"]
      else
        Language["Markdown"]
      end
    end

    disambiguate ".ml" do |data|
      if /(^\s*module)|let rec |match\s+(\S+\s)+with/.match(data)
        Language["OCaml"]
      elsif /=> |case\s+(\S+\s)+of/.match(data)
        Language["Standard ML"]
      end
    end

    # FIXME: the ".mod" heuristic is damaged - its opening is fused with the
    # tail of a later heuristic and everything in between is missing. Preserved
    # verbatim; restore from a clean copy:
    #   disambiguate ".mod" do |data| if data.include?(')\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data) Language["SubRip Text"] end end

    disambiguate ".t" do |data|
      if /^\s*%[ \t]+|^\s*var\s+\w+\s*:=\s*\w+/.match(data)
        Language["Turing"]
      elsif /^\s*(?:use\s+v6\s*;|\bmodule\b|\b(?:my\s+)?class\b)/.match(data)
        Language["Perl 6"]
      elsif /\buse\s+(?:strict\b|v?5\.)/.match(data)
        Language["Perl"]
      end
    end

    disambiguate ".toc" do |data|
      if /^## |@no-lib-strip@/.match(data)
        Language["World of Warcraft Addon Data"]
      elsif /^\\(contentsline|defcounter|beamer|boolfalse)/.match(data)
        Language["TeX"]
      end
    end

    # FIXME: the ".ts" heuristic is damaged - its opening is fused with the
    # tail of a later heuristic. Preserved verbatim; restore from a clean copy:
    #   disambiguate ".ts" do |data| if data.include?(" ")) Language["GAP"] # Heads up - we don't usually write heuristics like this (with no regex match) else Language["Scilab"] end end

    # FIXME: the ".tsx" heuristic is cut off mid-pattern and nothing after it
    # is available. Preserved verbatim; restore from a clean copy:
    #   disambiguate ".tsx" do |data| if /^\s*(import.+(from\s+|require\()['"]react|\/\/\/\s*
  end
end