lib/linguist/heuristics.rb in github-linguist-5.3.1 vs lib/linguist/heuristics.rb in github-linguist-5.3.2
- old
+ new
@@ -1,8 +1,10 @@
module Linguist
# A collection of simple heuristics that can be used to better analyze languages.
class Heuristics
+ HEURISTICS_CONSIDER_BYTES = 50 * 1024
+
# Public: Use heuristics to detect language of the blob.
#
# blob - An object that quacks like a blob.
# candidates - Array of Language objects
#
@@ -12,11 +14,11 @@
# Language["Ruby"], Language["Python"]
# ])
#
# Returns an Array of languages, or empty if none matched or were inconclusive.
def self.call(blob, candidates)
- data = blob.data
+ data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
@heuristics.each do |heuristic|
if heuristic.matches?(blob.name, candidates)
return Array(heuristic.call(data))
end
@@ -70,10 +72,18 @@
@heuristic.call(data)
end
# Common heuristics
ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
+ CPlusPlusRegex = Regexp.union(
+ /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
+ /^\s*template\s*</,
+ /^[ \t]*try/,
+ /^[ \t]*catch\s*\(/,
+ /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
+ /^[ \t]*(private|public|protected):$/,
+ /std::\w+/)
disambiguate ".as" do |data|
if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
Language["ActionScript"]
else
@@ -217,12 +227,11 @@
end
disambiguate ".h" do |data|
if ObjectiveCRegex.match(data)
Language["Objective-C"]
- elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
- /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
+ elsif CPlusPlusRegex.match(data)
Language["C++"]
end
end
disambiguate ".inc" do |data|
@@ -356,13 +365,13 @@
Language["Perl 6"]
end
end
disambiguate ".pm" do |data|
- if /^\s*(?:use\s+v6\s*;|(?:\bmy\s+)?class|module)\b/.match(data)
- Language["Perl 6"]
- elsif /\buse\s+(?:strict\b|v?5\.)/.match(data)
+ if /\buse\s+(?:strict\b|v?5\.)/.match(data)
Language["Perl"]
+ elsif /^\s*(?:use\s+v6\s*;|(?:\bmy\s+)?class|module)\b/.match(data)
+ Language["Perl 6"]
elsif /^\s*\/\* XPM \*\//.match(data)
Language["XPM"]
end
end