lib/file_signature.rb in file_signature-1.1.1 vs lib/file_signature.rb in file_signature-1.2.0
- old
+ new
@@ -3,122 +3,343 @@
Please see README
=end
class IO
- # We implement magic by using a lookup hash.
- # The key is a string that encodes the first bits.
+ # Most signatures are implemented using a lookup hash.
+ # The key is a string consisting of the first several bytes.
# The value is a symbol that indicates the magic type.
+ # If none of these match, we look for more complicated matches
+ # below.
#
# Examples:
- # IO::MagicNumberType("BM") => :bitmap
- # IO::MagicNumberType("GIF8") => :gif
- # IO::MagicNumberType("\xa6\x00") => :pgp_encrypted_data
+ # IO::MagicNumberTypeMap("BM") => :bitmap
+ # IO::MagicNumberTypeMap("GIF8") => :gif
+ # IO::MagicNumberTypeMap("\xa6\x00") => :pgp_encrypted_data
#
- # Quirks:
- # - JPEG adjustment:
- # - Some cameras put JPEG Exif data in bytes 3 & 4,
- # so we only check the first two bytes of a JPEG.
- # - TIFF has two possible matches:
- # - MM** is Motorola big endian
- # - II** is Intel little ending
- #
# See:
# - IO#magic_number_type
# - File.magic_number_type
+ # - IO#mime_type
+ # - File.mime_type
SignatureMap = {
"BC" => :bitcode,
+ [0xDE,0xC0,0x17,0x0B].pack('c*') => :bitcode,
"BM" => :bitmap,
- "BZ" => :bzip,
+ "BZh" => :bzip2,
"MZ" => :exe,
"SIMPLE"=> :fits,
- "GIF8" => :gif,
+ "GIF87a" => :gif,
+ "GIF89a" => :gif,
"GKSM" => :gks,
[0x01,0xDA].pack('c*') => :iris_rgb,
- [0xF1,0x00,0x40,0xBB].pack('c*') => :itc,
- [0xFF,0xD8].pack('c*') => :jpeg,
+ [0xFF,0xD8,0xFF].pack('c*') => :jpeg,
+ [0x00,0x00,0x00,0x0C,0x6A,0x50,0x20,0x20,0x0D,0x0A].pack('c*') => :jpeg2000,
"IIN1" => :niff,
"MThd" => :midi,
"%PDF" => :pdf,
- "VIEW" => :pm,
- [0x89].pack('c*') + "PNG" => :png,
- "%!" => :postscript,
+ "{\\rtf" => :rtf,
+ [0x89].pack('c*') + "PNG" + [0x0D,0x0A,0x1A,0x0A].pack('c*') => :png,
+ "%!PS-Adobe-" => :postscript,
"Y" + [0xA6].pack('c*') + "j" + [0x95].pack('c*') => :sun_rasterfile,
- "MM*" + [0x00].pack('c*') => :tiff,
+ "MM" + [0x00,0x2A].pack('c*') => :tiff,
+ "MM" + [0x00,0x2B].pack('c*') => :tiff,
"II*" + [0x00].pack('c*') => :tiff,
+ "II+" + [0x00].pack('c*') => :tiff,
"gimp xcf" => :gimp_xcf,
"#FIG" => :xfig,
"/* XPM */" => :xpm,
- [0x23,0x21].pack('c*') => :shebang,
+ "#!" => :shebang,
[0x1F,0x9D].pack('c*') => :compress,
- [0x1F,0x8B].pack('c*') => :gzip,
- "PK" + [0x03,0x04].pack('c*') => :pkzip,
- "MZ" => :dos_os2_windows_executable,
- ".ELF" => :unix_elf,
+ [0x1F,0x8B,0x08].pack('c*') => :gzip,
+ "7z" + [0xBC,0xAF,0x27,0x1C].pack('c*') => :p7zip,
+ "Rar!" + [0x1A,0x07,0x00].pack('c*') => :rar,
+ [0x1A,0x45,0xDF,0xA3].pack('c*') => :webm,
+ [0x4F,0x67,0x67,0x53,0x00].pack('c*') => :ogg,
+ "fLaC" + [0x00,0x00,0x00,0x22].pack('c*') => :flac,
+ [0x00,0x00,0x01,0x00].pack('c*') => :ico,
+ [0x49,0x44,0x33].pack('c*') => :mp3,
+ "#EXTM3U" => :m3u8,
+ [0x7F].pack('c*') + "ELF" => :unix_elf,
[0x99,0x00].pack('c*') => :pgp_public_ring,
- [0x95,0x01].pack('c*') => :pgp_security_ring,
- [0x95,0x00].pack('c*') => :pgp_security_ring,
+ [0x99,0x01].pack('c*') => :gnupg_public_ring,
+ [0x95,0x01].pack('c*') => :pgp_secret_ring,
+ [0x95,0x00].pack('c*') => :pgp_secret_ring,
[0xA6,0x00].pack('c*') => :pgp_encrypted_data,
- [0xD0,0xCF,0x11,0xE0].pack('c*') => :docfile
+ [0x85,0x01].pack('c*') => :pgp_encrypted_data,
+ [0x85,0x02].pack('c*') => :pgp_encrypted_data,
}
- SignatureSize = SignatureMap.keys.inject(0){ |m,k| k.length > m ? k.length : m }
+ MimeTypeMap = {
+ :compress => 'application/x-compress',
+ :gzip => 'application/x-gzip',
+ :pkzip => 'application/zip',
+ :p7zip => 'application/x-7z-compressed',
+ :ppt => 'application/vnd.ms-powerpoint',
+ :pptx => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ :xls => 'application/vnd.ms-excel',
+ :xlsx => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ :tar => 'application/x-tar',
+ :rar => 'application/x-rar-compressed',
+ :webm => 'video/webm',
+ :avi => 'video/x-msvideo',
+ :ogg => 'application/ogg',
+ :ico => 'image/vnd.microsoft.icon',
+ :mp3 => 'audio/mpeg',
+ :mp4 => 'video/mp4',
+ :video_3gpp => 'video/3gpp',
+ :video_3gpp2 => 'video/3gpp2',
+ :quicktime => 'video/quicktime',
+ :m4v => 'video/x-m4v',
+ :m4a => 'audio/mp4a-latm',
+ :aiff => 'audio/x-aiff',
+ :flac => 'audio/flac',
+ :niff => 'image/x-niff',
+ :midi => 'audio/midi',
+ :fits => 'image/fits',
+ :gimp_xcf => 'image/xcf',
+ :unix_elf => 'application/octet-stream',
+ :bitcode => 'application/octet-stream',
+ :gks => 'application/octet-stream',
+ :iris_rgb => 'application/octet-stream',
+ :pgp_encrypted_data => 'application/octet-stream',
+ :pgp_secret_ring => 'application/x-pgp-keyring',
+ :pgp_public_ring => 'application/x-pgp-keyring',
+ :gnupg_public_ring => 'application/x-pgp-keyring',
+ :docfile => 'application/msword',
+ :docx => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ :xfig => 'application/x-fig',
+ :xpm => 'image/x-xpixmap',
+ :shebang => 'text/plain',
+ :bitmap => 'image/bmp',
+ :png => 'image/png',
+ :gif => 'image/gif',
+ :jpeg => 'image/jpeg',
+ :jpeg2000 => 'image/jp2',
+ :sun_rasterfile => 'image/x-cmu-raster',
+ :postscript => 'application/postscript',
+ :pdf => 'application/pdf',
+ :rtf => 'text/rtf',
+ :jar => 'application/java-archive',
+ :tiff => 'image/tiff',
+ :bzip2 => 'application/x-bzip2',
+ :exe => 'application/x-msdownload',
+ :wave => 'audio/wave',
+ :webp => 'image/webp',
+ :m3u8 => 'application/vnd.apple.mpegURL'
+ }
+ SignatureSize = SignatureMap.keys.inject(16){ |m,k| k.length > m ? k.length : m }
# Detect the data type by checking various "magic number" conventions
# for the introductory bytes of a data stream
#
# Return the "magic number" as a symbol:
# - :bitmap = Bitmap image file, typical extension ".bmp"
# - :gzip = Unix GZIP compressed data, typical extension ".gz"
# - :postscript = Postscript pages, typical extension ".ps"
#
- # Return nil if there's no match for any known magic number.
+ # Return nil if there's no match for a known magic number.
#
# Example:
# f = File.open("test.ps","rb")
- # put f.magic_number(s)
+ # f.magic_number_type
# => :postscript
#
# See:
- # - IO::MagicNumberTypeHash
+ # - IO::MagicNumberTypeMap
+ # - IO::MimeTypeMap
# - File.magic_number_type
+ # - IO#mime_type
+ # - File.mime_type
def magic_number_type
+ #--
return @magic_number_memo if defined? @magic_number_memo
bytes = ""
+ bytes.force_encoding("ASCII-8BIT") if bytes.respond_to?(:force_encoding)
type = nil
- while bytes.size < SignatureSize
- bytes += read(1)
+ read(SignatureSize).each_byte do |b|
+ bytes << b
type = SignatureMap[bytes]
- break if type
+ return @magic_number_memo = type if type
end
+ # some cases require a more complicated match
+ case bytes[0,4]
+ when 'FORM'
+ type = :aiff if bytes[8,3] == 'AIF'
+ when 'RIFF'
+ case bytes[8,8]
+ when 'WAVEfmt '
+ type = :wave
+ when 'AVI LIST'
+ type = :avi
+ when /WEBPVP/
+ type = :webp
+ end
+ when "PK\003\004"
+ # What looks like a zip archive could contain various things
+ seek(30,IO::SEEK_SET)
+ # Look at the filename of the first file
+ case read(19)
+ when /META-INF\/PK/
+ type = :jar
+ when '[Content_Types].xml'
+ # This is a .docx, .pptx, or .xlsx file
+ # To figure out which, is grotty
+ curr_bytes = ['', '', '', '']
+ # skip to the 3rd file in the zip archive
+ 2.times do
+ while (c = getc) do
+ curr_bytes.push(c).shift
+ break if curr_bytes.join == "PK\003\004"
+ end
+ end
+ # and look at its filename
+ seek(26, IO::SEEK_CUR)
+ case read(5)
+ when "word/"
+ type = :docx
+ when /ppt\//
+ type = :pptx
+ when /xl\//
+ type = :xlsx
+ end
+ else
+ type = :pkzip
+ end
+ when [0xD0,0xCF,0x11,0xE0].pack('c*')
+ if bytes[4,4] == [0xA1,0xB1,0x1A,0xE1].pack('c*')
+ # MS Office documents have further magic bytes @ 512 byte-offset
+ seek(512,IO::SEEK_SET)
+ more_bytes = read(16)
+ case more_bytes[0,4]
+ when "\011\010\020\000"
+ type = :xls
+ when [0x60,0x21,0x1B,0xF0].pack('c*'),
+ [0x00,0x6E,0x1E,0xF0].pack('c*')
+ type = :ppt
+ when [0xEC,0xA5,0xC1,0x00].pack('c*')
+ type = :docfile
+ when [0xFD,0xFF,0xFF,0xFF].pack('c*')
+ case more_bytes[12,4]
+ when [0x04,0x00,0x00,0x00].pack('c*')
+ type = :xls
+ when [0x2D,0x00,0x00,0x00].pack('c*')
+ type = :docfile
+ end
+ end
+ end
+ end
+ return (@magic_number_memo = type) if type
+
+ case bytes[4,4]
+ when 'moov'
+ type = :quicktime
+ when 'ftyp'
+ case bytes[8,3]
+ when 'iso', 'mp4', 'avc'
+ type = :mp4
+ when '3ge', '3gg', '3gp'
+ type = :video_3gpp
+ when '3g2'
+ type = :video_3gpp2
+ when 'M4A'
+ type = :m4a
+ when 'M4V'
+ type = :m4v
+ when 'qt '
+ type = :quicktime
+ end
+ end
+ return @magic_number_memo = type if type
+
+ #TAR files have magic bytes @ 257 byte-offset
+ seek(257,IO::SEEK_SET)
+ bytes = read(8)
+ type = :tar if (bytes[0,6] == "ustar\000" || bytes[0,8] == "ustar\040\040\000")
+
@magic_number_memo = type
end
+ #++
+ # Return the MIME type of the IO stream
+ # It's obtained by first finding the magic_number,
+ # and then looking up the MIME type from a hash.
+ # Returns 'application/octet-stream' for unknown types
+ #
+ # Example:
+ # f = File.open("test.ps","rb")
+ # f.mime_type
+ # => "application/postscript"
+ #
+ # See:
+ # - IO::MagicNumberTypeMap
+ # - IO::MimeTypeMap
+ # - IO#magic_number_type
+ # - File.magic_number_type
+ # - File.mime_type
+
+ def mime_type
+ return @mime_memo if defined? @mime_memo
+ type = self.magic_number_type
+ if type
+ m = MimeTypeMap[type]
+ else
+ m = 'application/octet-stream'
+ end
+ @mime_memo = m
+ end
+
end
class File
# Detect the file's data type by opening the file then
- # using IO#magic_number_type to read the first bits.
+ # using IO#magic_number_type to read the first few bytes.
#
# Return a magic number type symbol, e.g. :bitmap, :jpg, etc.
+ # Returns nil if the data type is unknown
#
# Example:
- # puts File.magic_number_type("test.ps") => :postscript
+ # File.magic_number_type("test.ps")
+ # => :postscript
#
# See
- # - IO#MagicNumberTypeHash
- # - IO#magic_number_type
+ # - IO::MagicNumberTypeMap
+ # - IO::MimeTypeMap
+ # - IO#magic_number_type
+ # - IO#mime_type
+ # - File.mime_type
def self.magic_number_type(file_name)
File.open(file_name,"rb"){|f| f.magic_number_type }
+ end
+
+ # Detect the file's data type by opening the file then
+ # using IO#magic_number_type to read the first few bytes.
+ #
+ # Return the MIME type of the file.
+ # Returns 'application/octet-stream' for unknown types.
+ #
+ # Example:
+ # File.mime_type("test.ps")
+ # => "application/postscript"
+ #
+ # See
+ # - IO::MagicNumberTypeMap
+ # - IO::MimeTypeMap
+ # - IO#magic_number_type
+ # - IO#mime_type
+ # - File.magic_number_type
+
+ def self.mime_type(file_name)
+ File.open(file_name,"rb"){|f| f.mime_type }
end
end