#
# disOps.py v 1.0.0
#
# Copyright (C) 2003-2012 Gil Dabah, http://ragestorm.net/distorm/
#
# disOps is a part of the diStorm project, but can be used for anything.
# The generated output is tightly coupled with diStorm data structures which can be found at instructions.h.
# The code in diStorm that actually walks these structures is found at instructions.c.
#
# Since the DB was built purposely for diStorm, there are some
# Known issues:
#   1. ARPL/MOVSXD information in DB is stored as ARPL.
#      Since ARPL and MOVSXD share the same opcode this DB doesn't support this mix.
#      Therefore, if you use this DB for x64 instructions, you have to take care of this one.
#
#   2. SSE CMP pseudo instructions have the DEFAULT suffix letters of its type in the second mnemonic,
#      the third operand, Imm8 which is responsible for determining the suffix,
#      doesn't appear in the operands list but rather an InstFlag.PSEUDO_OPCODE implies this behavior.
#
#   3. The WAIT instruction is a bit problematic from a static DB point of view, read the comments in init_FPU in x86sets.py.
#
#   4. The OpLen.OL_33, [0x66, 0x0f, 0x78, 0x0], ["EXTRQ"] is very problematic as well.
#      Since there's another 8 group table after the 0x78 byte in this case, but it's already a Prefixed table.
#      Therefore, we will handle it as a normal 0x78 instruction with a mandatory prefix of 0x66.
#      But the REG (=0) field of the ModRM byte will be checked in the decoder by a flag that states so.
#      Otherwise, another normal table after Prefixed table really complicates matters,
#      and isn't worth the hassle for one exceptional instruction.
#
#   5. The NOP (0x90) instruction is really set in the DB as xchg rAX, rAX. Rather than true NOP, this is because of x64 behavior.
#      Hence, it will be decided in runtime when decoding streams according to the mode.
#
#   6. The PAUSE (0xf3, 0x90) instruction isn't found in the DB, it will be returned directly by diStorm.
#      This is because the 0xf3 in this case is not a mandatory prefix, and we don't want it to be built as part of a prefixed table.
#
#   7. The IO String instructions don't have explicit form and they don't support segments.
#      It's up to diStorm to decide what to do with the operands and which segment is default and overridden.
#
#   8. Since opcodeId is an offset into the mnemonics table, the pseudo compare mnemonics need a helper table to fix the offset.
#      Pseudo compare instructions work in such a way that only the first instruction is defined in the DB.
#      The rest are found using the third operand (that's why they are pseudo).
#
# To maximize the usage of this DB, one should learn the documentation of diStorm regarding the InstFlag and Operands Types.
#

import time

import x86sets
import x86db
from x86header import *

# mnemonic : offset to mnemonics table of strings.
mnemonicsIds = {}
# Starts immediately after this one.
idsCounter = len("undefined") + 2

SSECmpTypes = ["EQ", "LT", "LE", "UNORD", "NEQ", "NLT", "NLE", "ORD"]
AVXCmpTypes = [
    "EQ", "LT", "LE", "UNORD", "NEQ", "NLT", "NLE", "ORD",
    "EQ_UQ", "NGE", "NGT", "FALSE", "NEQ_OQ", "GE", "GT", "TRUE",
    "EQ_OS", "LT_OQ", "LE_OQ", "UNORD_S", "NEQ_US", "NLT_UQ", "NLE_UQ", "ORD_S",
    "EQ_US", "NGE_UQ", "NGT_UQ", "FALSE_OS", "NEQ_OS", "GE_OQ", "GT_OQ", "TRUE_US"
]

# The most significant bit of an opcodeId marks a privileged instruction
# (see UpdatePrivilegedInstruction), so real offsets must stay below it.
_PRIVILEGED_OPCODE_BIT = 0x8000


# Support SSE pseudo compare instructions. We will have to add them manually.
def FixPseudo(mnems):
    """ Expand [prefix, suffix] into one mnemonic per SSE compare type. """
    return [mnems[0] + i + mnems[1] for i in SSECmpTypes]


# Support AVX pseudo compare instructions. We will have to add them manually.
def FixPseudo2(mnems):
    """ Expand [prefix, suffix] into one mnemonic per AVX compare type. """
    return [mnems[0] + i + mnems[1] for i in AVXCmpTypes]


def TranslateMnemonics(pseudoClassType, mnems):
    """
    Translate a list of mnemonics to their opcodeIds (offsets into the mnemonics table).
    New mnemonics are registered in the global mnemonicsIds table on first use.
    pseudoClassType is the class type of a pseudo compare instruction (SSE/SSE2/AVX),
    in which case mnems is expanded to all its compare variations first; anything else leaves mnems as is.
    Returns a list of ids, where an empty mnemonic is translated to 0 (undefined).
    Raises ValueError if the mnemonics table grows beyond what an opcodeId can address.
    """
    global idsCounter
    l = []
    if pseudoClassType == ISetClass.SSE or pseudoClassType == ISetClass.SSE2:
        mnems = FixPseudo(mnems)
    elif pseudoClassType == ISetClass.AVX:
        mnems = FixPseudo2(mnems)
    for i in mnems:
        if len(i) == 0:
            # Some mnemonics are empty on purpose because they're not used.
            # Set them to zero to keep the order of the list.
            l.append(0)  # Undefined instruction.
            continue
        if i in mnemonicsIds:
            l.append(mnemonicsIds[i])
        else:
            mnemonicsIds[i] = idsCounter
            l.append(idsCounter)
            idsCounter += len(i) + 2  # For len/null chars.
            # Bit 15 of the uint16_t opcodeId is reserved for the privileged marker,
            # hence an offset must fit in the low 15 bits.
            if idsCounter >= _PRIVILEGED_OPCODE_BIT:
                raise ValueError("opcodeId is too big to fit into uint16_t (MSB is reserved for the privileged flag)")
    return l


# All VIAL and diStorm3 code are based on the order of this list, do NOT edit!
REGISTERS = [
    "RAX", "RCX", "RDX", "RBX", "RSP", "RBP", "RSI", "RDI", "R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15", "XX",
    "EAX", "ECX", "EDX", "EBX", "ESP", "EBP", "ESI", "EDI", "R8D", "R9D", "R10D", "R11D", "R12D", "R13D", "R14D", "R15D", "XX",
    "AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI", "R8W", "R9W", "R10W", "R11W", "R12W", "R13W", "R14W", "R15W", "XX",
    "AL", "CL", "DL", "BL", "AH", "CH", "DH", "BH", "R8B", "R9B", "R10B", "R11B", "R12B", "R13B", "R14B", "R15B", "XX",
    "SPL", "BPL", "SIL", "DIL", "XX",
    "ES", "CS", "SS", "DS", "FS", "GS", "XX",
    "RIP", "XX",
    "ST0", "ST1", "ST2", "ST3", "ST4", "ST5", "ST6", "ST7", "XX",
    "MM0", "MM1", "MM2", "MM3", "MM4", "MM5", "MM6", "MM7", "XX",
    "XMM0", "XMM1", "XMM2", "XMM3", "XMM4", "XMM5", "XMM6", "XMM7", "XMM8", "XMM9", "XMM10", "XMM11", "XMM12", "XMM13", "XMM14", "XMM15", "XX",
    "YMM0", "YMM1", "YMM2", "YMM3", "YMM4", "YMM5", "YMM6", "YMM7", "YMM8", "YMM9", "YMM10", "YMM11", "YMM12", "YMM13", "YMM14", "YMM15", "XX",
    "CR0", "", "CR2", "CR3", "CR4", "", "", "", "CR8", "XX",
    "DR0", "DR1", "DR2", "DR3", "", "", "DR6", "DR7"]


def _MnemonicToIdentifier(mnemonic):
    """ Turn a mnemonic (e.g. "INT 3") into a valid C/Java identifier suffix. """
    return mnemonic.replace(" ", "_").replace(",", "")


def CreatePythonDict(mnemonicsIds):
    """ Create the opcodes dictionary for Python. """
    s = "\n"
    for i in mnemonicsIds:
        s += "0x%x: \"%s\", " % (mnemonicsIds[i], i)
        # Wrap the line once it gets too long (replaces the trailing space).
        if len(s) - s.rfind("\n") >= 76:
            s = s[:-1] + "\n"
    with open("python_output.txt", "w") as f:
        f.write(s)


def CreateJavaDict(mnemonicsIds):
    """ Create the opcodes dictionary/enum for Java. """
    s = "\nmOpcodes.put(0, OpcodeEnum.UNDEFINED);\n"
    for i in mnemonicsIds:
        s += "mOpcodes.put(0x%x, OpcodeEnum.%s);\n" % (mnemonicsIds[i], _MnemonicToIdentifier(i))
    with open("java_enums.txt", "w") as f:
        f.write(s)

    s = "\nUNDEFINED, "
    for i in mnemonicsIds:
        s += "%s, " % _MnemonicToIdentifier(i)
        # Wrap the line once it gets too long (replaces the trailing space).
        if len(s) - s.rfind("\n") >= 76:
            s = s[:-1] + "\n"
    with open("java_mnemonics.txt", "w") as f:
        f.write(s)


def DumpMnemonics():
    """
    Write defs.txt: the _InstructionType enum, the _RegisterType enum,
    the _MNEMONICS strings table and the _REGISTERS table.
    Afterwards generate the Python/Java dictionaries as well.
    """
    with open("defs.txt", "w") as f:
        f.write("typedef enum {\n\tI_UNDEFINED = 0, ")
        pos = 0
        l2 = sorted(mnemonicsIds.keys())
        for i in l2:
            s = "I_%s = %d" % (_MnemonicToIdentifier(i), mnemonicsIds[i])
            if i != l2[-1]:
                s += ","
            pos += len(s)
            if pos >= 70:
                s += "\n\t"
                pos = 0
            elif i != l2[-1]:
                s += " "
            f.write(s)
        f.write("\n} _InstructionType;\n\n")

        regsText = "const _WRegister _REGISTERS[] = {\n\t"
        regsEnum = "typedef enum {\n\t"
        # We always emit the previous register, so the last one can be emitted without a trailing comma.
        old = "*"  # Marks that there's no previous register yet.
        unused = 0
        for i in REGISTERS:
            if old != "*":
                if old == "XX":
                    # "XX" is only a separator between register groups - start a new line.
                    regsText += "\n\t"
                    regsEnum += "\n\t"
                    old = i
                    continue
                else:
                    regsText += "{%d, \"%s\"}," % (len(old), old)
                    if len(old):
                        regsEnum += "R_%s," % old
                    else:
                        # Nameless registers still occupy a slot to keep the indices intact.
                        regsEnum += "R_UNUSED%d," % unused
                        unused += 1
                    if i != "XX":
                        regsText += " "
                        regsEnum += " "
            old = i
        regsText += "{%d, \"%s\"}\n};\n" % (len(old), old)
        regsEnum += "R_" + old + "\n} _RegisterType;\n"

        f.write(regsEnum + "\n")

        # Mnemonics are sorted by insertion order. (Pseudo mnemonics depend on this!)
        s = "const unsigned char _MNEMONICS[] =\n\"\\x09\" \"UNDEFINED\\0\" "
        for i in sorted(mnemonicsIds.items(), key=lambda item: item[1]):
            s += "\"\\x%02x\" \"%s\\0\" " % (len(i[0]), i[0])
            if len(s) - s.rfind("\n") >= 76:
                s += "\\\n"
        s = s[:-1] + ";\n\n"  # Ignore last space.
        f.write(s)

        f.write(regsText + "\n")

    # Used for Python/Java dictionary of opcodeIds-->mnemonics.
    CreatePythonDict(mnemonicsIds)
    CreateJavaDict(mnemonicsIds)


O_NONE = 0
# REG standalone
O_REG = 1
# IMM standalone
O_IMM = 2
# IMM_1 standalone
O_IMM_1 = 4
# IMM_2 standalone
O_IMM_2 = 5
# DISP standalone
O_DISP = 3
# MEM uses DISP
O_MEM = 3
# PC uses IMM
O_PC = 2
# PTR uses IMM
O_PTR = 2

# Maps every operand type to the field it occupies in the diStorm3 decoded instruction structure.
_OPT2T = {
    OperandType.NONE: O_NONE,
    OperandType.IMM8: O_IMM,
    OperandType.IMM16: O_IMM,
    OperandType.IMM_FULL: O_IMM,
    OperandType.IMM32: O_IMM,
    OperandType.SEIMM8: O_IMM,
    OperandType.IMM16_1: O_IMM_1,
    OperandType.IMM8_1: O_IMM_1,
    OperandType.IMM8_2: O_IMM_2,
    OperandType.REG8: O_REG,
    OperandType.REG16: O_REG,
    OperandType.REG_FULL: O_REG,
    OperandType.REG32: O_REG,
    OperandType.REG32_64: O_REG,
    OperandType.FREG32_64_RM: O_REG,
    OperandType.RM8: O_MEM,
    OperandType.RM16: O_MEM,
    OperandType.RM_FULL: O_MEM,
    OperandType.RM32_64: O_MEM,
    OperandType.RM16_32: O_MEM,
    OperandType.FPUM16: O_MEM,
    OperandType.FPUM32: O_MEM,
    OperandType.FPUM64: O_MEM,
    OperandType.FPUM80: O_MEM,
    OperandType.R32_M8: O_MEM,
    OperandType.R32_M16: O_MEM,
    OperandType.R32_64_M8: O_MEM,
    OperandType.R32_64_M16: O_MEM,
    OperandType.RFULL_M16: O_MEM,
    OperandType.CREG: O_REG,
    OperandType.DREG: O_REG,
    OperandType.SREG: O_REG,
    OperandType.SEG: O_REG,
    OperandType.ACC8: O_REG,
    OperandType.ACC16: O_REG,
    OperandType.ACC_FULL: O_REG,
    OperandType.ACC_FULL_NOT64: O_REG,
    OperandType.MEM16_FULL: O_MEM,
    OperandType.PTR16_FULL: O_PTR,
    OperandType.MEM16_3264: O_MEM,
    OperandType.RELCB: O_PC,
    OperandType.RELC_FULL: O_PC,
    OperandType.MEM: O_MEM,
    OperandType.MEM_OPT: O_MEM,
    OperandType.MEM32: O_MEM,
    OperandType.MEM32_64: O_MEM,
    OperandType.MEM64: O_MEM,
    OperandType.MEM128: O_MEM,
    OperandType.MEM64_128: O_MEM,
    OperandType.MOFFS8: O_MEM,
    OperandType.MOFFS_FULL: O_MEM,
    OperandType.CONST1: O_IMM,
    OperandType.REGCL: O_REG,
    OperandType.IB_RB: O_REG,
    OperandType.IB_R_FULL: O_REG,
    OperandType.REGI_ESI: O_MEM,
    OperandType.REGI_EDI: O_MEM,
    OperandType.REGI_EBXAL: O_MEM,
    OperandType.REGI_EAX: O_MEM,
    OperandType.REGDX: O_REG,
    OperandType.REGECX: O_REG,
    OperandType.FPU_SI: O_REG,
    OperandType.FPU_SSI: O_REG,
    OperandType.FPU_SIS: O_REG,
    OperandType.MM: O_REG,
    OperandType.MM_RM: O_REG,
    OperandType.MM32: O_MEM,
    OperandType.MM64: O_MEM,
    OperandType.XMM: O_REG,
    OperandType.XMM_RM: O_REG,
    OperandType.XMM16: O_MEM,
    OperandType.XMM32: O_MEM,
    OperandType.XMM64: O_MEM,
    OperandType.XMM128: O_MEM,
    OperandType.REGXMM0: O_REG,
    OperandType.RM32: O_MEM,
    OperandType.REG32_64_M8: O_MEM,
    OperandType.REG32_64_M16: O_MEM,
    OperandType.WREG32_64: O_REG,
    OperandType.WRM32_64: O_REG,
    OperandType.WXMM32_64: O_MEM,
    OperandType.VXMM: O_REG,
    OperandType.XMM_IMM: O_IMM,
    OperandType.YXMM: O_REG,
    OperandType.YXMM_IMM: O_REG,
    OperandType.YMM: O_REG,
    OperandType.YMM256: O_MEM,
    OperandType.VYMM: O_REG,
    OperandType.VYXMM: O_REG,
    OperandType.YXMM64_256: O_MEM,
    OperandType.YXMM128_256: O_MEM,
    OperandType.LXMM64_128: O_MEM,
    OperandType.LMEM128_256: O_MEM
}


def CheckOTCollisions(ii):
    """
    Checks whether an instruction has two or more operands that use the same fields in the diStorm3 structure.
    E.G: ENTER 0x10, 0x1 --> This instruction uses two OT_IMM, which will cause a collision and use the same field twice which is bogus.
    Only prints a warning, the instruction itself is left untouched.
    """
    types = [_OPT2T[x] for x in ii.operands]
    # Regs cannot cause a collision, since each register is stored inside the operand itself.
    for i in types:
        if i != O_REG and types.count(i) > 1:
            print("**WARNING: Operand type collision for instruction: %s %s" % (ii.mnemonics[0], ii.tag))
            break


# This function for certain flow control related instructions will set their type.
def UpdateForFlowControl(ii):
    """ Set ii.flowControl according to the first mnemonic of the instruction (0 if it's not a flow control one). """
    if ii.mnemonics[0].startswith("CMOV"):
        ii.flowControl = FlowControl.CMOV
        return

    # Should I include SYSCALL ?
    pairs = [
        (["INT", "INT1", "INT 3", "INTO", "UD2"], FlowControl.INT),
        (["CALL", "CALL FAR"], FlowControl.CALL),
        (["RET", "IRET", "RETF"], FlowControl.RET),
        (["SYSCALL", "SYSENTER", "SYSRET", "SYSEXIT"], FlowControl.SYS),
        (["JMP", "JMP FAR"], FlowControl.UNC_BRANCH),
        (["JCXZ", "JO", "JNO", "JB", "JAE",
          "JZ", "JNZ", "JBE", "JA", "JS",
          "JNS", "JP", "JNP", "JL", "JGE",
          "JLE", "JG", "LOOP", "LOOPZ", "LOOPNZ"], FlowControl.CND_BRANCH)
    ]
    ii.flowControl = 0
    for p in pairs:
        if ii.mnemonics[0] in p[0]:
            ii.flowControl = p[1]
            return


def UpdateWritableDestinationOperand(ii):
    """ Mark dst-wrt flag for all Integer instructions that write to GPR/mem. """
    prefixes = ["MOV", "SET", "CMOV", "CMPXCHG"]
    for i in prefixes:
        if ii.mnemonics[0].startswith(i):
            ii.flags |= InstFlag.DST_WR
            return

    # These are matched exactly. A substring test would wrongly mark instructions
    # such as BT (which only reads its destination) because of BTR/BTS/BTC.
    # IN is listed explicitly since it does write to its destination.
    mnemonics = [
        "ADD", "OR", "ADC", "SBB", "AND", "SUB", "XOR", "INC", "DEC", "LEA", "XCHG",
        "ROL", "ROR", "RCL", "RCR", "SHL", "SHR", "SAL", "SAR", "SHLD", "SHRD",
        "NEG", "NOT", "MUL", "IMUL", "DIV", "IDIV",
        "POP", "BTR", "BTS", "BTC", "XADD", "BSWAP",
        "LZCNT", "MOVBE", "POPCNT", "CRC32", "SMSW", "IN"
    ]
    if ii.mnemonics[0] in mnemonics:
        ii.flags |= InstFlag.DST_WR
        return

    # Make sure it's an FPU instruction before we continue.
    if ii.classType != ISetClass.FPU:
        return

    fpu_mnemonics = [
        "FSTENV", "FSTCW", "FSAVE", "FSTSW", "FST", "FSTP",
        "FNSTENV", "FNSTCW", "FIST", "FISTP", "FNSAVE", "FBSTP", "FNSTSW"
    ]
    if ii.mnemonics[0] in fpu_mnemonics and len(ii.operands) > 0:
        # Ignore operands of FPU STi.
        if ii.operands[0] not in [OperandType.FPU_SI, OperandType.FPU_SSI, OperandType.FPU_SIS]:
            ii.flags |= InstFlag.DST_WR


def UpdatePrivilegedInstruction(opcodeIds, ii):
    """
    Checks whether a given mnemonic from the given list is privileged,
    and changes the relevant opcodeId to indicate so.
    Most significant bit of the OpcodeId is the indicator.
    opcodeIds is modified in place.
    """
    def IsPrivilegedMov(ii):
        " Check for MOV instruction with Debug/Control registers which is privileged. "
        return (ii.mnemonics[0] == "MOV") and ((OperandType.CREG in ii.operands) or (OperandType.DREG in ii.operands))

    privileged = [
        "LGDT", "LLDT", "LTR", "LIDT", "LMSW", "CLTS", "INVD",
        "WBINVD", "INVLPG", "HLT", "RDMSR", "WRMSR", "RDPMC", "RDTSC",
        # IO Sensitive Instructions, mostly allowed by ring0 only.
        "IN", "INS", "OUT", "OUTS", "CLI", "STI", "IRET"
    ]
    for index, mnemonic in enumerate(ii.mnemonics):
        if (mnemonic in privileged) or IsPrivilegedMov(ii):
            opcodeIds[index] |= _PRIVILEGED_OPCODE_BIT


def SetInstructionAffectedFlags(ii, flagsTuple):
    """
    Helper routine to set the m/t/u flags for an instruction info.
    flagsTuple is either a single value (modified flags only) or a tuple of up to
    three values (modified, tested, undefined); missing values default to 0.
    """
    # Pad tuple for fast access.
    if not isinstance(flagsTuple, tuple):
        flagsTuple = (flagsTuple,)
    flagsTuple += (0,) * (3 - len(flagsTuple))
    ii.modifiedFlags = flagsTuple[0]
    ii.testedFlags = flagsTuple[1]
    ii.undefinedFlags = flagsTuple[2]


def GetTestedFlagsForCondition(cond):
    """
    Return the (modified, tested, undefined) flags tuple for a condition code suffix (e.g. "NZ").
    Raises KeyError for an unknown condition.
    """
    OF, SF, ZF, PF, CF = CPUFlags.OF, CPUFlags.SF, CPUFlags.ZF, CPUFlags.PF, CPUFlags.CF
    Conditions = {
        "O": OF, "NO": OF, "B": CF, "AE": CF, "Z": ZF, "NZ": ZF, "BE": CF | ZF, "A": CF | ZF,
        "S": SF, "NS": SF, "P": PF, "NP": PF, "L": SF | OF, "GE": SF | OF, "LE": SF | OF | ZF, "G": SF | OF | ZF,
        # Special for FCMOV
        "U": PF, "NU": PF, "E": ZF, "NE": ZF, "NB": CF, "NBE": CF | ZF
    }
    # Return tested flags only.
    return (0, Conditions[cond], 0)


def UpdateInstructionAffectedFlags(ii):
    """
    Add flags for each instruction that is in the following table.
    We add modified/tested/undefined flags.
    Note that some instruction reset specific flags, but we don't record that here, we only care about actually modified ones.
    Instructions that aren't matched at all are left untouched.
    """
    # MNEM: MODIFIED, TEST, UNDEFINED.
    OF, SF, ZF, AF, PF, CF, IF, DF = CPUFlags.OF, CPUFlags.SF, CPUFlags.ZF, CPUFlags.AF, CPUFlags.PF, CPUFlags.CF, CPUFlags.IF, CPUFlags.DF
    InstByMnem = {
        "AAA": (AF | CF, AF, OF | SF | ZF | PF),
        "AAS": (AF | CF, AF, OF | SF | ZF | PF),
        "AAD": (SF | ZF | PF, 0, OF | AF | CF),
        "AAM": (SF | ZF | PF, 0, OF | AF | CF),
        "ADC": (OF | SF | ZF | AF | PF | CF, CF),
        "ADD": (OF | SF | ZF | AF | PF | CF),
        "AND": (OF | SF | ZF | PF | CF, 0, AF),
        "ARPL": (ZF),
        "BSF": (ZF, 0, OF | SF | ZF | AF | PF | CF),
        "BSR": (ZF, 0, OF | SF | ZF | AF | PF | CF),
        "BT": (CF, 0, OF | SF | ZF | AF | PF),
        "BTS": (CF, 0, OF | SF | ZF | AF | PF),
        "BTR": (CF, 0, OF | SF | ZF | AF | PF),
        "BTC": (CF, 0, OF | SF | ZF | AF | PF),
        "CLC": (CF),
        "CLD": (DF),
        "CLI": (IF),
        "CMC": (CF),
        "CMP": (OF | SF | ZF | AF | PF | CF),
        "CMPXCHG": (OF | SF | ZF | AF | PF | CF),
        "CMPXCHG8B": (ZF),
        "CMPXCHG16B": (ZF),  # Same inst as previous.
        "COMISD": (ZF | PF | CF),
        "COMISS": (ZF | PF | CF),
        "DAA": (SF | ZF | AF | PF | CF, AF | CF, OF),
        "DAS": (SF | ZF | AF | PF | CF, AF | CF, OF),
        "DEC": (OF | SF | ZF | AF | PF),
        "DIV": (0, 0, OF | SF | ZF | AF | PF | CF),
        "FCOMI": (ZF | PF | CF),
        "FCOMIP": (ZF | PF | CF),
        "FUCOMI": (ZF | PF | CF),
        "FUCOMIP": (ZF | PF | CF),
        "IDIV": (0, 0, OF | SF | ZF | AF | PF | CF),
        "IMUL": (OF | CF, 0, SF | ZF | AF | PF),
        "INC": (OF | SF | ZF | AF | PF),
        "UCOMISD": (ZF | PF | CF),
        "UCOMISS": (ZF | PF | CF),
        "IRET": (OF | SF | ZF | AF | PF | CF | IF | DF),
        "LAR": (ZF),
        "LOOPZ": (0, ZF),
        "LOOPNZ": (0, ZF),
        "LSL": (ZF),
        "LZCNT": (ZF | CF, 0, OF | SF | AF | PF),
        "MUL": (OF | CF, 0, SF | ZF | AF | PF),
        "NEG": (OF | SF | ZF | AF | PF | CF),
        # AF is undefined (not tested) after OR, same as XOR and TEST.
        "OR": (SF | ZF | PF, 0, AF),
        "POPCNT": (ZF),
        "POPF": (OF | SF | ZF | AF | PF | CF | IF | DF),
        "RSM": (OF | SF | ZF | AF | PF | CF | IF | DF),
        "SAHF": (SF | ZF | AF | PF | CF),
        "SBB": (OF | SF | ZF | AF | PF | CF, CF),
        "STC": (CF),
        "STD": (DF),
        "STI": (IF),
        "SUB": (OF | SF | ZF | AF | PF | CF),
        "TEST": (SF | ZF | PF, 0, AF),
        "VERR": (ZF),
        "VERW": (ZF),
        "XADD": (OF | SF | ZF | AF | PF | CF),
        "XOR": (SF | ZF | PF, 0, AF),
        # IO/String instructions:
        "MOVS": (0, DF),
        "LODS": (0, DF),
        "STOS": (0, DF),
        "CMPS": (OF | SF | ZF | AF | PF | CF, DF),
        "SCAS": (OF | SF | ZF | AF | PF | CF, DF),
        "INS": (0, DF),
        "OUTS": (0, DF)
    }

    # Check for mnemonics in the above table.
    for i in ii.mnemonics:
        if i in InstByMnem and (ii.flags & InstFlag.PSEUDO_OPCODE) == 0:
            SetInstructionAffectedFlags(ii, InstByMnem[i])
            return

    # Look carefully for SETcc/CMOVcc/FCMOVcc instructions.
    for i in ["SET", "CMOV", "FCMOV"]:
        if ii.mnemonics[0].startswith(i):
            SetInstructionAffectedFlags(ii, GetTestedFlagsForCondition(ii.mnemonics[0][len(i):]))
            return

    # See if it's a Jcc instruction.
    if ii.mnemonics[0][:1] == "J" and ii.mnemonics[0][:2] not in ["JM", "JC", "JE", "JR"]:
        SetInstructionAffectedFlags(ii, GetTestedFlagsForCondition(ii.mnemonics[0][1:]))
        return

    # Still no match, try special shift/rotate instructions.
    # Special shift/rotate instruction that with constant 1 have different flag affections:
    # First tuple is with constant 1, second tuple is with any count (CL).
    Shifts = [
        (["RCL", "RCR"], (OF | CF, CF), (CF, CF, OF)),
        (["ROL", "ROR"], (OF | CF), (CF, 0, OF)),
        (["SAL", "SAR", "SHL", "SHR"], (OF | SF | ZF | PF | CF, 0, AF), (SF | ZF | PF | CF, 0, OF | AF)),
        (["SHLD", "SHRD"], (OF | SF | ZF | PF | CF, 0, AF), (SF | ZF | PF | CF, 0, OF | AF))
    ]
    for names, constOneFlags, anyCountFlags in Shifts:
        if ii.mnemonics[0] in names:
            flags = constOneFlags if ii.operands[1] == OperandType.CONST1 else anyCountFlags
            SetInstructionAffectedFlags(ii, flags)
            return

    # The instruction doesn't affect any flags...
    return


# Table to hold shared inst-info.
sharedInfoDict = {}
# Table to hold shared flags.
flagsDict = {}


def FormatInstruction(ii):
    """
    Formats a string with all information relevant for diStorm InstInfo structure
    or the InstInfoEx. These are the internal structures diStorm uses for holding the instructions' information.
    Using this structure diStorm knows how to format an opcode when it reads it from the stream.

    An instruction information structure is found by its byte codes with a prefix of "II_".
    So for example ADD EAX, Imm32 instruction is II_00.
    Since there are several types of instructions information structures,
    the tables which point to these non-default InstInfo structures, will have to cast the pointer.

    Returns a tuple of (formatted string, whether it's an extended (InstInfoEx) structure).
    New entries are added to the global flagsDict/sharedInfoDict tables as needed.
    Raises ValueError if one of the indices doesn't fit in its field.
    """
    # There might be optional fields, if there's a 3rd operand or a second/third mnemonic.
    # Default type of structure is InstInfo, hence no optional fields.
    optFields = ""

    # Make sure the instruction can be fully represented using the diStorm3 _DecodeInst structure.
    CheckOTCollisions(ii)

    # Add flags for flow control instructions.
    UpdateForFlowControl(ii)

    # Add flags for writable destination operand.
    UpdateWritableDestinationOperand(ii)

    # Add affected modified/tested/undefined flags for instruction.
    UpdateInstructionAffectedFlags(ii)

    # Pad mnemonics to three, in case EXMNEMONIC/2 isn't used (so we don't get an exception).
    isPseudo = (ii.flags & InstFlag.PSEUDO_OPCODE) == InstFlag.PSEUDO_OPCODE
    mnems = TranslateMnemonics(ii.classType if isPseudo else None, ii.mnemonics) + [0, 0]

    # Mark whether the instruction is privileged, by setting MSB of the OpcodeId field.
    UpdatePrivilegedInstruction(mnems, ii)

    # Pad operands to at least three (so we don't get an exception too, since there might be instructions with no operands at all).
    ops = ii.operands + [OperandType.NONE, OperandType.NONE, OperandType.NONE, OperandType.NONE]

    # Is it an extended structure?
    if ii.flags & InstFlag.EXTENDED:
        # Since there's a second and/or a third mnemonic, use the InstInfoEx structure.
        flagsEx = 0
        # Fix flagsEx to have the VEX flags, except PRE_VEX.
        if ii.flags & InstFlag.PRE_VEX:
            flagsEx = ii.flags >> InstFlag.FLAGS_EX_START_INDEX
        # If there's a third operand, use it, otherwise NONE.
        op3 = ops[2] if (ii.flags & InstFlag.USE_OP3) == InstFlag.USE_OP3 else OperandType.NONE
        op4 = ops[3] if (ii.flags & InstFlag.USE_OP4) == InstFlag.USE_OP4 else OperandType.NONE
        if flagsEx >= 256:  # Assert the size of flagsEx is enough to hold this value.
            raise ValueError("FlagsEx exceeded its 8 bits. Change flagsEx of _InstInfoEx to be uint16!")
        # Concat the mnemonics and the third operand.
        optFields = ", 0x%x, %d, %d, %d, %d" % (flagsEx, op3, op4, mnems[1], mnems[2])

    # Notice we filter out internal bits from flags.
    flags = ii.flags & ((1 << InstFlag.FLAGS_EX_START_INDEX) - 1)
    # Allocate a slot for this flag if needed.
    if flags not in flagsDict:
        flagsDict[flags] = len(flagsDict)
    # Get the flags-index.
    flagsIndex = flagsDict[flags]
    if flagsIndex >= 256:
        raise ValueError("FlagsIndex exceeded its 8 bits. Change flags of _InstInfo to be uint16!")

    # InstSharedInfo:
    sharedInfo = (flagsIndex, ops[1], ops[0], (ii.classType << 3) | ii.flowControl, ii.modifiedFlags, ii.testedFlags, ii.undefinedFlags)
    if sharedInfo not in sharedInfoDict:
        sharedInfoDict[sharedInfo] = len(sharedInfoDict)
    # Get the shared-info-index.
    sharedInfoIndex = sharedInfoDict[sharedInfo]
    if sharedInfoIndex >= 2**16:
        raise ValueError("SharedInfoIndex exceeded its 16 bits. Change type of sharedInfoIndex in _InstInfo!")

    fields = "0x%x, %d" % (sharedInfoIndex, mnems[0])
    # "Structure-Name" = II_Bytes-Code {Fields + Optional-Fields}.
    return ("\t/*II%s*/ {%s%s}" % (ii.tag, fields, optFields), (ii.flags & InstFlag.EXTENDED) != 0)


def FilterTable(table):
    """ Tells the DB whether to generate the given table. """
    # All tables must go to output.
    return True


def GeneratePseudoMnemonicOffsets():
    """
    Generate the static offset tables for pseudo compare instructions both for SSE and AVX.
    The table is built in such a way that each cell holds the offset from the first pseudo mnemonic to the indexed one.
    """
    # Lengths of pseudo mnemonics (SSE=CMPxxxYY + null + lengthByte)
    lengths = [3 + len(x) + 2 + 2 for x in SSECmpTypes]
    s = "uint16_t CmpMnemonicOffsets[8] = {\n" + ", ".join([str(sum(lengths[:i])) for i in range(len(lengths))]) + "\n};\n"
    # (AVX=VCMPxxxYY + null + lengthByte).
    lengths = [4 + len(x) + 2 + 2 for x in AVXCmpTypes]
    s += "uint16_t VCmpMnemonicOffsets[32] = {\n" + ", ".join([str(sum(lengths[:i])) for i in range(len(lengths))]) + "\n};"
    return s


def CreateTables(db):
    """
    This is the new tables generator code as for May 2011.
    Its purpose is to return all tables and structures ready to use at once by diStorm.

    The information is divided into 3 categories (arrays):
    1) The InstructionsTree root table, which holds all id's (InstNode) and reflects a tree, inside a flat array.
    2) The InstInfos table, which holds all Instruction-Information structures - the actual (basic) info per instruction.
    3) The InstInfosEx table, which holds all extended Instruction-Information structures.

    Each array should be flat one defined only once. This means that we need to serialize all instruction-set tables into a single
    table of pointers, kinda. This pointer is now a InstNode, which is really a 16 bits structure.
    The low 13 bits are an index. The upper 3 bits are the type of what the index points to.
    So basically, an index can be up to 8192 which is good enough as for now, cause we only have around ~5k entries in the tree.
    However, it can be an index into the InstInfos or InstInfosEx tables, depends on the type.

    A note from Feb 2007 - This new data layout in contrast with the old data layout saves more memory space (~12KB).
    This new serialization should even save around 25kb! Because now we don't use real pointers anymore, only this tiny formatted InstNode.

    The new method uses the last method, but instead of dividing the tree into many id's and pointer's tables,
    it will now concatenate them all into the relevant tables. And instead of a real pointer to an Instruction-Information structure,
    we will use an index into each table.

    For example, say we have the following instructions table (byte code and mnemonic):
    0 - AND
    1 - XOR
    2 - OR
    3 - EMPTY (NO-INSTRUCTION-IS-ENCODED)
    4 - EMPTY
    5 - SHL
    6 - SHR
    7 - EMPTY

    Old Documentation:
    ------------------
    So instead of generating the following old data layout:
    {&II_00, &II_01, &II_02, NULL, NULL, &II_05, &II_06, NULL}
    (Actually the old layout is a bit more complicated and consumes another byte for indicating the type of node.)

    Anyways, we can generate the follow table:
    {1, 2, 3, 0, 0, 4, 5, 0}
    This time the table is in bytes, a byte is enough to index 256 instructions (which is a Full sized table).
    However, an id's table is not enough, we need another table, the pointers table, which will look like this (following the above example):
    {NULL, &II_00, &II_01, &II_02, &II_05, &II_06}

    Note that if there are no EMPTY instructions in the table the first NULL entry will be omitted!

    Assuming most of the space we managed to spare goes for telling diStorm "hey, this instruction is not encoded", we spared around 12KB.
    So all empty instructions points to the same first entry inside its corresponding pointers table.
    This way we pay another array of bytes for each table, but eliminate all NULL's.

    So the actual node looks something like this:
    {8, &table_00_ids, &table_00_pointers}
    Which costs another dereference inside diStorm decoder.

    New Documentation:
    ------------------
    As you can see, I did a pass back in 2007 to spare some empty entries in the tables.
    But I kept using real pointers, which took lots of space. This time,
    I am going to use a flat array which will represent the whole tree.
    And combine all data into arrays, and spare even the old InstNode which was a small structure that says
    what's the type of the table it points to. This type stuff will now be embedded inside the InstNode integer.

    The new tables look like this (according to the above example):
    InstInfo InstInfos[] = {
        {AND info...},
        {XOR info...},
        {OR info...},
        {SHL info...},
        {SHR info...}
    };

    And another InstNodes table:
    InstNode InstructionsTree[] = {
        0 | INSTINFO << 13,
        1 | INSTINFO << 13,
        2 | INSTINFO << 13,
        -1,
        -1,
        3 | INSTINFO << 13,
        4 | INSTINFO << 13,
        -1,
    };

    The example happened to be a single table.
    But suppose there's another index which points to another table in the tree, it would look like:
    {TableIndexInInstructionsTree | TABLE << 13}
    This way we know to read another byte and follow the next table...

    :!:NOTE:!: You MUST iterate a table with GenBlock wrapper, otherwise you might NOT get all instructions from the DB!
               Refer to x86db.py-class GenBlock for more information.
    """
    indexShift = 13  # According to InstNode in instructions.h.
    InstInfos = []
    InstInfosEx = []
    InstructionsTree = []
    externTables = []
    nextTableIndex = 256  # Root tree takes 256 nodes by default, so skip them.
    # Scan all tables in the DB.
    for x in db.GenerateTables(FilterTable):
        # Don't make static definitions for specific exported tables.
        if x.tag in ["_0F_0F", "_0F", "_0F_3A", "_0F_38"]:
            # Store the index of these special tables, they are used directly in instructions.c.
            externTables.append((x.tag, len(InstructionsTree)))
        # Notice we use GenBlock for the special instructions, this is a must, otherwise we miss instructions from the DB.
        for i in x86db.GenBlock(x):
            if isinstance(i, x86db.InstructionInfo):
                formattedII, isExtended = FormatInstruction(i)
                if isExtended:
                    InstInfosEx.append(formattedII)
                    index = len(InstInfosEx) - 1
                    InstructionsTree.append((NodeType.INFOEX << indexShift | index, i.tag))
                else:
                    InstInfos.append(formattedII)
                    index = len(InstInfos) - 1
                    InstructionsTree.append((NodeType.INFO << indexShift | index, i.tag))
            elif isinstance(i, x86db.InstructionsTable):
                InstructionsTree.append(((i.type << indexShift) | nextTableIndex, i.tag))
                nextTableIndex += i.size  # This assumes we walk on the instructions tables in BFS order!
            else:
                # False indicates this entry points nothing.
                InstructionsTree.append((0, ""))

    # The exported instructions have to be formatted before the flags/shared-info tables are dumped,
    # since formatting them might add items to these tables!
    exported = "\n".join(
        ["_InstInfo II_%s =%s;" % (i.mnemonics[0] if i.mnemonics[0][0] != '_' else i.mnemonics[0][1:], FormatInstruction(i)[0])
         for i in db.getExportedInstructions()])
    # Both dictionaries map a value to its index, dump them ordered by that index.
    flagsTable = "_iflags FlagsTable[%d] = {\n%s\n};" % (
        len(flagsDict), ",\n".join(["0x%x" % flags for flags in sorted(flagsDict, key=flagsDict.get)]))
    tablesIndices = "\n".join(["_InstNode Table%s = %d;" % (tag, index) for tag, index in externTables])
    instInfos = "_InstInfo InstInfos[%d] = {\n%s\n};" % (len(InstInfos), ",\n".join(InstInfos))
    instInfosEx = "_InstInfoEx InstInfosEx[%d] = {\n%s\n};" % (len(InstInfosEx), ",\n".join(InstInfosEx))
    tree = "_InstNode InstructionsTree[%d] = {\n%s\n};" % (
        len(InstructionsTree),
        ",\n".join(["/* %x - %s */ %s" % (index, node[1], "0" if node[0] == 0 else "0x%x" % node[0])
                    for index, node in enumerate(InstructionsTree)]))
    # sharedInfoDict must be evaluated last, since the exported instructions above add items to it!
    sharedInfoTable = "_InstSharedInfo InstSharedInfoTable[%d] = {\n%s\n};" % (
        len(sharedInfoDict),
        ",\n".join(["{%s}" % str(info)[1:-1] for info in sorted(sharedInfoDict, key=sharedInfoDict.get)]))

    s = [exported, flagsTable, tablesIndices, instInfos, instInfosEx, tree, sharedInfoTable,
         GeneratePseudoMnemonicOffsets()]
    return "\n\n".join(s)


def main():
    """ Build the instructions DB and write all generated tables to output.txt (and the mnemonics files). """
    # Init the 80x86/x64 instructions sets DB.
    db = x86db.InstructionsDB()
    x86InstructionsSet = x86sets.Instructions(db.SetInstruction)

    # Open file for output.
    with open("output.txt", "w") as f:
        f.write("/*\n * GENERATED BY disOps at %s\n */\n\n" % time.asctime())

        # Generate all tables of id's and pointers with the instructions themselves.
        lists = CreateTables(db)
        # Write them to the file also.
        f.write(lists)

    DumpMnemonics()

    print("The file output.txt was written successfully")


main()