283 lines
8.2 KiB
Python
283 lines
8.2 KiB
Python
from idc import *
|
|
import sys
|
|
import binascii
|
|
from sets import Set
|
|
import sys
|
|
import re
|
|
|
|
def replace_farptr(matchobj):
    """Return the matched text with every 'far ptr ' occurrence removed.

    Used as a substitution callback: 'far ptr 1817:16151413' becomes
    '1817:16151413'.
    """
    return matchobj.group().replace('far ptr ', '')
|
|
|
|
def replace_ptr(matchobj):
    """Wrap the text after the last space of the match in square brackets.

    'byte ptr 012345' becomes 'byte ptr [012345]'. The match must contain
    at least one space (checked with an assertion).
    """
    head, space, tail = matchobj.group().rpartition(' ')
    assert space != ''
    return head + ' [' + tail + ']'
|
|
|
|
def replace_ptr2(matchobj):
    """Wrap the text after the last ':' of the match in square brackets.

    'dword ptr es:1234' becomes 'dword ptr es:[1234]'. The match must
    contain a colon (checked with an assertion).
    """
    head, colon, tail = matchobj.group().rpartition(':')
    assert colon != ''
    return head + colon + '[' + tail + ']'
|
|
|
|
def replace_hex(matchobj):
    """Strip the trailing hex marker ('h' or 'H') from a matched constant.

    Intended as the substitution callback for p_hex, whose pattern accepts
    both a lowercase and an uppercase suffix; both are removed here.

    Returns the matched text without its final 'h'/'H'. Text that does not
    end in such a suffix is returned unchanged.
    """
    nstr = matchobj.group()
    # Sanity check: 8-bit register names must never be mistaken for numbers.
    assert not (nstr in ['ah', 'bh', 'ch', 'dh'])
    if nstr[-1:] in ('h', 'H'):
        nstr = nstr[:-1]
    return nstr
|
|
|
|
def repl(matchobj):
    """Move a leading hex displacement inside the following bracket group.

    '6050403h[eax]' becomes '[eax+06050403]': the displacement is parsed as
    hexadecimal and re-emitted as eight lowercase hex digits. Spaces inside
    the match are dropped. Both 'h' and 'H' suffixes are accepted, matching
    what the p_repl pattern allows.

    Returns the original matched text untouched when it has no '[' or ']'.
    """
    original = matchobj.group()
    compact = original.replace(' ', '')
    open_pos = compact.find('[')
    close_pos = compact.find(']')
    if open_pos == -1 or close_pos == -1:
        return original
    # Everything before '[' is the displacement with its hex suffix.
    disp_text = compact[0:open_pos].strip().rstrip('hH')
    disp = int(disp_text, 16)
    return compact[open_pos:close_pos] + '+' + ('%08x' % disp) + ']'
|
|
|
|
# Pre-compiled patterns used by the disassembly fix-up helpers.

# A bare segment override such as 'es:'.
p_seg = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)')

# A segment override followed by an absolute hex offset ('ds:1234h').
p_seg_abs = re.compile(r'(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*h')

# 'far ptr SEG:OFF' operands.
p_farptr = re.compile(r'far\sptr\s[0-9a-fA-F]+:[0-9a-fA-F]+')

# '<size> ptr <number>' without and with a segment override.
p_ptr = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s[0-9][0-9a-fA-F]*')
p_ptr2 = re.compile(r'(byte|word|dword|qword|oword|fword)\sptr\s(es:|ds:|cs:|fs:|gs:|ss:)[0-9][0-9a-fA-F]*')

# A hex constant with its 'h'/'H' suffix.
p_hex = re.compile(r'([0-9][0-9a-fA-F]*)(h|H)')

# Any run of whitespace.
p_spaces = re.compile(r'(\s+)')

# A hex displacement immediately followed by a bracketed expression.
p_repl = re.compile(r'[0-9][0-9a-fA-F]*(h|H)\s*\[[^\]]+\]')
|
|
|
|
# Mnemonic rewrites as (from, to) pairs; misc_replacements() compares each
# 'from' against the whole opcode string.
# NOTE(review): 'int 3' contains a space, while ida_disasm_fix() only passes
# either the text before the first space or a string without spaces, so this
# entry looks unreachable from that caller - confirm before relying on it.
replacements = [
    ('retn', 'ret'), ('retnw', 'ret'), ('iretw', 'iret'), ('retfw', 'retf'),
    ('pushfw', 'pushf'), ('popfw', 'popf'), ('pushaw', 'pusha'),
    ('popaw', 'popa'), ('enterw', 'enter'),
    ('cmova', 'cmovnbe'), ('cmovg', 'cmovnle'), ('cmovge', 'cmovnl'),
    ('leavew', 'leave'),
    ('int 3', 'int 03'),
]
|
|
|
|
def is_invalid_insn(insn_binary):
    """Tell whether an encoded instruction should be skipped.

    insn_binary is a string of raw instruction bytes. Leading segment
    override prefix bytes (26/2e/36/3e/64/65) are skipped, then the next
    two (or three) bytes are compared against a fixed list of encodings.

    Returns True for:
      * input with fewer than two bytes left after the prefixes (this now
        also covers empty and prefix-only input, which used to raise
        IndexError);
      * 0f 19, 0f 24, 0f 26, 0f a6, 0f a7;
      * cd 20 (vxdcall);
      * d6 (setalc);
      * 0f 0d followed by 13, or 0f 0d with no third byte at all.
    Returns False otherwise.

    NOTE(review): an instruction consisting of a single byte after its
    prefixes is reported as invalid. That is the original behaviour and is
    kept as-is - confirm it is intended.
    """
    size = len(insn_binary)
    k = 0
    # Skip segment-override prefixes without running off the end.
    while k < size and ord(insn_binary[k]) in (0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65):
        k += 1
    # Two bytes are inspected below; bail out if they are not both present.
    if k >= size - 1:
        return True
    b = ord(insn_binary[k])
    b2 = ord(insn_binary[k + 1])
    if b == 0x0f and b2 in (0x19, 0x24, 0x26, 0xa6, 0xa7):
        return True
    if b == 0xcd and b2 == 0x20:  # vxdcall
        return True
    if b == 0xd6:  # setalc
        return True
    if b == 0x0f and b2 == 0x0d:
        # The third byte decides; a truncated encoding counts as invalid.
        if k + 2 >= size:
            return True
        if ord(insn_binary[k + 2]) == 0x13:
            return True
    return False
|
|
|
|
# Miscellaneous replacements
def misc_replacements(opcode_str):
    """Rewrite an opcode string using the module-level replacements table.

    The table is walked in order; whenever the current value equals a
    'from' entry it is replaced by the matching 'to' entry and the walk
    continues with the new value. Strings with no match are returned as-is.
    """
    current = opcode_str
    for old, new in replacements:
        if current == old:
            current = new
    return current
|
|
|
|
def remove_ds_prefix(insn_binary, rest_str):
    """Decide whether the 'ds:' override text should be dropped.

    Looks for the first 0x3e byte in insn_binary. Returns True when that
    byte is found and every byte before it (possibly none) is one of
    0x66, 0x67, 0xF0, 0xF2, 0xF3. In every other case the result is
    whether rest_str contains the text 'ds:'.
    """
    mentions_ds = rest_str.find('ds:') != -1
    pos = insn_binary.find('\x3e')
    if pos == -1:
        # No 0x3e byte at all: decide purely from the operand text.
        return mentions_ds
    # An empty run (0x3e is the first byte) trivially satisfies the check.
    only_prefixes_before = all(
        ord(ch) in (0x66, 0x67, 0xF0, 0xF2, 0xF3) for ch in insn_binary[:pos])
    if only_prefixes_before:
        return True
    return mentions_ds
|
|
|
|
def replace_ds_seg(matchobj):
    """Bracket the offset of a 'seg:offset' match, dropping a 'ds:' override.

    'ds:1234h' becomes '[1234h]'; any other three-character prefix is kept,
    e.g. 'es:1234h' becomes 'es:[1234h]'.
    """
    text = matchobj.group()
    bracketed = '[' + text[3:] + ']'
    if text[0:2].lower() != 'ds':
        return text[0:3] + bracketed
    return bracketed
|
|
|
|
def replace_seg(matchobj):
    """Bracket the offset of a 'seg:offset' match.

    A leading 'ds' (any case) and its colon are dropped, so 'ds:1234h'
    becomes '[1234h]'; other prefixes are kept: 'fs:10h' -> 'fs:[10h]'.

    NOTE(review): this produces exactly the same result as replace_ds_seg;
    confirm whether the two were meant to differ.
    """
    text = matchobj.group()
    kept_prefix = '' if text[0:2].lower() == 'ds' else text[0:3]
    return kept_prefix + '[' + text[3:] + ']'
|
|
|
|
def replace_lea_seg(matchobj):
    """Drop the first three characters of the match and bracket the rest.

    'es:1234h' becomes '[1234h]'.
    """
    offset_text = matchobj.group()[3:]
    return '[' + offset_text + ']'
|
|
|
|
def replace_segments(insn_binary, opcode_str, rest_str):
    """Rewrite segment-prefixed absolute offsets in an operand string.

    For 'lea' (any case) the segment is removed altogether: absolute
    'seg:NNNNh' operands become '[NNNNh]', and if none were present every
    bare segment override is simply deleted.

    For other opcodes '?s:01020304' becomes '?s:[01020304]', except that
    'ds:' turns into plain '[01020304]'. The callback is picked from
    remove_ds_prefix().
    """
    if opcode_str.lower() != 'lea':
        if remove_ds_prefix(insn_binary, rest_str):
            handler = replace_ds_seg
        else:
            handler = replace_seg
        return p_seg_abs.sub(handler, rest_str)

    # LEA: strip segments from absolute offsets first ...
    bracketed = p_seg_abs.sub(replace_lea_seg, rest_str)
    if bracketed != rest_str:
        return bracketed
    # ... and fall back to deleting overrides on relative operands.
    return p_seg.sub('', rest_str)
|
|
|
|
# Apply fixes to IDA opcode
def ida_disasm_fix(insn_binary, insn_str):
    """Normalise one line of IDA disassembly text.

    insn_binary holds the raw instruction bytes (consulted for the segment
    handling), insn_str the disassembly text. Returns the rewritten text.
    """
    # Collapse every run of whitespace (tabs included) into one space.
    insn_str = p_spaces.sub(r' ', insn_str)

    # Split off the mnemonic so operand rewrites cannot touch it.
    opcode_str, separator, rest_str = insn_str.partition(' ')
    if not separator:
        # Operand-less instruction such as 'cli'.
        return misc_replacements(insn_str)

    # Drop the 'small' keyword.
    rest_str = rest_str.replace('small ', '')

    # '6050403h[eax], al' -> '[eax+06050403], al'
    rest_str = p_repl.sub(repl, rest_str)

    rest_str = replace_segments(insn_binary, opcode_str, rest_str)

    # Strip the remaining 'ds:' text when remove_ds_prefix() says so.
    if remove_ds_prefix(insn_binary, rest_str):
        rest_str = rest_str.replace('ds:', '')

    # 'xmmword' -> 'oword'
    rest_str = rest_str.replace('xmmword', 'oword')

    # In order: strip the hex suffix, bracket 'ptr 012345' operands (plain
    # and segment-prefixed), then drop 'far ptr ' from far operands.
    rewrites = (
        (p_hex, replace_hex),
        (p_ptr, replace_ptr),
        (p_ptr2, replace_ptr2),
        (p_farptr, replace_farptr),
    )
    for pattern, handler in rewrites:
        rest_str = pattern.sub(handler, rest_str)

    return misc_replacements(opcode_str) + ' ' + rest_str
|
|
|
|
def get_insn(ea, len):
    """Return len bytes starting at address ea as a string.

    Each byte is fetched with Byte() and converted with chr().
    """
    return ''.join([chr(Byte(ea + offset)) for offset in range(0, len)])
|
|
|
|
def insn_write(f, insn_binary, insn_str, header):
    """Write one instruction record to f and flush it.

    With header true the record is a C-style initialiser:
        {<size>, "\\xNN\\xNN...", "<insn_str>"},
    otherwise it is the hex dump of the bytes, a space and insn_str.
    Every record ends with a newline. insn_binary must not be empty.
    """
    assert len(insn_binary) != 0
    if header:
        escaped = ''.join(['\\x%02x' % ord(ch) for ch in insn_binary])
        record = '{%d, "' % len(insn_binary) + escaped + '", "' + insn_str + '"},\n'
    else:
        record = binascii.hexlify(insn_binary) + (' %s\n' % insn_str)
    f.write(record)
    f.flush()
|
|
|
|
# Normalize operand (replace numeric operands with -1)
def normalize_operand(op_type, op_str):
    """Return "-1" for o_mem/o_displ/o_imm/o_near/o_far operands.

    Operands of any other type keep their text (op_str) unchanged.
    """
    is_numeric = op_type in [o_mem, o_displ, o_imm, o_near, o_far]
    return "-1" if is_numeric else op_str
|
|
|
|
def is_unique(set, ea, insn_str, mnem):
    """Report whether the instruction at ea has a not-yet-seen shape.

    The shape is the mnemonic plus the first three operands after
    normalize_operand(). A new shape is added to the given set and True is
    returned; a known shape returns False. Text starting with 'db' (any
    case) is always unique and is never recorded.
    """
    # Consider undisassemblable opcodes as unique
    if insn_str[0:2].lower() == 'db':
        return True

    op_types = [GetOpType(ea, index) for index in range(3)]
    op_texts = [GetOpnd(ea, index) for index in range(3)]
    normalized = [normalize_operand(kind, text)
                  for kind, text in zip(op_types, op_texts)]
    key = "%s|%s|%s|%s" % tuple([mnem] + normalized)

    if key in set:
        return False
    set.add(key)
    return True
|
|
|
|
def generate_x86(filename):
    """Enumerate x86 opcodes at the entry point and dump their disassembly.

    Twenty bytes at the entry point are patched with 0x10..0x23, then every
    combination of the first three bytes is tried. Results go to filename
    and a trace goes to filename + '.log'. Both files are closed even when
    an error interrupts the scan; 'Finished' is only logged on success.
    """
    ea = GetEntryPoint(GetEntryOrdinal(0))
    for i in range(0, 20):
        PatchByte(ea + i, i + 0x10)

    flog = open(filename + '.log', 'wt')
    try:
        f = open(filename, 'wt')
        try:
            _scan_opcodes(ea, f, flog)
        finally:
            f.close()
        flog.write('Finished\n')
    finally:
        flog.close()


def _strip_disasm(text):
    """Cut a trailing ';' comment off a disassembly line and trim spaces."""
    comment_pos = text.find(';')
    if comment_pos != -1:
        text = text[0:comment_pos]
    return text.strip(' ')


def _scan_opcodes(ea, f, flog):
    """Try every three-byte combination at ea; write results to f and flog."""
    seen = set()
    processed = 0
    insn_len = 0
    for p0 in range(0x10, 0x110):
        # Cached shapes are irrelevant once the first byte changes.
        seen.clear()
        PatchByte(ea, p0 & 0xff)
        for p1 in range(0x10, 0x110):
            PatchByte(ea + 1, p1 & 0xff)
            for p2 in range(0x10, 0x110):
                PatchByte(ea + 2, p2 & 0xff)
                # The value returned by MakeCode is used as the length.
                insn_len = MakeCode(ea)
                text = GetDisasm(ea)
                mnem = GetMnem(ea)
                text = _strip_disasm(text)

                if text[0:2] == 'db' or text == '' or insn_len == 0:
                    # Not disassembled: record ten raw bytes as 'db'.
                    insn_binary = get_insn(ea, 10)
                    flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), text))
                    insn_write(f, insn_binary, 'db', False)
                    flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "db"\n')
                    continue

                if not is_unique(seen, ea, text, mnem):
                    continue

                insn_binary = get_insn(ea, insn_len)
                flog.write('INPUT hex: %20s; disasm: "%s"\n' % (binascii.hexlify(insn_binary), text))
                if is_invalid_insn(insn_binary):
                    # Newline added so the next log entry starts on its own line.
                    flog.write('*** Skipping invalid opcode ***\n')
                    continue

                # Add unique disasms to file
                text = ida_disasm_fix(insn_binary, text)
                flog.write('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tOUTPUT disasm: "%s"\n' % text)
                flog.flush()
                insn_write(f, insn_binary, text, False)

                processed += 1
                if processed % 1000 == 0:
                    print('%d opcodes processed' % processed)
                if insn_len == 2 or insn_len == 1:
                    break  # Optimization: break if third byte does not matter
            if insn_len == 1:
                break  # Optimization: break if second byte does not matter
|
|
|
|
def main(filename="./intel-x86-opcodes.txt"):
    """Run the opcode generation, writing to filename.

    The default keeps the path that used to be hard-coded, so calling
    main() with no arguments behaves as before.
    """
    generate_x86(filename)


if __name__ == "__main__":
    main()
|
|
|
|
|