view gen_x86.c @ 995:2bc27415565b

Fix some stuff with interrupt timing. The change in adjust_int_cycle gets Overdrive working again (vint was not being preferred over hint in some cases). One of the changes seems to have broken Fatal Rewind again, but no other regressions that I can see.
author Michael Pavone <pavone@retrodev.com>
date Sat, 30 Apr 2016 08:37:55 -0700
parents dc71e32091d8
children cd6048e0397b
line wrap: on
line source

/*
 Copyright 2013 Michael Pavone
 This file is part of BlastEm.
 BlastEm is free software distributed under the terms of the GNU General Public License version 3 or greater. See COPYING for full license text.
*/
#include "gen_x86.h"
#include "mem.h"
#include "util.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

#define REX_RM_FIELD 0x1
#define REX_SIB_FIELD 0x2
#define REX_REG_FIELD 0x4
#define REX_QUAD 0x8

#define OP_ADD 0x00
#define OP_OR  0x08
#define PRE_2BYTE 0x0F
#define OP_ADC 0x10
#define OP_SBB 0x18
#define OP_AND 0x20
#define OP_SUB 0x28
#define OP_XOR 0x30
#define OP_CMP 0x38
#define PRE_REX 0x40
#define OP_PUSH 0x50
#define OP_POP 0x58
#define OP_MOVSXD 0x63
#define PRE_SIZE 0x66
#define OP_JCC 0x70
#define OP_IMMED_ARITH 0x80
#define OP_TEST 0x84
#define OP_XCHG 0x86
#define OP_MOV 0x88
#define PRE_XOP 0x8F
#define OP_XCHG_AX 0x90
#define OP_CDQ 0x99
#define OP_PUSHF 0x9C
#define OP_POPF 0x9D
#define OP_MOV_I8R 0xB0
#define OP_MOV_IR 0xB8
#define OP_SHIFTROT_IR 0xC0
#define OP_RETN 0xC3
#define OP_MOV_IEA 0xC6
#define OP_SHIFTROT_1 0xD0
#define OP_SHIFTROT_CL 0xD2
#define OP_LOOP 0xE2
#define OP_CALL 0xE8
#define OP_JMP 0xE9
#define OP_JMP_BYTE 0xEB
#define OP_NOT_NEG 0xF6
#define OP_SINGLE_EA 0xFF

#define OP2_JCC 0x80
#define OP2_SETCC 0x90
#define OP2_BT 0xA3
#define OP2_BTS 0xAB
#define OP2_IMUL 0xAF
#define OP2_BTR 0xB3
#define OP2_BTX_I 0xBA
#define OP2_BTC 0xBB
#define OP2_MOVSX 0xBE
#define OP2_MOVZX 0xB6

#define OP_EX_ADDI 0x0
#define OP_EX_ORI  0x1
#define OP_EX_ADCI 0x2
#define OP_EX_SBBI 0x3
#define OP_EX_ANDI 0x4
#define OP_EX_SUBI 0x5
#define OP_EX_XORI 0x6
#define OP_EX_CMPI 0x7

#define OP_EX_ROL 0x0
#define OP_EX_ROR 0x1
#define OP_EX_RCL 0x2
#define OP_EX_RCR 0x3
#define OP_EX_SHL 0x4
#define OP_EX_SHR 0x5
#define OP_EX_SAL 0x6 //identical to SHL
#define OP_EX_SAR 0x7

#define OP_EX_BT  0x4
#define OP_EX_BTS 0x5
#define OP_EX_BTR 0x6
#define OP_EX_BTC 0x7

#define OP_EX_TEST_I 0x0
#define OP_EX_NOT    0x2
#define OP_EX_NEG    0x3
#define OP_EX_MUL    0x4
#define OP_EX_IMUL   0x5
#define OP_EX_DIV    0x6
#define OP_EX_IDIV   0x7

#define OP_EX_INC     0x0
#define OP_EX_DEC     0x1
#define OP_EX_CALL_EA 0x2
#define OP_EX_JMP_EA  0x4
#define OP_EX_PUSH_EA 0x6

#define BIT_IMMED_RAX 0x4
#define BIT_DIR 0x2
#define BIT_SIZE 0x1


enum {
	X86_RAX = 0,
	X86_RCX,
	X86_RDX,
	X86_RBX,
	X86_RSP,
	X86_RBP,
	X86_RSI,
	X86_RDI,
	X86_AH=4,
	X86_CH,
	X86_DH,
	X86_BH,
	X86_R8=0,
	X86_R9,
	X86_R10,
	X86_R11,
	X86_R12,
	X86_R13,
	X86_R14,
	X86_R15
} x86_regs_enc;

char * x86_reg_names[] = {
#ifdef X86_64
	"rax",
	"rcx",
	"rdx",
	"rbx",
	"rsp",
	"rbp",
	"rsi",
	"rdi",
#else
	"eax",
	"ecx",
	"edx",
	"ebx",
	"esp",
	"ebp",
	"esi",
	"edi",
#endif
	"ah",
	"ch",
	"dh",
	"bh",
	"r8",
	"r9",
	"r10",
	"r11",
	"r12",
	"r13",
	"r14",
	"r15",
};

char * x86_sizes[] = {
	"b", "w", "d", "q"
};

void jmp_nocheck(code_info *code, code_ptr dest)
{
	code_ptr out = code->cur;
	ptrdiff_t disp = dest-(out+2);
	if (disp <= 0x7F && disp >= -0x80) {
		*(out++) = OP_JMP_BYTE;
		*(out++) = disp;
	} else {
		disp = dest-(out+5);
		if (disp <= 0x7FFFFFFF && disp >= -2147483648) {
			*(out++) = OP_JMP;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
		} else {
			fatal_error("jmp: %p - %p = %l which is out of range of a 32-bit displacementX\n", dest, out + 6, (long)disp);
		}
	}
	code->cur = out;
}

void check_alloc_code(code_info *code, uint32_t inst_size)
{
	if (code->cur + inst_size > code->last) {
		size_t size = CODE_ALLOC_SIZE;
		code_ptr next_code = alloc_code(&size);
		if (!next_code) {
			fatal_error("Failed to allocate memory for generated code\n");
		}
		if (next_code != code->last + RESERVE_WORDS) {
			//new chunk is not contiguous with the current one
			jmp_nocheck(code, next_code);
			code->cur = next_code;
			code->last = next_code + size/sizeof(RESERVE_WORDS);
		}
		code->last = next_code + size/sizeof(code_word) - RESERVE_WORDS;
	}
}

void x86_rr_sizedir(code_info *code, uint16_t opcode, uint8_t src, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_B && dst >= RSP && dst <= RDI) {
		opcode |= BIT_DIR;
		tmp = dst;
		dst = src;
		src = tmp;
	}
	if (size == SZ_Q || src >= R8 || dst >= R8 || (size == SZ_B && src >= RSP && src <= RDI)) {
#ifdef X86_64
		*out = PRE_REX;
		if (src >= AH && src <= BH || dst >= AH && dst <= BH) {
			fatal_error("attempt to use *H reg in an instruction requiring REX prefix. opcode = %X\n", opcode);
		}
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_REG_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X, src: %s, dst: %s, size: %s\n", opcode, x86_reg_names[src], x86_reg_names[dst], x86_sizes[size]);
#endif
	}
	if (size == SZ_B) {
		if (src >= AH && src <= BH) {
			src -= (AH-X86_AH);
		}
		if (dst >= AH && dst <= BH) {
			dst -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	if (opcode >= 0x100) {
		*(out++) = opcode >> 8;
		*(out++) = opcode;
	} else {
		*(out++) = opcode;
	}
	*(out++) = MODE_REG_DIRECT | dst | (src << 3);
	code->cur = out;
}

void x86_rrdisp_sizedir(code_info *code, uint16_t opcode, uint8_t reg, uint8_t base, int32_t disp, uint8_t size, uint8_t dir)
{
	check_alloc_code(code, 10);
	code_ptr out = code->cur;
	//TODO: Deal with the fact that AH, BH, CH and DH can only be in the R/M param when there's a REX prefix
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || reg >= R8 || base >= R8 || (size == SZ_B && reg >= RSP && reg <= RDI)) {
#ifdef X86_64
		*out = PRE_REX;
		if (reg >= AH && reg <= BH) {
			fatal_error("attempt to use *H reg in an instruction requiring REX prefix. opcode = %X\n", opcode);
		}
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (reg >= R8) {
			*out |= REX_REG_FIELD;
			reg -= (R8 - X86_R8);
		}
		if (base >= R8) {
			*out |= REX_RM_FIELD;
			base -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X, reg: %s, base: %s, size: %s\n", opcode, x86_reg_names[reg], x86_reg_names[base], x86_sizes[size]);
#endif
	}
	if (size == SZ_B) {
		if (reg >= AH && reg <= BH) {
			reg -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	opcode |= dir;
	if (opcode >= 0x100) {
		*(out++) = opcode >> 8;
		*(out++) = opcode;
	} else {
		*(out++) = opcode;
	}
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | base | (reg << 3);
	} else {
		*(out++) = MODE_REG_DISPLACE32 | base | (reg << 3);
	}
	if (base == RSP) {
		//add SIB byte, with no index and RSP as base
		*(out++) = (RSP << 3) | RSP;
	}
	*(out++) = disp;
	if (disp >= 128 || disp < -128) {
	*(out++) = disp >> 8;
	*(out++) = disp >> 16;
	*(out++) = disp >> 24;
	}
	code->cur = out;
}

void x86_rrind_sizedir(code_info *code, uint8_t opcode, uint8_t reg, uint8_t base, uint8_t size, uint8_t dir)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	//TODO: Deal with the fact that AH, BH, CH and DH can only be in the R/M param when there's a REX prefix
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || reg >= R8 || base >= R8 || (size == SZ_B && reg >= RSP && reg <= RDI)) {
#ifdef X86_64
		*out = PRE_REX;
		if (reg >= AH && reg <= BH) {
			fatal_error("attempt to use *H reg in an instruction requiring REX prefix. opcode = %X\n", opcode);
		}
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (reg >= R8) {
			*out |= REX_REG_FIELD;
			reg -= (R8 - X86_R8);
		}
		if (base >= R8) {
			*out |= REX_RM_FIELD;
			base -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X, reg: %s, base: %s, size: %s\n", opcode, x86_reg_names[reg], x86_reg_names[base], x86_sizes[size]);
#endif
	}
	if (size == SZ_B) {
		if (reg >= AH && reg <= BH) {
			reg -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode | dir;
	if (base == RBP) {
		//add a dummy 8-bit displacement since MODE_REG_INDIRECT with
		//an R/M field of RBP selects RIP, relative addressing
		*(out++) = MODE_REG_DISPLACE8 | base | (reg << 3);
		*(out++) = 0;
	} else {
	*(out++) = MODE_REG_INDIRECT | base | (reg << 3);
	if (base == RSP) {
		//add SIB byte, with no index and RSP as base
		*(out++) = (RSP << 3) | RSP;
	}
	}
	code->cur = out;
}

void x86_rrindex_sizedir(code_info *code, uint8_t opcode, uint8_t reg, uint8_t base, uint8_t index, uint8_t scale, uint8_t size, uint8_t dir)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	//TODO: Deal with the fact that AH, BH, CH and DH can only be in the R/M param when there's a REX prefix
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || reg >= R8 || base >= R8 || (size == SZ_B && reg >= RSP && reg <= RDI)) {
#ifdef X86_64
		*out = PRE_REX;
		if (reg >= AH && reg <= BH) {
			fatal_error("attempt to use *H reg in an instruction requiring REX prefix. opcode = %X\n", opcode);
		}
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (reg >= R8) {
			*out |= REX_REG_FIELD;
			reg -= (R8 - X86_R8);
		}
		if (base >= R8) {
			*out |= REX_RM_FIELD;
			base -= (R8 - X86_R8);
		}
		if (index >= R8) {
			*out |= REX_SIB_FIELD;
			index -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X, reg: %s, base: %s, size: %s\n", opcode, x86_reg_names[reg], x86_reg_names[base], x86_sizes[size]);
#endif
	}
	if (size == SZ_B) {
		if (reg >= AH && reg <= BH) {
			reg -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode | dir;
	*(out++) = MODE_REG_INDIRECT | RSP | (reg << 3);
	if (scale == 4) {
		scale = 2;
	} else if(scale == 8) {
			scale = 3;
	} else {
		scale--;
	}
	*(out++) = scale << 6 | (index << 3) | base;
	code->cur = out;
}

void x86_r_size(code_info *code, uint8_t opcode, uint8_t opex, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 4);
	code_ptr out = code->cur;
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
#ifdef X86_64
		*out = PRE_REX;
		if (dst >= AH && dst <= BH) {
			fatal_error("attempt to use *H reg in an instruction requiring REX prefix. opcode = %X\n", opcode);
		}
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X:%X, reg: %s, size: %s\n", opcode, opex, x86_reg_names[dst], x86_sizes[size]);
#endif
	}
	if (size == SZ_B) {
		if (dst >= AH && dst <= BH) {
			dst -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode;
	*(out++) = MODE_REG_DIRECT | dst | (opex << 3);
	code->cur = out;
}

void x86_rdisp_size(code_info *code, uint8_t opcode, uint8_t opex, uint8_t dst, int32_t disp, uint8_t size)
{
	check_alloc_code(code, 7);
	code_ptr out = code->cur;
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
#ifdef X86_64
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X:%X, reg: %s, size: %s\n", opcode, opex, x86_reg_names[dst], x86_sizes[size]);
#endif
	}
	if (size != SZ_B) {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode;
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst | (opex << 3);
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst | (opex << 3);
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}
	code->cur = out;
}

void x86_ir(code_info *code, uint8_t opcode, uint8_t op_ex, uint8_t al_opcode, int32_t val, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 8);
	code_ptr out = code->cur;
	uint8_t sign_extend = 0;
	if (opcode != OP_NOT_NEG && (size == SZ_D || size == SZ_Q) && val <= 0x7F && val >= -0x80) {
		sign_extend = 1;
		opcode |= BIT_DIR;
	}
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (dst == RAX && !sign_extend) {
		if (size != SZ_B) {
			al_opcode |= BIT_SIZE;
			if (size == SZ_Q) {
#ifdef X86_64
				*out = PRE_REX | REX_QUAD;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X, reg: %s, size: %s\n", al_opcode, x86_reg_names[dst], x86_sizes[size]);
#endif
			}
		}
		*(out++) = al_opcode | BIT_IMMED_RAX;
	} else {
		if (size == SZ_Q || dst >= R8 || (size == SZ_B && dst >= RSP && dst <= RDI)) {
#ifdef X86_64
			*out = PRE_REX;
			if (size == SZ_Q) {
				*out |= REX_QUAD;
			}
			if (dst >= R8) {
				*out |= REX_RM_FIELD;
				dst -= (R8 - X86_R8);
			}
			out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X:%X, reg: %s, size: %s\n", opcode, op_ex, x86_reg_names[dst], x86_sizes[size]);
#endif
		}
		if (dst >= AH && dst <= BH) {
			dst -= (AH-X86_AH);
		}
		if (size != SZ_B) {
			opcode |= BIT_SIZE;
		}
		*(out++) = opcode;
		*(out++) = MODE_REG_DIRECT | dst | (op_ex << 3);
	}
	*(out++) = val;
	if (size != SZ_B && !sign_extend) {
		val >>= 8;
		*(out++) = val;
		if (size != SZ_W) {
			val >>= 8;
			*(out++) = val;
			val >>= 8;
			*(out++) = val;
		}
	}
	code->cur = out;
}

void x86_irdisp(code_info *code, uint8_t opcode, uint8_t op_ex, int32_t val, uint8_t dst, int32_t disp, uint8_t size)
{
	check_alloc_code(code, 12);
	code_ptr out = code->cur;
	uint8_t sign_extend = 0;
	if ((size == SZ_D || size == SZ_Q) && val <= 0x7F && val >= -0x80) {
		sign_extend = 1;
		opcode |= BIT_DIR;
	}
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}

	if (size == SZ_Q || dst >= R8) {
#ifdef X86_64
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
#else
		fatal_error("Instruction requires REX prefix but this is a 32-bit build | opcode: %X:%X, reg: %s, size: %s\n", opcode, op_ex, x86_reg_names[dst], x86_sizes[size]);
#endif
	}
	if (size != SZ_B) {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode;
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst | (op_ex << 3);
	*(out++) = disp;
	} else {
	*(out++) = MODE_REG_DISPLACE32 | dst | (op_ex << 3);
	*(out++) = disp;
	disp >>= 8;
	*(out++) = disp;
	disp >>= 8;
	*(out++) = disp;
	disp >>= 8;
	*(out++) = disp;
	}
	*(out++) = val;
	if (size != SZ_B && !sign_extend) {
		val >>= 8;
		*(out++) = val;
		if (size != SZ_W) {
			val >>= 8;
			*(out++) = val;
			val >>= 8;
			*(out++) = val;
		}
	}
	code->cur = out;
}

void x86_shiftrot_ir(code_info *code, uint8_t op_ex, uint8_t val, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || (size == SZ_B && dst >= RSP && dst <= RDI)) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}

	*(out++) = (val == 1 ? OP_SHIFTROT_1: OP_SHIFTROT_IR) | (size == SZ_B ? 0 : BIT_SIZE);
	*(out++) = MODE_REG_DIRECT | dst | (op_ex << 3);
	if (val != 1) {
		*(out++) = val;
	}
	code->cur = out;
}

void x86_shiftrot_irdisp(code_info *code, uint8_t op_ex, uint8_t val, uint8_t dst, int32_t disp, uint8_t size)
{
	check_alloc_code(code, 9);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}

	*(out++) = (val == 1 ? OP_SHIFTROT_1: OP_SHIFTROT_IR) | (size == SZ_B ? 0 : BIT_SIZE);
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst | (op_ex << 3);
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst | (op_ex << 3);
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}
	if (val != 1) {
		*(out++) = val;
	}
	code->cur = out;
}

void x86_shiftrot_clr(code_info *code, uint8_t op_ex, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 4);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || (size == SZ_B && dst >= RSP && dst <= RDI)) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}

	*(out++) = OP_SHIFTROT_CL | (size == SZ_B ? 0 : BIT_SIZE);
	*(out++) = MODE_REG_DIRECT | dst | (op_ex << 3);
	code->cur = out;
}

void x86_shiftrot_clrdisp(code_info *code, uint8_t op_ex, uint8_t dst, int32_t disp, uint8_t size)
{
	check_alloc_code(code, 8);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}

	*(out++) = OP_SHIFTROT_CL | (size == SZ_B ? 0 : BIT_SIZE);
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst | (op_ex << 3);
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst | (op_ex << 3);
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
}
	code->cur = out;
}

void rol_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_ROL, val, dst, size);
}

void ror_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_ROR, val, dst, size);
}

void rcl_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_RCL, val, dst, size);
}

void rcr_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_RCR, val, dst, size);
}

void shl_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_SHL, val, dst, size);
}

void shr_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_SHR, val, dst, size);
}

void sar_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	x86_shiftrot_ir(code, OP_EX_SAR, val, dst, size);
}

void rol_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_ROL, val, dst_base, disp, size);
}

void ror_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_ROR, val, dst_base, disp, size);
}

void rcl_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_RCL, val, dst_base, disp, size);
}

void rcr_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_RCR, val, dst_base, disp, size);
}

void shl_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_SHL, val, dst_base, disp, size);
}

void shr_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_SHR, val, dst_base, disp, size);
}

void sar_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_irdisp(code, OP_EX_SAR, val, dst_base, disp, size);
}

void rol_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_ROL, dst, size);
}

void ror_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_ROR, dst, size);
}

void rcl_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_RCL, dst, size);
}

void rcr_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_RCR, dst, size);
}

void shl_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_SHL, dst, size);
}

void shr_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_SHR, dst, size);
}

void sar_clr(code_info *code, uint8_t dst, uint8_t size)
{
	x86_shiftrot_clr(code, OP_EX_SAR, dst, size);
}

void rol_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_ROL, dst_base, disp, size);
}

void ror_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_ROR, dst_base, disp, size);
}

void rcl_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_RCL, dst_base, disp, size);
}

void rcr_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_RCR, dst_base, disp, size);
}

void shl_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_SHL, dst_base, disp, size);
}

void shr_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_SHR, dst_base, disp, size);
}

void sar_clrdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_shiftrot_clrdisp(code, OP_EX_SAR, dst_base, disp, size);
}

void add_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_ADD, src, dst, size);
}

void add_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_ADDI, OP_ADD, val, dst, size);
}

void add_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_ADDI, val, dst_base, disp, size);
}

void add_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_ADD, src, dst_base, disp, size, 0);
}

void add_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_ADD, dst, src_base, disp, size, BIT_DIR);
}

void adc_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_ADC, src, dst, size);
}

void adc_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_ADCI, OP_ADC, val, dst, size);
}

void adc_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_ADCI, val, dst_base, disp, size);
}

void adc_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_ADC, src, dst_base, disp, size, 0);
}

void adc_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_ADC, dst, src_base, disp, size, BIT_DIR);
}

void or_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_OR, src, dst, size);
}
void or_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_ORI, OP_OR, val, dst, size);
}

void or_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_ORI, val, dst_base, disp, size);
}

void or_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_OR, src, dst_base, disp, size, 0);
}

void or_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_OR, dst, src_base, disp, size, BIT_DIR);
}

void and_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_AND, src, dst, size);
}

void and_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_ANDI, OP_AND, val, dst, size);
}

void and_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_ANDI, val, dst_base, disp, size);
}

void and_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_AND, src, dst_base, disp, size, 0);
}

void and_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_AND, dst, src_base, disp, size, BIT_DIR);
}

void xor_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_XOR, src, dst, size);
}

void xor_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_XORI, OP_XOR, val, dst, size);
}

void xor_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_XORI, val, dst_base, disp, size);
}

void xor_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_XOR, src, dst_base, disp, size, 0);
}

void xor_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_XOR, dst, src_base, disp, size, BIT_DIR);
}

void sub_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_SUB, src, dst, size);
}

void sub_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_SUBI, OP_SUB, val, dst, size);
}

void sub_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_SUBI, val, dst_base, disp, size);
}

void sub_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_SUB, src, dst_base, disp, size, 0);
}

void sub_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_SUB, dst, src_base, disp, size, BIT_DIR);
}

void sbb_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_SBB, src, dst, size);
}

void sbb_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_SBBI, OP_SBB, val, dst, size);
}

void sbb_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_SBBI, val, dst_base, disp, size);
}

void sbb_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_SBB, src, dst_base, disp, size, 0);
}

void sbb_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_SBB, dst, src_base, disp, size, BIT_DIR);
}

void cmp_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_CMP, src, dst, size);
}

void cmp_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_IMMED_ARITH, OP_EX_CMPI, OP_CMP, val, dst, size);
}

void cmp_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_IMMED_ARITH, OP_EX_CMPI, val, dst_base, disp, size);
}

void cmp_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_CMP, src, dst_base, disp, size, 0);
}

void cmp_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_CMP, dst, src_base, disp, size, BIT_DIR);
}

void test_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_TEST, src, dst, size);
}

void test_ir(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	x86_ir(code, OP_NOT_NEG, OP_EX_TEST_I, OP_TEST, val, dst, size);
}

void test_irdisp(code_info *code, int32_t val, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_irdisp(code, OP_NOT_NEG, OP_EX_TEST_I, val, dst_base, disp, size);
}

void test_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_TEST, src, dst_base, disp, size, 0);
}

void test_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_TEST, dst, src_base, disp, size, BIT_DIR);
}

void imul_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP2_IMUL | (PRE_2BYTE << 8), dst, src, size);
}

void imul_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP2_IMUL | (PRE_2BYTE << 8), dst, src_base, disp, size, 0);
}

void not_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_NOT, dst, size);
}

void neg_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_NEG, dst, size);
}

void not_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_NOT, dst_base, disp, size);
}

void neg_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_NEG, dst_base, disp, size);
}

void mul_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_MUL, dst, size);
}

void imul_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_IMUL, dst, size);
}

void div_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_DIV, dst, size);
}

void idiv_r(code_info *code, uint8_t dst, uint8_t size)
{
	x86_r_size(code, OP_NOT_NEG, OP_EX_IDIV, dst, size);
}

void mul_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_MUL, dst_base, disp, size);
}

void imul_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_IMUL, dst_base, disp, size);
}

void div_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_DIV, dst_base, disp, size);
}

void idiv_rdisp(code_info *code, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rdisp_size(code, OP_NOT_NEG, OP_EX_IDIV, dst_base, disp, size);
}

void mov_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rr_sizedir(code, OP_MOV, src, dst, size);
}

void mov_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t disp, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_MOV, src, dst_base, disp, size, 0);
}

void mov_rdispr(code_info *code, uint8_t src_base, int32_t disp, uint8_t dst, uint8_t size)
{
	x86_rrdisp_sizedir(code, OP_MOV, dst, src_base, disp, size, BIT_DIR);
}

void mov_rrind(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rrind_sizedir(code, OP_MOV, src, dst, size, 0);
}

void mov_rindr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	x86_rrind_sizedir(code, OP_MOV, dst, src, size, BIT_DIR);
}

void mov_rrindex(code_info *code, uint8_t src, uint8_t dst_base, uint8_t dst_index, uint8_t scale, uint8_t size)
{
	x86_rrindex_sizedir(code, OP_MOV, src, dst_base, dst_index, scale, size, 0);
}

void mov_rindexr(code_info *code, uint8_t src_base, uint8_t src_index, uint8_t scale, uint8_t dst, uint8_t size)
{
	x86_rrindex_sizedir(code, OP_MOV, dst, src_base, src_index, scale, size, BIT_DIR);
}

void mov_ir(code_info *code, int64_t val, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 14);
	code_ptr out = code->cur;
	uint8_t sign_extend = 0;
	if (size == SZ_Q && val <= 0x7FFFFFFF && val >= -2147483648) {
		sign_extend = 1;
	}
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || (size == SZ_B && dst >= RSP && dst <= RDI)) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}
	if (size == SZ_B) {
		*(out++) = OP_MOV_I8R | dst;
	} else if (size == SZ_Q && sign_extend) {
		*(out++) = OP_MOV_IEA | BIT_SIZE;
		*(out++) = MODE_REG_DIRECT | dst;
	} else {
		*(out++) = OP_MOV_IR | dst;
	}
	*(out++) = val;
	if (size != SZ_B) {
		val >>= 8;
		*(out++) = val;
		if (size != SZ_W) {
			val >>= 8;
			*(out++) = val;
			val >>= 8;
			*(out++) = val;
			if (size == SZ_Q && !sign_extend) {
				val >>= 8;
				*(out++) = val;
				val >>= 8;
				*(out++) = val;
				val >>= 8;
				*(out++) = val;
				val >>= 8;
				*(out++) = val;
			}
		}
	}
	code->cur = out;
}

void mov_irdisp(code_info *code, int32_t val, uint8_t dst, int32_t disp, uint8_t size)
{
	check_alloc_code(code, 12);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}
	*(out++) = OP_MOV_IEA | (size == SZ_B ? 0 : BIT_SIZE);
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst;
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst;
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}

	*(out++) = val;
	if (size != SZ_B) {
		val >>= 8;
		*(out++) = val;
		if (size != SZ_W) {
			val >>= 8;
			*(out++) = val;
			val >>= 8;
			*(out++) = val;
		}
	}
	code->cur = out;
}

void mov_irind(code_info *code, int32_t val, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 8);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || (size == SZ_B && dst >= RSP && dst <= RDI)) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (dst >= AH && dst <= BH) {
		dst -= (AH-X86_AH);
	}
	*(out++) = OP_MOV_IEA | (size == SZ_B ? 0 : BIT_SIZE);
	*(out++) = MODE_REG_INDIRECT | dst;

	*(out++) = val;
	if (size != SZ_B) {
		val >>= 8;
		*(out++) = val;
		if (size != SZ_W) {
			val >>= 8;
			*(out++) = val;
			val >>= 8;
			*(out++) = val;
		}
	}
	code->cur = out;
}

void movsx_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t src_size, uint8_t size)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || src >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_RM_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_REG_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (src_size == SZ_D) {
		*(out++) = OP_MOVSXD;
	} else {
		*(out++) = PRE_2BYTE;
		*(out++) = OP2_MOVSX | (src_size == SZ_B ? 0 : BIT_SIZE);
	}
	*(out++) = MODE_REG_DIRECT | src | (dst << 3);
	code->cur = out;
}

void movsx_rdispr(code_info *code, uint8_t src, int32_t disp, uint8_t dst, uint8_t src_size, uint8_t size)
{
	check_alloc_code(code, 12);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || src >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_RM_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_REG_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	if (src_size == SZ_D) {
		*(out++) = OP_MOVSXD;
	} else {
		*(out++) = PRE_2BYTE;
		*(out++) = OP2_MOVSX | (src_size == SZ_B ? 0 : BIT_SIZE);
	}
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | src | (dst << 3);
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | src | (dst << 3);
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}
	code->cur = out;
}

void movzx_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t src_size, uint8_t size)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || src >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_RM_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_REG_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_MOVZX | (src_size == SZ_B ? 0 : BIT_SIZE);
	*(out++) = MODE_REG_DIRECT | src | (dst << 3);
	code->cur = out;
}

void movzx_rdispr(code_info *code, uint8_t src, int32_t disp, uint8_t dst, uint8_t src_size, uint8_t size)
{
	check_alloc_code(code, 9);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8 || src >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_RM_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_REG_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_MOVZX | (src_size == SZ_B ? 0 : BIT_SIZE);
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | src | (dst << 3);
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | src | (dst << 3);
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}
	code->cur = out;
}

void xchg_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 4);
	code_ptr out = code->cur;
	//TODO: Use OP_XCHG_AX when one of the registers is AX, EAX or RAX
	uint8_t tmp;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_B && dst >= RSP && dst <= RDI) {
		tmp = dst;
		dst = src;
		src = tmp;
	}
	if (size == SZ_Q || src >= R8 || dst >= R8 || (size == SZ_B && src >= RSP && src <= RDI)) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_REG_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	uint8_t opcode = OP_XCHG;
	if (size == SZ_B) {
		if (src >= AH && src <= BH) {
			src -= (AH-X86_AH);
		}
		if (dst >= AH && dst <= BH) {
			dst -= (AH-X86_AH);
		}
	} else {
		opcode |= BIT_SIZE;
	}
	*(out++) = opcode;
	*(out++) = MODE_REG_DIRECT | dst | (src << 3);
	code->cur = out;
}

void pushf(code_info *code)
{
	check_alloc_code(code, 1);
	code_ptr out = code->cur;
	*(out++) = OP_PUSHF;
	code->cur = out;
}

void popf(code_info *code)
{
	check_alloc_code(code, 1);
	code_ptr out = code->cur;
	*(out++) = OP_POPF;
	code->cur = out;
}

void push_r(code_info *code, uint8_t reg)
{
	check_alloc_code(code, 2);
	code_ptr out = code->cur;
	if (reg >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		reg -= R8 - X86_R8;
	}
	*(out++) = OP_PUSH | reg;
	code->cur = out;
	code->stack_off += sizeof(void *);
}

void push_rdisp(code_info *code, uint8_t base, int32_t disp)
{
	//This instruction has no explicit size, so we pass SZ_B
	//to avoid any prefixes or bits being set
	x86_rdisp_size(code, OP_SINGLE_EA, OP_EX_PUSH_EA, base, disp, SZ_B);
	code->stack_off += sizeof(void *);
}

void pop_r(code_info *code, uint8_t reg)
{
	check_alloc_code(code, 2);
	code_ptr out = code->cur;
	if (reg >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		reg -= R8 - X86_R8;
	}
	*(out++) = OP_POP | reg;
	code->cur = out;
	code->stack_off -= sizeof(void *);
}

void pop_rind(code_info *code, uint8_t reg)
{
	check_alloc_code(code, 3);
	code_ptr out = code->cur;
	if (reg >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		reg -= R8 - X86_R8;
	}
	*(out++) = PRE_XOP;
	*(out++) = MODE_REG_INDIRECT | reg;
	code->cur = out;
	code->stack_off -=  sizeof(void *);
}

void setcc_r(code_info *code, uint8_t cc, uint8_t dst)
{
	check_alloc_code(code, 4);
	code_ptr out = code->cur;
	if (dst >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		dst -= R8 - X86_R8;
	} else if (dst >= RSP && dst <= RDI) {
		*(out++) = PRE_REX;
	} else if (dst >= AH && dst <= BH) {
		dst -= AH - X86_AH;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_SETCC | cc;
	*(out++) = MODE_REG_DIRECT | dst;
	code->cur = out;
}

void setcc_rind(code_info *code, uint8_t cc, uint8_t dst)
{
	check_alloc_code(code, 4);
	code_ptr out = code->cur;
	if (dst >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		dst -= R8 - X86_R8;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_SETCC | cc;
	*(out++) = MODE_REG_INDIRECT | dst;
	code->cur = out;
}

void setcc_rdisp(code_info *code, uint8_t cc, uint8_t dst, int32_t disp)
{
	check_alloc_code(code, 8);
	code_ptr out = code->cur;
	if (dst >= R8) {
		*(out++) = PRE_REX | REX_RM_FIELD;
		dst -= R8 - X86_R8;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_SETCC | cc;
	if (disp < 128 && disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst;
	*(out++) = disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst;
		*(out++) = disp;
		*(out++) = disp >> 8;
		*(out++) = disp >> 16;
		*(out++) = disp >> 24;
	}
	code->cur = out;
}

void bit_rr(code_info *code, uint8_t op2, uint8_t src, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || src >= R8 || dst >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_REG_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = op2;
	*(out++) = MODE_REG_DIRECT | dst | (src << 3);
	code->cur = out;
}

void bit_rrdisp(code_info *code, uint8_t op2, uint8_t src, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	check_alloc_code(code, 9);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || src >= R8 || dst_base >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (src >= R8) {
			*out |= REX_REG_FIELD;
			src -= (R8 - X86_R8);
		}
		if (dst_base >= R8) {
			*out |= REX_RM_FIELD;
			dst_base -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = op2;
	if (dst_disp < 128 && dst_disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst_base | (src << 3);
	*(out++) = dst_disp;
	} else {
	*(out++) = MODE_REG_DISPLACE32 | dst_base | (src << 3);
	*(out++) = dst_disp;
	*(out++) = dst_disp >> 8;
	*(out++) = dst_disp >> 16;
	*(out++) = dst_disp >> 24;
	}
	code->cur = out;
}

void bit_ir(code_info *code, uint8_t op_ex, uint8_t val, uint8_t dst, uint8_t size)
{
	check_alloc_code(code, 6);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst >= R8) {
			*out |= REX_RM_FIELD;
			dst -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_BTX_I;
	*(out++) = MODE_REG_DIRECT | dst | (op_ex << 3);
	*(out++) = val;
	code->cur = out;
}

void bit_irdisp(code_info *code, uint8_t op_ex, uint8_t val, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	check_alloc_code(code, 10);
	code_ptr out = code->cur;
	if (size == SZ_W) {
		*(out++) = PRE_SIZE;
	}
	if (size == SZ_Q || dst_base >= R8) {
		*out = PRE_REX;
		if (size == SZ_Q) {
			*out |= REX_QUAD;
		}
		if (dst_base >= R8) {
			*out |= REX_RM_FIELD;
			dst_base -= (R8 - X86_R8);
		}
		out++;
	}
	*(out++) = PRE_2BYTE;
	*(out++) = OP2_BTX_I;
	if (dst_disp < 128 && dst_disp >= -128) {
	*(out++) = MODE_REG_DISPLACE8 | dst_base | (op_ex << 3);
	*(out++) = dst_disp;
	} else {
		*(out++) = MODE_REG_DISPLACE32 | dst_base | (op_ex << 3);
		*(out++) = dst_disp;
		*(out++) = dst_disp >> 8;
		*(out++) = dst_disp >> 16;
		*(out++) = dst_disp >> 24;
	}
	*(out++) = val;
	code->cur = out;
}

void bt_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	return bit_rr(code, OP2_BT, src, dst, size);
}

void bt_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_rrdisp(code, OP2_BT, src, dst_base, dst_disp, size);
}

void bt_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	return bit_ir(code, OP_EX_BT, val, dst, size);
}

void bt_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_irdisp(code, OP_EX_BT, val, dst_base, dst_disp, size);
}

void bts_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	return bit_rr(code, OP2_BTS, src, dst, size);
}

void bts_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_rrdisp(code, OP2_BTS, src, dst_base, dst_disp, size);
}

void bts_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	return bit_ir(code, OP_EX_BTS, val, dst, size);
}

void bts_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_irdisp(code, OP_EX_BTS, val, dst_base, dst_disp, size);
}

void btr_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	return bit_rr(code, OP2_BTR, src, dst, size);
}

void btr_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_rrdisp(code, OP2_BTR, src, dst_base, dst_disp, size);
}

void btr_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	return bit_ir(code, OP_EX_BTR, val, dst, size);
}

void btr_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_irdisp(code, OP_EX_BTR, val, dst_base, dst_disp, size);
}

void btc_rr(code_info *code, uint8_t src, uint8_t dst, uint8_t size)
{
	return bit_rr(code, OP2_BTC, src, dst, size);
}

void btc_rrdisp(code_info *code, uint8_t src, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_rrdisp(code, OP2_BTC, src, dst_base, dst_disp, size);
}

void btc_ir(code_info *code, uint8_t val, uint8_t dst, uint8_t size)
{
	return bit_ir(code, OP_EX_BTC, val, dst, size);
}

void btc_irdisp(code_info *code, uint8_t val, uint8_t dst_base, int32_t dst_disp, uint8_t size)
{
	return bit_irdisp(code, OP_EX_BTC, val, dst_base, dst_disp, size);
}

void jcc(code_info *code, uint8_t cc, code_ptr dest)
{
	check_alloc_code(code, 6);
	code_ptr out = code->cur;
	ptrdiff_t disp = dest-(out+2);
	if (disp <= 0x7F && disp >= -0x80) {
		*(out++) = OP_JCC | cc;
		*(out++) = disp;
	} else {
		disp = dest-(out+6);
		if (disp <= 0x7FFFFFFF && disp >= -2147483648) {
			*(out++) = PRE_2BYTE;
			*(out++) = OP2_JCC | cc;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
		} else {
			fatal_error("jcc: %p - %p = %lX which is out of range for a 32-bit displacement\n", dest, out + 6, (long)disp);
		}
	}
	code->cur = out;
}

void jmp(code_info *code, code_ptr dest)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	ptrdiff_t disp = dest-(out+2);
	if (disp <= 0x7F && disp >= -0x80) {
		*(out++) = OP_JMP_BYTE;
		*(out++) = disp;
	} else {
		disp = dest-(out+5);
		if (disp <= 0x7FFFFFFF && disp >= -2147483648) {
			*(out++) = OP_JMP;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
			disp >>= 8;
			*(out++) = disp;
		} else {
			fatal_error("jmp: %p - %p = %lX which is out of range for a 32-bit displacement\n", dest, out + 6, (long)disp);
		}
	}
	code->cur = out;
}

void jmp_r(code_info *code, uint8_t dst)
{
	check_alloc_code(code, 3);
	code_ptr out = code->cur;
	if (dst >= R8) {
		dst -= R8 - X86_R8;
		*(out++) = PRE_REX | REX_RM_FIELD;
	}
	*(out++) = OP_SINGLE_EA;
	*(out++) = MODE_REG_DIRECT | dst | (OP_EX_JMP_EA << 3);
	code->cur = out;
}

void jmp_rind(code_info *code, uint8_t dst)
{
	check_alloc_code(code, 3);
	code_ptr out = code->cur;
	if (dst >= R8) {
		dst -= R8 - X86_R8;
		*(out++) = PRE_REX | REX_RM_FIELD;
	}
	*(out++) = OP_SINGLE_EA;
	*(out++) = MODE_REG_INDIRECT | dst | (OP_EX_JMP_EA << 3);
	code->cur = out;
}

void call_noalign(code_info *code, code_ptr fun)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	ptrdiff_t disp = fun-(out+5);
	if (disp <= 0x7FFFFFFF && disp >= -2147483648) {
		*(out++) = OP_CALL;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
	} else {
		//TODO: Implement far call???
		fatal_error("call: %p - %p = %lX which is out of range for a 32-bit displacement\n", fun, out + 5, (long)disp);
	}
	code->cur = out;
}


void call(code_info *code, code_ptr fun)
{
	code->stack_off += sizeof(void *);
	int32_t adjust = 0;
	if (code->stack_off & 0xF) {
		adjust = 16 - (code->stack_off & 0xF);
		code->stack_off += adjust;
		sub_ir(code, adjust, RSP, SZ_PTR);
	}
	call_noalign(code, fun);
	if (adjust) {
		add_ir(code, adjust, RSP, SZ_PTR);
	}
	code->stack_off -= sizeof(void *) + adjust;
}
void call_raxfallback(code_info *code, code_ptr fun)
{
	check_alloc_code(code, 5);
	code_ptr out = code->cur;
	ptrdiff_t disp = fun-(out+5);
	if (disp <= 0x7FFFFFFF && disp >= -2147483648) {
		*(out++) = OP_CALL;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
		disp >>= 8;
		*(out++) = disp;
	} else {
		mov_ir(code, (int64_t)fun, RAX, SZ_PTR);
		call_r(code, RAX);
	}
	code->cur = out;
}

void call_r(code_info *code, uint8_t dst)
{
	code->stack_off += sizeof(void *);
	int32_t adjust = 0;
	if (code->stack_off & 0xF) {
		adjust = 16 - (code->stack_off & 0xF);
		code->stack_off += adjust;
		sub_ir(code, adjust, RSP, SZ_PTR);
	}
	check_alloc_code(code, 2);
	code_ptr out = code->cur;
	*(out++) = OP_SINGLE_EA;
	*(out++) = MODE_REG_DIRECT | dst | (OP_EX_CALL_EA << 3);
	code->cur = out;
	if (adjust) {
		add_ir(code, adjust, RSP, SZ_PTR);
	}
	code->stack_off -= sizeof(void *) + adjust;
}

void retn(code_info *code)
{
	check_alloc_code(code, 1);
	code_ptr out = code->cur;
	*(out++) = OP_RETN;
	code->cur = out;
}

void cdq(code_info *code)
{
	check_alloc_code(code, 1);
	code_ptr out = code->cur;
	*(out++) = OP_CDQ;
	code->cur = out;
}

void loop(code_info *code, code_ptr dst)
{
	check_alloc_code(code, 2);
	code_ptr out = code->cur;
	ptrdiff_t disp = dst-(out+2);
	*(out++) = OP_LOOP;
	*(out++) = disp;
	code->cur = out;
}

uint32_t prep_args(code_info *code, uint32_t num_args, va_list args)
{
	uint8_t *arg_arr = malloc(num_args);
	for (int i = 0; i < num_args; i ++)
	{
		arg_arr[i] = va_arg(args, int);
	}
#ifdef X86_64
	uint32_t stack_args = 0;
	uint8_t abi_regs[] = {RDI, RSI, RDX, RCX, R8, R9};
	int8_t reg_swap[R15+1];
	uint32_t usage = 0;
	memset(reg_swap, -1, sizeof(reg_swap));
	for (int i = 0; i < num_args; i ++)
	{
		usage |= 1 << arg_arr[i];
	}
	for (int i = 0; i < num_args; i ++)
	{
		uint8_t reg_arg = arg_arr[i];
		if (i < sizeof(abi_regs)) {
			if (reg_swap[reg_arg] >= 0) {
				reg_arg = reg_swap[reg_arg];
			}
			if (reg_arg != abi_regs[i]) {
				if (usage & (1 << abi_regs[i])) {
					xchg_rr(code, reg_arg, abi_regs[i], SZ_PTR);
					reg_swap[abi_regs[i]] = reg_arg;
				} else {
					mov_rr(code, reg_arg, abi_regs[i], SZ_PTR);
				}
			}
		} else {
			arg_arr[stack_args++] = reg_arg;
		}
	}
#else
#define stack_args num_args
#endif
	uint32_t stack_off_call = code->stack_off + sizeof(void *) * (stack_args + 1);
	uint32_t adjust = 0;
	if (stack_off_call & 0xF) {
		adjust = 16 - (stack_off_call & 0xF);
		sub_ir(code, adjust, RSP, SZ_PTR);
		code->stack_off += adjust;
	}
	for (int i = stack_args -1; i >= 0; i--)
	{
		push_r(code, arg_arr[i]);
	}
	
	return stack_args * sizeof(void *) + adjust;
}

void call_args(code_info *code, code_ptr fun, uint32_t num_args, ...)
{
	va_list args;
	va_start(args, num_args);
	uint32_t adjust = prep_args(code, num_args, args);
	va_end(args);
	call_raxfallback(code, fun);
	if (adjust) {
		add_ir(code, adjust, RSP, SZ_PTR);
		code->stack_off -= adjust;
	}
}
/*
void call_args_abi(code_info *code, code_ptr fun, uint32_t num_args, ...)
{
	va_list args;
	va_start(args, num_args);
	uint32_t adjust = prep_args(code, num_args, args);
	va_end(args);
#ifdef X86_64
	test_ir(code, 8, RSP, SZ_PTR); //check stack alignment
	code_ptr do_adjust_rsp = code->cur + 1;
	jcc(code, CC_NZ, code->cur + 2);
#endif
	call_raxfallback(code, fun);
	if (adjust) {
		add_ir(code, adjust, RSP, SZ_PTR);
	}
#ifdef X86_64
	code_ptr no_adjust_rsp = code->cur + 1;
	jmp(code, code->cur + 2);
	*do_adjust_rsp = code->cur - (do_adjust_rsp+1);
	sub_ir(code, 8, RSP, SZ_PTR);
	call_raxfallback(code, fun);
	add_ir(code, adjust + 8 , RSP, SZ_PTR);
	*no_adjust_rsp = code->cur - (no_adjust_rsp+1);
#endif
}
*/
void save_callee_save_regs(code_info *code)
{
	push_r(code, RBX);
	push_r(code, RBP);
#ifdef X86_64
	push_r(code, R12);
	push_r(code, R13);
	push_r(code, R14);
	push_r(code, R15);
#else
	push_r(code, RDI);
	push_r(code, RSI);
#endif
}

void restore_callee_save_regs(code_info *code)
{
#ifdef X86_64
	pop_r(code, R15);
	pop_r(code, R14);
	pop_r(code, R13);
	pop_r(code, R12);
#else
	pop_r(code, RSI);
	pop_r(code, RDI);
#endif
	pop_r(code, RBP);
	pop_r(code, RBX);
}

uint8_t has_modrm(uint8_t prefix, uint8_t opcode)
{
	if (!prefix) {
		switch (opcode)
		{
		case OP_JMP:
		case OP_JMP_BYTE:
		case OP_JCC:
		case OP_CALL:
		case OP_RETN:
		case OP_LOOP:
		case OP_MOV_I8R:
		case OP_MOV_IR:
		case OP_PUSHF:
		case OP_POPF:
		case OP_PUSH:
		case OP_POP:
		case OP_CDQ:
			return 0;
		}
	} else if (prefix == PRE_2BYTE) {
		switch (opcode)
		{
		case OP2_JCC:
			return 0;
		}
	}
	return 1;
}

uint8_t has_sib(uint8_t mod_rm)
{
	uint8_t mode = mod_rm & 0xC0;
	uint8_t rm = mod_rm & 3;

	return mode != MODE_REG_DIRECT && rm == RSP;
}

uint32_t x86_inst_size(code_ptr start)
{
	code_ptr code = start;
	uint8_t cont = 1;
	uint8_t prefix = 0;
	uint8_t op_size = SZ_B;
	uint8_t main_op;

	while (cont)
	{
		if (*code == PRE_SIZE) {
			op_size = SZ_W;
		} else if (*code == PRE_REX) {
			if (*code & REX_QUAD) {
				op_size = SZ_Q;
			}
		} else if(*code == PRE_2BYTE || *code == PRE_XOP) {
			prefix = *code;
		} else {
			main_op = *code;
			cont = 0;
		}
		code++;
	}
	if (has_modrm(prefix, main_op)) {
		uint8_t mod_rm = *(code++);
		if (has_sib(mod_rm)) {
			//sib takes up a byte, but can't add any additional ones beyond that
			code++;
		}
		uint8_t mode = mod_rm & 0xC0;
		uint8_t rm = mod_rm & 3;
		if (mode == MODE_REG_DISPLACE8) {
			code++;
		} else if (mode == MODE_REG_DISPLACE32 || (mode == MODE_REG_INDIRECT && rm == RBP)) {
			code += 4;
		}
	} else {
	}

	return code-start;
}