changeset 2350:f8b5142c06aa

Allow 68K to return mid-instruction. Adjust how 68K interrupt ack works so int2 busy flag timing is more correct. Fix some other SCD timing issues
author Michael Pavone <pavone@retrodev.com>
date Mon, 16 Oct 2023 23:30:04 -0700
parents f0fc6c09517d
children 8f3cfb77f1e3
files genesis.c genesis.h lc8951.c lc8951.h m68k_core.c m68k_core.h m68k_core_x86.c segacd.c
diffstat 8 files changed, 241 insertions(+), 120 deletions(-) [+]
line wrap: on
line diff
--- a/genesis.c	Fri Oct 13 22:44:36 2023 -0700
+++ b/genesis.c	Mon Oct 16 23:30:04 2023 -0700
@@ -471,6 +471,49 @@
 #define REFRESH_INTERVAL 128
 #define REFRESH_DELAY 2
 
+void gen_update_refresh(m68k_context *context)
+{
+	uint32_t interval = MCLKS_PER_68K * REFRESH_INTERVAL;
+	genesis_context *gen = context->system;
+	gen->refresh_counter += context->current_cycle - gen->last_sync_cycle;
+	gen->last_sync_cycle = context->current_cycle;
+	context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / interval);
+	gen->refresh_counter = gen->refresh_counter % interval;
+}
+
+void gen_update_refresh_free_access(m68k_context *context)
+{
+	genesis_context *gen = context->system;
+	uint32_t before = context->current_cycle - 4*MCLKS_PER_68K;
+	if (before < gen->last_sync_cycle) {
+		return;
+	}
+	//Add refresh delays for any accesses that happened before the current one
+	gen->refresh_counter += before - gen->last_sync_cycle;
+	uint32_t interval = MCLKS_PER_68K * REFRESH_INTERVAL;
+	uint32_t delay = REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / interval);
+	if (delay) {
+		//To avoid the extra cycles being absorbed in the refresh free update below, we need to update again
+		gen->refresh_counter = gen->refresh_counter % interval;
+		gen->refresh_counter += delay;
+		delay += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / interval);
+		context->current_cycle += delay;
+	}
+	gen->last_sync_cycle = context->current_cycle;
+	//advance refresh counter for the current access, but don't generate delays
+	gen->refresh_counter += 4*MCLKS_PER_68K;
+	gen->refresh_counter = gen->refresh_counter % interval;
+}
+
+void gen_update_refresh_no_wait(m68k_context *context)
+{
+	uint32_t interval = MCLKS_PER_68K * REFRESH_INTERVAL;
+	genesis_context *gen = context->system;
+	gen->refresh_counter += context->current_cycle - gen->last_sync_cycle;
+	gen->last_sync_cycle = context->current_cycle;
+	gen->refresh_counter = gen->refresh_counter % interval;
+}
+
 #include <limits.h>
 #define ADJUST_BUFFER (8*MCLKS_LINE*313)
 #define MAX_NO_ADJUST (UINT_MAX-ADJUST_BUFFER)
@@ -480,12 +523,11 @@
 	genesis_context * gen = context->system;
 	vdp_context * v_context = gen->vdp;
 	z80_context * z_context = gen->z80;
-	//lame estimation of refresh cycle delay
-	gen->refresh_counter += context->current_cycle - gen->last_sync_cycle;
-	if (!gen->bus_busy) {
-		context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / (MCLKS_PER_68K * REFRESH_INTERVAL));
+	if (gen->bus_busy) {
+		gen_update_refresh_no_wait(context);
+	} else {
+		gen_update_refresh(context);
 	}
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
 
 	uint32_t mclks = context->current_cycle;
 	sync_z80(gen, mclks);
@@ -561,11 +603,6 @@
 	gen->frame_end = vdp_cycles_to_frame_end(v_context);
 	context->sync_cycle = gen->frame_end;
 	//printf("Set sync cycle to: %d @ %d, vcounter: %d, hslot: %d\n", context->sync_cycle, context->current_cycle, v_context->vcounter, v_context->hslot);
-	if (context->int_ack) {
-		//printf("acknowledging %d @ %d:%d, vcounter: %d, hslot: %d\n", context->int_ack, context->current_cycle, v_context->cycles, v_context->vcounter, v_context->hslot);
-		vdp_int_ack(v_context);
-		context->int_ack = 0;
-	}
 	if (!address && (gen->header.enter_debugger || gen->header.save_state)) {
 		context->sync_cycle = context->current_cycle + 1;
 	}
@@ -627,7 +664,26 @@
 			context->sync_cycle = context->current_cycle + 1;
 		}
 	}
-	gen->last_sync_cycle = context->current_cycle;
+	return context;
+}
+
+static m68k_context *int_ack(m68k_context *context)
+{
+	genesis_context * gen = context->system;
+	vdp_context * v_context = gen->vdp;
+	//printf("acknowledging %d @ %d:%d, vcounter: %d, hslot: %d\n", context->int_ack, context->current_cycle, v_context->cycles, v_context->vcounter, v_context->hslot);
+	vdp_run_context(v_context, context->current_cycle);
+	vdp_int_ack(v_context);
+	
+	//the Genesis responds to these exclusively with !VPA which means it's a slow
+	//6800 operation. documentation says these can take between 10 and 19 cycles.
+	//actual measurements seem to suggest it's actually between 9 and 18
+	//Base 68K core has added 4 cycles for a normal int ack cycle already
+	//We add 5 + the current cycle count (in 68K cycles) mod 10 to simulate the
+	//additional variable delay from the use of the 6800 cycle
+	uint32_t cycle_count = context->current_cycle / context->options->gen.clock_divider;
+	context->current_cycle += 5 + (cycle_count % 10);
+	
 	return context;
 }
 
@@ -644,10 +700,7 @@
 	//printf("vdp_port write: %X, value: %X, cycle: %d\n", vdp_port, value, context->current_cycle);
 
 	//do refresh check here so we can avoid adding a penalty for a refresh that happens during a VDP access
-	gen->refresh_counter += context->current_cycle - 4*MCLKS_PER_68K - gen->last_sync_cycle;
-	context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / (MCLKS_PER_68K * REFRESH_INTERVAL));
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle;
+	gen_update_refresh_free_access(context);
 
 	sync_components(context, 0);
 	vdp_context *v_context = gen->vdp;
@@ -726,16 +779,14 @@
 		vdp_test_port_write(gen->vdp, value);
 	}
 
-	gen->last_sync_cycle -= 4 * MCLKS_PER_68K;
 	//refresh may have happened while we were waiting on the VDP,
 	//so advance refresh_counter but don't add any delays
 	if (vdp_port >= 4 && vdp_port < 8 && v_context->cycles != before_cycle) {
 		gen->refresh_counter = 0;
+		gen->last_sync_cycle = context->current_cycle;
 	} else {
-		gen->refresh_counter += (context->current_cycle - gen->last_sync_cycle);
-		gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
+		gen_update_refresh_no_wait(context);
 	}
-	gen->last_sync_cycle = context->current_cycle;
 	return context;
 }
 
@@ -785,10 +836,7 @@
 	uint16_t value;
 
 	//do refresh check here so we can avoid adding a penalty for a refresh that happens during a VDP access
-	gen->refresh_counter += context->current_cycle - 4*MCLKS_PER_68K - gen->last_sync_cycle;
-	context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / (MCLKS_PER_68K * REFRESH_INTERVAL));
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle;
+	gen_update_refresh_free_access(context);
 
 	sync_components(context, 0);
 	vdp_context * v_context = gen->vdp;
@@ -816,12 +864,9 @@
 		gen->bus_busy = 0;
 	}
 
-	gen->last_sync_cycle -= 4 * MCLKS_PER_68K;
 	//refresh may have happened while we were waiting on the VDP,
 	//so advance refresh_counter but don't add any delays
-	gen->refresh_counter += (context->current_cycle - gen->last_sync_cycle);
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle;
+	gen_update_refresh_no_wait(context);
 	return value;
 }
 
@@ -882,10 +927,7 @@
 	genesis_context * gen = context->system;
 
 	//do refresh check here so we can avoid adding a penalty for a refresh that happens during an IO area access
-	gen->refresh_counter += context->current_cycle - 4*MCLKS_PER_68K - gen->last_sync_cycle;
-	context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / (MCLKS_PER_68K * REFRESH_INTERVAL));
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle - 4*MCLKS_PER_68K;
+	gen_update_refresh_free_access(context);
 
 	if (location < 0x10000) {
 		//Access to Z80 memory incurs a one 68K cycle wait state
@@ -1014,8 +1056,7 @@
 	}
 
 	//no refresh delays during IO access
-	gen->refresh_counter += context->current_cycle - gen->last_sync_cycle;
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
+	gen_update_refresh_no_wait(context);
 	return context;
 }
 
@@ -1041,10 +1082,7 @@
 	genesis_context *gen = context->system;
 
 	//do refresh check here so we can avoid adding a penalty for a refresh that happens during an IO area access
-	gen->refresh_counter += context->current_cycle - 4*MCLKS_PER_68K - gen->last_sync_cycle;
-	context->current_cycle += REFRESH_DELAY * MCLKS_PER_68K * (gen->refresh_counter / (MCLKS_PER_68K * REFRESH_INTERVAL));
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle - 4*MCLKS_PER_68K;
+	gen_update_refresh_free_access(context);
 
 	if (location < 0x10000) {
 		//Access to Z80 memory incurs a one 68K cycle wait state
@@ -1143,9 +1181,7 @@
 	}
 
 	//no refresh delays during IO access
-	gen->refresh_counter += context->current_cycle - gen->last_sync_cycle;
-	gen->refresh_counter = gen->refresh_counter % (MCLKS_PER_68K * REFRESH_INTERVAL);
-	gen->last_sync_cycle = context->current_cycle;
+	gen_update_refresh_no_wait(context);
 	return value;
 }
 
@@ -2327,7 +2363,7 @@
 	info.map = gen->header.info.map = NULL;
 
 	m68k_options *opts = malloc(sizeof(m68k_options));
-	init_m68k_opts(opts, map, map_chunks, MCLKS_PER_68K, sync_components);
+	init_m68k_opts(opts, map, map_chunks, MCLKS_PER_68K, sync_components, int_ack);
 	if (!strcmp(tern_find_ptr_default(model, "tas", "broken"), "broken")) {
 		opts->gen.flags |= M68K_OPT_BROKEN_READ_MODIFY;
 	}
@@ -2400,7 +2436,7 @@
 	uint32_t num_chunks = cd_chunks + base_chunks;
 
 	m68k_options *opts = malloc(sizeof(m68k_options));
-	init_m68k_opts(opts, map, num_chunks, MCLKS_PER_68K, sync_components);
+	init_m68k_opts(opts, map, num_chunks, MCLKS_PER_68K, sync_components, int_ack);
 	//TODO: make this configurable
 	opts->gen.flags |= M68K_OPT_BROKEN_READ_MODIFY;
 	gen->m68k = init_68k_context(opts, NULL);
--- a/genesis.h	Fri Oct 13 22:44:36 2023 -0700
+++ b/genesis.h	Mon Oct 16 23:30:04 2023 -0700
@@ -88,6 +88,7 @@
 genesis_context *alloc_config_genesis_cdboot(system_media *media, uint32_t system_opts, uint8_t force_region);
 void genesis_serialize(genesis_context *gen, serialize_buffer *buf, uint32_t m68k_pc, uint8_t all);
 void genesis_deserialize(deserialize_buffer *buf, genesis_context *gen);
+void gen_update_refresh_free_access(m68k_context *context);
 
 #endif //GENESIS_H_
 
--- a/lc8951.c	Fri Oct 13 22:44:36 2023 -0700
+++ b/lc8951.c	Mon Oct 16 23:30:04 2023 -0700
@@ -293,20 +293,12 @@
 	}
 }
 
-void lc8951_resume_transfer(lc8951 *context, uint32_t cycle)
+void lc8951_resume_transfer(lc8951 *context)
 {
 	if (context->triggered && context->transfer_end == CYCLE_NEVER && (context->ifctrl & BIT_DOUTEN)) {
 		uint16_t transfer_size = context->regs[DBCL] | (context->regs[DBCH] << 8);
-		//HACK!!! Work around Sub CPU running longer than we would like and dragging other components with it
-		uint32_t step_diff = (context->cycle - cycle) / context->clock_step;
-		if (step_diff) {
-			context->cycle -= step_diff * context->clock_step;
-		}
 		context->transfer_end = context->cycle + transfer_size * context->cycles_per_byte;
 		context->next_byte_cycle = context->cycle;
-		if (step_diff) {
-			lc8951_run(context, cycle);
-		}
 	}
 }
 
--- a/lc8951.h	Fri Oct 13 22:44:36 2023 -0700
+++ b/lc8951.h	Mon Oct 16 23:30:04 2023 -0700
@@ -43,7 +43,7 @@
 void lc8951_ar_write(lc8951 *context, uint8_t value);
 void lc8951_write_byte(lc8951 *context, uint32_t cycle, int sector_offset, uint8_t byte);
 uint32_t lc8951_next_interrupt(lc8951 *context);
-void lc8951_resume_transfer(lc8951 *context, uint32_t cycle);
+void lc8951_resume_transfer(lc8951 *context);
 void lc8951_adjust_cycles(lc8951 *context, uint32_t deduction);
 void lc8951_serialize(lc8951 *context, serialize_buffer *buf);
 void lc8951_deserialize(deserialize_buffer *buf, void *vcontext);
--- a/m68k_core.c	Fri Oct 13 22:44:36 2023 -0700
+++ b/m68k_core.c	Mon Oct 16 23:30:04 2023 -0700
@@ -1206,7 +1206,9 @@
 void resume_68k(m68k_context *context)
 {
 	code_ptr addr = context->resume_pc;
-	context->resume_pc = NULL;
+	if (!context->stack_storage_count) {
+		context->resume_pc = NULL;
+	}
 	m68k_options * options = context->options;
 	context->should_return = 0;
 	options->start_context(addr, context);
@@ -1220,6 +1222,8 @@
 		//switching from user to system mode so swap stack pointers
 		context->aregs[8] = context->aregs[7];
 	}
+	context->resume_pc = NULL;
+	context->stack_storage_count = 0;
 	context->status = 0x27;
 	context->aregs[7] = ((uint32_t)reset_vec[0]) << 16 | reset_vec[1];
 	uint32_t address = ((uint32_t)reset_vec[2]) << 16 | reset_vec[3];
--- a/m68k_core.h	Fri Oct 13 22:44:36 2023 -0700
+++ b/m68k_core.h	Mon Oct 16 23:30:04 2023 -0700
@@ -27,6 +27,7 @@
 typedef void (*start_fun)(uint8_t * addr, void * context);
 typedef struct m68k_context m68k_context;
 typedef m68k_context *(*sync_fun)(m68k_context * context, uint32_t address);
+typedef m68k_context *(*int_ack_fun)(m68k_context * context);
 
 typedef struct {
 	code_ptr impl;
@@ -61,7 +62,10 @@
 	code_ptr		set_sr;
 	code_ptr		set_ccr;
 	code_ptr        bp_stub;
+	code_ptr        save_context_scratch;
+	code_ptr        load_context_scratch;
 	sync_fun        sync_components;
+	int_ack_fun     int_ack;
 	code_info       extra_code;
 	movem_fun       *big_movem;
 	uint32_t        num_movem;
@@ -79,7 +83,6 @@
 struct m68k_context {
 	uint8_t         flags[5];
 	uint8_t         status;
-	uint16_t        int_ack;
 	uint32_t        dregs[8];
 	uint32_t        aregs[9];
 	uint32_t		target_cycle; //cycle at which the next synchronization or interrupt occurs
@@ -88,17 +91,22 @@
 	uint32_t        int_cycle;
 	uint32_t        int_num;
 	uint32_t        last_prefetch_address;
+	uint32_t        scratch1;
+	uint32_t        scratch2;
 	uint16_t        *mem_pointers[NUM_MEM_AREAS];
 	code_ptr        resume_pc;
 	code_ptr        reset_handler;
 	m68k_options    *options;
 	void            *system;
+	void            *host_sp_entry;
+	void            *stack_storage[10];
 	m68k_breakpoint *breakpoints;
 	uint32_t        num_breakpoints;
 	uint32_t        bp_storage;
 	uint8_t         int_pending;
 	uint8_t         trace_pending;
 	uint8_t         should_return;
+	uint8_t         stack_storage_count;
 	uint8_t         ram_code_flags[];
 };
 
@@ -108,7 +116,7 @@
 void translate_m68k_stream(uint32_t address, m68k_context * context);
 void start_68k_context(m68k_context * context, uint32_t address);
 void resume_68k(m68k_context *context);
-void init_m68k_opts(m68k_options * opts, memmap_chunk * memmap, uint32_t num_chunks, uint32_t clock_divider, sync_fun sync_components);
+void init_m68k_opts(m68k_options * opts, memmap_chunk * memmap, uint32_t num_chunks, uint32_t clock_divider, sync_fun sync_components, int_ack_fun int_ack);
 m68k_context * init_68k_context(m68k_options * opts, m68k_reset_handler reset_handler);
 void m68k_reset(m68k_context * context);
 void m68k_options_free(m68k_options *opts);
--- a/m68k_core_x86.c	Fri Oct 13 22:44:36 2023 -0700
+++ b/m68k_core_x86.c	Mon Oct 16 23:30:04 2023 -0700
@@ -2584,7 +2584,7 @@
 	call(&native, opts->bp_stub);
 }
 
-void init_m68k_opts(m68k_options * opts, memmap_chunk * memmap, uint32_t num_chunks, uint32_t clock_divider, sync_fun sync_components)
+void init_m68k_opts(m68k_options * opts, memmap_chunk * memmap, uint32_t num_chunks, uint32_t clock_divider, sync_fun sync_components, int_ack_fun int_ack)
 {
 	memset(opts, 0, sizeof(*opts));
 	opts->gen.memmap = memmap;
@@ -2636,6 +2636,7 @@
 	opts->gen.scratch1 = RCX;
 	opts->gen.align_error_mask = 1;
 	opts->sync_components = sync_components;
+	opts->int_ack = int_ack;
 
 
 	opts->gen.native_code_map = malloc(sizeof(native_map_slot) * NATIVE_MAP_CHUNKS);
@@ -2649,6 +2650,9 @@
 	code_info *code = &opts->gen.code;
 	init_code_info(code);
 
+	opts->save_context_scratch = code->cur;
+	mov_rrdisp(code, opts->gen.scratch1, opts->gen.context_reg, offsetof(m68k_context, scratch1), SZ_D);
+	mov_rrdisp(code, opts->gen.scratch2, opts->gen.context_reg, offsetof(m68k_context, scratch2), SZ_D);
 	opts->gen.save_context = code->cur;
 	for (int i = 0; i < 5; i++)
 		if (opts->flag_regs[i] >= 0) {
@@ -2666,6 +2670,9 @@
 	mov_rrdisp(code, opts->gen.cycles, opts->gen.context_reg, offsetof(m68k_context, current_cycle), SZ_D);
 	retn(code);
 
+	opts->load_context_scratch = code->cur;
+	mov_rdispr(code, opts->gen.context_reg, offsetof(m68k_context, scratch1), opts->gen.scratch1, SZ_D);
+	mov_rdispr(code, opts->gen.context_reg, offsetof(m68k_context, scratch2), opts->gen.scratch2, SZ_D);
 	opts->gen.load_context = code->cur;
 	for (int i = 0; i < 5; i++)
 	{
@@ -2699,9 +2706,40 @@
 	mov_rdispr(code, RSP, 20, opts->gen.scratch2, SZ_D);
 	mov_rdispr(code, RSP, 24, opts->gen.context_reg, SZ_D);
 #endif
+	movzx_rdispr(code, opts->gen.context_reg, offsetof(m68k_context, stack_storage_count), opts->gen.scratch1, SZ_B, SZ_D);
+	mov_rrdisp(code, RSP, opts->gen.context_reg, offsetof(m68k_context, host_sp_entry), SZ_PTR);
+	cmp_ir(code, 0, opts->gen.scratch1, SZ_D);
+	code_ptr normal_start = code->cur + 1;
+	jcc(code, CC_Z, normal_start);
+	uint32_t stack_off_save = code->stack_off;
+	mov_rr(code, opts->gen.context_reg, opts->gen.scratch2, SZ_PTR);
+#ifdef X86_64
+	shl_ir(code, 3, opts->gen.scratch1, SZ_D);
+#else
+	shl_ir(code, 2, opts->gen.scratch1, SZ_D);
+#endif
+	add_ir(code, offsetof(m68k_context, stack_storage) - sizeof(void *), opts->gen.scratch2, SZ_PTR);
+	add_rr(code, opts->gen.scratch1, opts->gen.scratch2, SZ_PTR);
+	code_ptr loop_top = code->cur;
+	cmp_ir(code, 0, opts->gen.scratch1, SZ_D);
+	code_ptr loop_bot = code->cur + 1;
+	jcc(code, CC_Z, loop_bot);
+	sub_ir(code, sizeof(void*), opts->gen.scratch1, SZ_D);
+	mov_rindr(code, opts->gen.scratch2, opts->gen.cycles, SZ_PTR);
+	sub_ir(code, sizeof(void*), opts->gen.scratch2, SZ_PTR);
+	push_r(code, opts->gen.cycles);
+	jmp(code, loop_top);
+	*loop_bot = code->cur - (loop_bot + 1);
+	call_noalign(code, opts->load_context_scratch);
+	push_rdisp(code, opts->gen.context_reg, offsetof(m68k_context, resume_pc));
+	retn(code);
+	
+	code->stack_off = stack_off_save;
+	*normal_start = code->cur - (normal_start + 1);
 	call(code, opts->gen.load_context);
 	call_r(code, opts->gen.scratch2);
 	call(code, opts->gen.save_context);
+	mov_irdisp(code, 0, opts->gen.context_reg, offsetof(m68k_context, stack_storage_count), SZ_B);
 	restore_callee_save_regs(code);
 	retn(code);
 
@@ -2733,18 +2771,39 @@
 	code_ptr skip_sync = code->cur + 1;
 	jcc(code, CC_C, code->cur + 2);
 	opts->do_sync = code->cur;
-	push_r(code, opts->gen.scratch1);
-	push_r(code, opts->gen.scratch2);
-	call(code, opts->gen.save_context);
+	call(code, opts->save_context_scratch);
 	xor_rr(code, opts->gen.scratch1, opts->gen.scratch1, SZ_D);
 	call_args_abi(code, (code_ptr)opts->sync_components, 2, opts->gen.context_reg, opts->gen.scratch1);
 	mov_rr(code, RAX, opts->gen.context_reg, SZ_PTR);
-	call(code, opts->gen.load_context);
-	pop_r(code, opts->gen.scratch2);
-	pop_r(code, opts->gen.scratch1);
+	cmp_irdisp(code, 0, RAX, offsetof(m68k_context, should_return), SZ_B);
+	code_ptr do_return = code->cur + 1;
+	jcc(code, CC_NZ, do_return);
+	call(code, opts->load_context_scratch);
 	*skip_sync = code->cur - (skip_sync+1);
 	retn(code);
-
+	stack_off_save = code->stack_off;
+	*do_return = code->cur - (do_return + 1);
+	pop_r(code, opts->gen.scratch1);
+	mov_rrdisp(code, opts->gen.scratch1, opts->gen.context_reg, offsetof(m68k_context, resume_pc), SZ_PTR);
+	mov_rdispr(code, opts->gen.context_reg, offsetof(m68k_context, host_sp_entry), opts->gen.scratch2, SZ_PTR);
+	mov_rr(code, opts->gen.context_reg, opts->aregs[7], SZ_PTR);
+	xor_rr(code, opts->gen.scratch1, opts->gen.scratch1, SZ_B);
+	add_ir(code, offsetof(m68k_context, stack_storage), opts->aregs[7], SZ_PTR);
+	loop_top  = code->cur;
+	cmp_rr(code, opts->gen.scratch2, RSP, SZ_PTR);
+	code_ptr done_stack_save = code->cur + 1;
+	jcc(code, CC_Z, done_stack_save);
+	pop_r(code, opts->gen.cycles);
+	add_ir(code, 1, opts->gen.scratch1, SZ_B);
+	mov_rrind(code, opts->gen.cycles, opts->aregs[7], SZ_PTR);
+	add_ir(code, sizeof(void*), opts->aregs[7], SZ_PTR);
+	jmp(code, loop_top);
+	*done_stack_save = code->cur - (done_stack_save + 1);
+	mov_rrdisp(code, opts->gen.scratch1, opts->gen.context_reg, offsetof(m68k_context, stack_storage_count), SZ_B);
+	restore_callee_save_regs(code);
+	retn(code);
+	code->stack_off = stack_off_save;
+	
 	opts->gen.handle_code_write = (code_ptr)m68k_handle_code_write;
 
 	check_alloc_code(code, 256);
@@ -3107,32 +3166,12 @@
 	areg_to_native(opts, 7, opts->gen.scratch2);
 	call(code, opts->write_16);
 	//interrupt ack cycle
-	//the Genesis responds to these exclusively with !VPA which means its a slow
-	//6800 operation. documentation says these can take between 10 and 19 cycles.
-	//actual results measurements seem to suggest it's actually between 9 and 18
-	//WARNING: this code might break with register assignment changes
-	//save RDX
-	push_r(code, RDX);
-	//save cycle count
-	mov_rr(code, RAX, opts->gen.scratch1, SZ_D);
-	//clear top doubleword of dividend
-	xor_rr(code, RDX, RDX, SZ_D);
-	//set divisor to clock divider
-	mov_ir(code, opts->gen.clock_divider, opts->gen.scratch2, SZ_D);
-	div_r(code, opts->gen.scratch2, SZ_D);
-	//discard remainder
-	xor_rr(code, RDX, RDX, SZ_D);
-	//set divisor to 10, the period of E
-	mov_ir(code, 10, opts->gen.scratch2, SZ_D);
-	div_r(code, opts->gen.scratch2, SZ_D);
-	//delay will be (9 + 4 + the remainder) * clock_divider
-	//the extra 4 is to cover the idle bus period after the ack
-	add_ir(code, 9 + 4, RDX, SZ_D);
-	mov_ir(code, opts->gen.clock_divider, RAX, SZ_D);
-	mul_r(code, RDX, SZ_D);
-	pop_r(code, RDX);
-	//add saved cycle count to result
-	add_rr(code, opts->gen.scratch1, RAX, SZ_D);
+	cycles(&opts->gen, 4); //base interrupt ack cycle count
+	call(code, opts->gen.save_context);
+	call_args_abi(code, (code_ptr)opts->int_ack, 1, opts->gen.context_reg);
+	mov_rr(code, RAX, opts->gen.context_reg, SZ_PTR);
+	call(code, opts->gen.load_context);
+	cycles(&opts->gen, 4); //idle period after int ack
 
 	//update status register
 	and_irdisp(code, 0x78, opts->gen.context_reg, offsetof(m68k_context, status), SZ_B);
@@ -3154,8 +3193,6 @@
 	//grab saved interrupt number
 	xor_rr(code, opts->gen.scratch1, opts->gen.scratch1, SZ_D);
 	mov_rdispr(code, opts->gen.context_reg, offsetof(m68k_context, int_pending), opts->gen.scratch1, SZ_B);
-	//ack the interrupt (happens earlier on hardware, but shouldn't be an observable difference)
-	mov_rrdisp(code, opts->gen.scratch1, opts->gen.context_reg, offsetof(m68k_context, int_ack), SZ_W);
 	//calculate the vector address
 	shl_ir(code, 2, opts->gen.scratch1, SZ_D);
 	add_ir(code, 0x60, opts->gen.scratch1, SZ_D);
--- a/segacd.c	Fri Oct 13 22:44:36 2023 -0700
+++ b/segacd.c	Mon Oct 16 23:30:04 2023 -0700
@@ -11,7 +11,11 @@
 
 #define SCD_MCLKS 50000000
 #define SCD_PERIPH_RESET_CLKS (SCD_MCLKS / 10)
-#define TIMER_TICK_CLKS 1536
+#define TIMER_TICK_CLKS 1536/*1792*/
+
+//TODO: do some logic analyzer captures to get actual values
+#define REFRESH_INTERVAL 259
+#define REFRESH_DELAY 2
 
 enum {
 	GA_SUB_CPU_CTRL,
@@ -632,15 +636,13 @@
 	}
 	context->target_cycle = context->sync_cycle < context->int_cycle ? context->sync_cycle : context->int_cycle;
 	if (context->int_cycle == cdc_cycle && context->int_num == 5) {
-		uint32_t before = context->target_cycle - 2 * cd->cdc.clock_step;
+		uint32_t before = cdc_cycle - cd->m68k->options->gen.clock_divider * 158; //divs worst case
 		if (before < context->target_cycle) {
-			if (before > context->current_cycle) {
+			while (before <= context->current_cycle) {
+				before += cd->cdc.clock_step;
+			}
+			if (before < context->target_cycle) {
 				context->target_cycle = context->sync_cycle = before;
-			} else {
-				before = context->target_cycle - cd->cdc.clock_step;
-				if (before > context->current_cycle) {
-					context->target_cycle = context->sync_cycle = before;
-				}
 			}
 		}
 	}
@@ -650,6 +652,15 @@
 {
 	m68k_context *m68k = vcontext;
 	segacd_context *cd = m68k->system;
+	uint32_t before_cycle = m68k->current_cycle - m68k->options->gen.clock_divider * 4;
+	if (before_cycle >= cd->last_refresh_cycle) {
+		uint32_t num_refresh = (before_cycle - cd->last_refresh_cycle) / REFRESH_INTERVAL;
+		uint32_t num_full = (m68k->current_cycle - cd->last_refresh_cycle) / REFRESH_INTERVAL;
+		cd->last_refresh_cycle = cd->last_refresh_cycle + num_full * REFRESH_INTERVAL;
+		m68k->current_cycle += num_refresh * REFRESH_DELAY;
+	}
+	
+	
 	uint32_t reg = address >> 1;
 	switch (reg)
 	{
@@ -674,7 +685,7 @@
 		if (dst == DST_SUB_CPU) {
 			if (cd->gate_array[GA_CDC_CTRL] & BIT_DSR) {
 				cd->gate_array[GA_CDC_CTRL] &= ~BIT_DSR;
-				lc8951_resume_transfer(&cd->cdc, cd->cdc.cycle);
+				lc8951_resume_transfer(&cd->cdc);
 			}
 			calculate_target_cycle(cd->m68k);
 
@@ -738,6 +749,14 @@
 {
 	m68k_context *m68k = vcontext;
 	segacd_context *cd = m68k->system;
+	uint32_t before_cycle = m68k->current_cycle - m68k->options->gen.clock_divider * 4;
+	if (before_cycle >= cd->last_refresh_cycle) {
+		uint32_t num_refresh = (before_cycle - cd->last_refresh_cycle) / REFRESH_INTERVAL;
+		uint32_t num_full = (m68k->current_cycle - cd->last_refresh_cycle) / REFRESH_INTERVAL;
+		cd->last_refresh_cycle = cd->last_refresh_cycle + num_full * REFRESH_INTERVAL;
+		m68k->current_cycle += num_refresh * REFRESH_DELAY;
+	}
+	
 	uint32_t reg = address >> 1;
 	switch (reg)
 	{
@@ -831,7 +850,7 @@
 				lc8951_set_dma_multiple(&cd->cdc, 6);
 			}
 			if ((old_dest < DST_MAIN_CPU || old_dest == 6) && dest >= DST_MAIN_CPU && dest != 6) {
-				lc8951_resume_transfer(&cd->cdc, m68k->current_cycle);
+				lc8951_resume_transfer(&cd->cdc);
 			}
 			calculate_target_cycle(m68k);
 		}
@@ -878,6 +897,7 @@
 	case GA_TIMER:
 		timers_run(cd, m68k->current_cycle);
 		cd->gate_array[reg] = value & 0xFF;
+		cd->timer_value = 0;
 		calculate_target_cycle(m68k);
 		break;
 	case GA_INT_MASK:
@@ -1119,10 +1139,6 @@
 	rf5c164_run(&cd->pcm, cycle);
 }
 
-//TODO: do some logic analyzer captuers to get actual values
-#define REFRESH_INTERVAL 256
-#define REFRESH_DELAY 2
-
 static m68k_context *sync_components(m68k_context * context, uint32_t address)
 {
 	segacd_context *cd = context->system;
@@ -1146,7 +1162,15 @@
 		}
 		cd->m68k_pc = address;
 	}
-	switch (context->int_ack)
+	calculate_target_cycle(context);
+	return context;
+}
+
+static m68k_context *int_ack(m68k_context *context)
+{	
+	segacd_context *cd = context->system;
+	scd_peripherals_run(cd, context->current_cycle);
+	switch (context->int_pending)
 	{
 	case 1:
 		cd->graphics_int_cycle = CYCLE_NEVER;
@@ -1167,8 +1191,15 @@
 		cd->cdd.subcode_int_pending = 0;
 		break;
 	}
-	context->int_ack = 0;
-	calculate_target_cycle(context);
+	//the Sega CD responds to these exclusively with !VPA which means it's a slow
+	//6800 operation. documentation says these can take between 10 and 19 cycles.
+	//actual measurements seem to suggest it's actually between 9 and 18
+	//Base 68K core has added 4 cycles for a normal int ack cycle already
+	//We add 5 + the current cycle count (in 68K cycles) mod 10 to simulate the
+	//additional variable delay from the use of the 6800 cycle
+	uint32_t cycle_count = context->current_cycle / context->options->gen.clock_divider;
+	context->current_cycle += 5 + (cycle_count % 10);
+	
 	return context;
 }
 
@@ -1177,7 +1208,10 @@
 	uint8_t m68k_run = !can_main_access_prog(cd);
 	while (cycle > cd->m68k->current_cycle) {
 		if (m68k_run && !cd->sub_paused_wordram) {
-			uint32_t start = cd->m68k->current_cycle;
+			uint32_t num_refresh = (cd->m68k->current_cycle - cd->last_refresh_cycle) / REFRESH_INTERVAL;
+			cd->last_refresh_cycle = cd->last_refresh_cycle + num_refresh * REFRESH_INTERVAL;
+			cd->m68k->current_cycle += num_refresh * REFRESH_DELAY;
+
 
 			cd->m68k->sync_cycle = cd->enter_debugger ? cd->m68k->current_cycle + 1 : cycle;
 			if (cd->need_reset) {
@@ -1237,6 +1271,7 @@
 static uint16_t main_gate_read16(uint32_t address, void *vcontext)
 {
 	m68k_context *m68k = vcontext;
+	gen_update_refresh_free_access(m68k);
 	genesis_context *gen = m68k->system;
 	segacd_context *cd = gen->expansion;
 	uint32_t scd_cycle = gen_cycle_to_scd(m68k->current_cycle, gen);
@@ -1270,10 +1305,7 @@
 		if (dst == DST_MAIN_CPU) {
 			if (cd->gate_array[GA_CDC_CTRL] & BIT_DSR) {
 				cd->gate_array[GA_CDC_CTRL] &= ~BIT_DSR;
-				//Using the sub CPU's cycle count here is a bit of a hack
-				//needed to ensure the interrupt does not get triggered prematurely
-				//because the sub CPU execution granularity is too high
-				lc8951_resume_transfer(&cd->cdc, cd->m68k->current_cycle);
+				lc8951_resume_transfer(&cd->cdc);
 			} else {
 				printf("Read of CDC host data with DSR clear at %u\n", scd_cycle);
 			}
@@ -1328,14 +1360,28 @@
 static void *main_gate_write16(uint32_t address, void *vcontext, uint16_t value)
 {
 	m68k_context *m68k = vcontext;
+	gen_update_refresh_free_access(m68k);
 	genesis_context *gen = m68k->system;
 	segacd_context *cd = gen->expansion;
 	uint32_t scd_cycle = gen_cycle_to_scd(m68k->current_cycle, gen);
-	scd_run(cd, scd_cycle);
 	uint32_t reg = (address & 0x1FF) >> 1;
+	if (reg != GA_SUB_CPU_CTRL) {
+		scd_run(cd, scd_cycle);
+	}
 	switch (reg)
 	{
 	case GA_SUB_CPU_CTRL: {
+		if ((value & BIT_IFL2) && (cd->gate_array[GA_INT_MASK] & BIT_MASK_IEN2)) {
+			if (cd->int2_cycle != CYCLE_NEVER) {
+				scd_run(cd, scd_cycle - 4 * cd->m68k->options->gen.clock_divider);
+				while (cd->int2_cycle != CYCLE_NEVER && cd->m68k->current_cycle < scd_cycle) {
+					scd_run(cd, cd->m68k->current_cycle + cd->m68k->options->gen.clock_divider);
+				}
+			}
+			cd->int2_cycle = scd_cycle;
+			
+		}
+		scd_run(cd, scd_cycle);
 		uint8_t old_access = can_main_access_prog(cd);
 		cd->busreq = value & BIT_SBRQ;
 		uint8_t old_reset = cd->reset;
@@ -1343,9 +1389,6 @@
 		if (cd->reset && !old_reset) {
 			cd->need_reset = 1;
 		}
-		if (value & BIT_IFL2) {
-			cd->int2_cycle = scd_cycle;
-		}
 		/*cd->gate_array[reg] &= 0x7FFF;
 		cd->gate_array[reg] |= value & 0x8000;*/
 		uint8_t new_access = can_main_access_prog(cd);
@@ -1362,7 +1405,7 @@
 			dump_prog_ram(cd);
 			uint16_t dst = cd->gate_array[GA_CDC_CTRL] >> 8 & 0x7;
 			if (dst == DST_PROG_RAM) {
-				lc8951_resume_transfer(&cd->cdc, cd->cdc.cycle);
+				lc8951_resume_transfer(&cd->cdc);
 			}
 		}
 		break;
@@ -1395,7 +1438,7 @@
 
 				uint16_t dst = cd->gate_array[GA_CDC_CTRL] >> 8 & 0x7;
 				if (dst == DST_WORD_RAM) {
-					lc8951_resume_transfer(&cd->cdc, cd->cdc.cycle);
+					lc8951_resume_transfer(&cd->cdc);
 				}
 
 				m68k_invalidate_code_range(m68k, cd->base + 0x200000, cd->base + 0x240000);
@@ -1587,7 +1630,7 @@
 	sub_cpu_map[0].buffer = sub_cpu_map[1].buffer = cd->prog_ram;
 	sub_cpu_map[4].buffer = cd->bram;
 	m68k_options *mopts = malloc(sizeof(m68k_options));
-	init_m68k_opts(mopts, sub_cpu_map, sizeof(sub_cpu_map) / sizeof(*sub_cpu_map), 4, sync_components);
+	init_m68k_opts(mopts, sub_cpu_map, sizeof(sub_cpu_map) / sizeof(*sub_cpu_map), 4, sync_components, int_ack);
 	cd->m68k = init_68k_context(mopts, NULL);
 	cd->m68k->system = cd;
 	cd->int2_cycle = CYCLE_NEVER;