Skip to content

Commit 67e1dfc

Browse files
authored
Extended ASM got fussy when using different optimizations. ie. HWDT (#7816)
resets. It seems you should not use input registers for scratch registers. Add an extra output register instead. No code size increase. Light refactoring for readability Added "C" reference code for Extended ASM Save two cycles by loading a0 early in exc-c-wrapper-handler.S Use optimization O2 Net change in size, 0 bytes with optimization. Save 4 bytes w/o Optimization. With changes and "O2" save 3 cycles on write and 6 cycles on read.
1 parent 2f5979f commit 67e1dfc

File tree

2 files changed

+90
-55
lines changed

2 files changed

+90
-55
lines changed

cores/esp8266/core_esp8266_non32xfer.cpp

+87-54
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
*/
1919

2020
/*
21-
* This exception handler, allows for byte or short accesses to iRAM or PROGMEM
22-
* to succeed without causing a crash. It is still preferred to use the xxx_P
23-
* macros whenever possible, since they are probably 30x faster than this
24-
* exception handler method.
21+
* This exception handler handles EXCCAUSE_LOAD_STORE_ERROR. It allows for a
22+
* byte or short access to iRAM or PROGMEM to succeed without causing a crash.
23+
* When reading, it is still preferred to use the xxx_P macros when possible
24+
* since they are probably 30x faster than this exception handler method.
2525
*
2626
* Code taken directly from @pvvx's public domain code in
2727
* https://github.com/pvvx/esp8266web/blob/master/app/sdklib/system/app_main.c
@@ -37,6 +37,16 @@
3737
#include <Schedule.h>
3838
#include <debug.h>
3939

40+
// All of these optimization were tried and now work
41+
// These results were from irammem.ino using GCC 10.2
42+
// DRAM reference uint16 9 AVG cycles/transfer
43+
// #pragma GCC optimize("O0") // uint16, 289 AVG cycles/transfer, IRAM: +180
44+
// #pragma GCC optimize("O1") // uint16, 241 AVG cycles/transfer, IRAM: +16
45+
#pragma GCC optimize("O2") // uint16, 230 AVG cycles/transfer, IRAM: +4
46+
// #pragma GCC optimize("O3") // uint16, 230 AVG cycles/transfer, IRAM: +4
47+
// #pragma GCC optimize("Ofast") // uint16, 230 AVG cycles/transfer, IRAM: +4
48+
// #pragma GCC optimize("Os") // uint16, 233 AVG cycles/transfer, IRAM: 27556 +0
49+
4050
extern "C" {
4151

4252
#define LOAD_MASK 0x00f00fu
@@ -50,32 +60,55 @@ extern "C" {
5060

5161
static fn_c_exception_handler_t old_c_handler = NULL;
5262

53-
static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
63+
static
64+
IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
5465
{
5566
do {
5667
/*
57-
Had to split out some of the asm, compiler was reusing a register that it
58-
needed later. A crash would come or go away with the slightest unrelated
59-
changes elsewhere in the function.
60-
61-
Register a15 was used for epc1, then clobbered for rsr. Maybe an
62-
__asm("":::"memory") before starting the asm would help for these cases.
63-
For this instance moved setting epc1 closer to where it was used.
64-
Edit. "&" on output register would have resolved the problem.
65-
Refactored to reduce and consolidate register usage.
68+
In adapting the public domain version, a crash would come or go away with
69+
the slightest unrelated changes elsewhere in the function. Observed that
70+
register a15 was used for epc1, then clobbered by `rsr.` I now believe a
71+
"&" on the output register would have resolved the problem.
72+
73+
However, I have refactored the Extended ASM to reduce and consolidate
74+
register usage and corrected the issue.
75+
76+
The positioning of the Extended ASM block (as early as possible in the
77+
compiled function) is in part controlled by the immediate need for
78+
output variable `insn`. This placement aids in getting excvaddr read as
79+
early as possible.
6680
*/
67-
uint32_t insn;
68-
__asm(
69-
"movi %0, ~3\n\t" /* prepare a mask for the EPC */
70-
"and %0, %0, %1\n\t" /* apply mask for 32bit aligned base */
71-
"ssa8l %1\n\t" /* set up shift register for src op */
72-
"l32i %1, %0, 0\n\t" /* load part 1 */
73-
"l32i %0, %0, 4\n\t" /* load part 2 */
74-
"src %0, %0, %1\n\t" /* right shift to get faulting instruction */
75-
:"=&r"(insn)
76-
:"r"(ef->epc)
77-
:
78-
);
81+
uint32_t insn, excvaddr;
82+
#if 1
83+
{
84+
uint32_t tmp;
85+
__asm__ (
86+
"rsr.excvaddr %[vaddr]\n\t" /* Read faulting address as early as possible */
87+
"movi.n %[tmp], ~3\n\t" /* prepare a mask for the EPC */
88+
"and %[tmp], %[tmp], %[epc]\n\t" /* apply mask for 32-bit aligned base */
89+
"ssa8l %[epc]\n\t" /* set up shift register for src op */
90+
"l32i %[insn], %[tmp], 0\n\t" /* load part 1 */
91+
"l32i %[tmp], %[tmp], 4\n\t" /* load part 2 */
92+
"src %[insn], %[tmp], %[insn]\n\t" /* right shift to get faulting instruction */
93+
: [vaddr]"=&r"(excvaddr), [insn]"=&r"(insn), [tmp]"=&r"(tmp)
94+
: [epc]"r"(ef->epc) :);
95+
}
96+
97+
#else
98+
{
99+
__asm__ __volatile__ ("rsr.excvaddr %0;" : "=r"(excvaddr):: "memory");
100+
/*
101+
"C" reference code for the ASM to document intent.
102+
May also prove useful when issolating possible issues with Extended ASM,
103+
optimizations, new compilers, etc.
104+
*/
105+
uint32_t epc = ef->epc;
106+
uint32_t *pWord = (uint32_t *)(epc & ~3);
107+
uint64_t big_word = ((uint64_t)pWord[1] << 32) | pWord[0];
108+
uint32_t pos = (epc & 3) * 8;
109+
insn = (uint32_t)(big_word >>= pos);
110+
}
111+
#endif
79112

80113
uint32_t what = insn & LOAD_MASK;
81114
uint32_t valmask = 0;
@@ -102,10 +135,6 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
102135
--regno; /* account for skipped a1 in exception_frame */
103136
}
104137

105-
uint32_t excvaddr;
106-
/* read out the faulting address */
107-
__asm("rsr %0, EXCVADDR;" :"=r"(excvaddr)::);
108-
109138
#ifdef DEBUG_ESP_MMU
110139
/* debug option to validate address so we don't hide memory access bugs in APP */
111140
if (mmu_is_iram((void *)excvaddr) || (is_read && mmu_is_icache((void *)excvaddr))) {
@@ -114,31 +143,34 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
114143
continue; /* fail */
115144
}
116145
#endif
117-
118-
if (is_read) {
119-
/* Load, shift and mask down to correct size */
120-
uint32_t val = (*(uint32_t *)(excvaddr & ~0x3));
121-
val >>= (excvaddr & 0x3) * 8;
122-
val &= valmask;
123-
124-
/* Sign-extend for L16SI, if applicable */
125-
if (what == L16SI_MATCH && (val & 0x8000)) {
126-
val |= 0xffff0000;
146+
{
147+
uint32_t *pWord = (uint32_t *)(excvaddr & ~0x3);
148+
uint32_t pos = (excvaddr & 0x3) * 8;
149+
uint32_t mem_val = *pWord;
150+
151+
if (is_read) {
152+
/* shift and mask down to correct size */
153+
mem_val >>= pos;
154+
mem_val &= valmask;
155+
156+
/* Sign-extend for L16SI, if applicable */
157+
if (what == L16SI_MATCH && (mem_val & 0x8000)) {
158+
mem_val |= 0xffff0000;
159+
}
160+
161+
ef->a_reg[regno] = mem_val; /* carry out the load */
162+
163+
} else { /* is write */
164+
uint32_t val = ef->a_reg[regno]; /* get value to store from register */
165+
val <<= pos;
166+
valmask <<= pos;
167+
val &= valmask;
168+
169+
/* mask out field, and merge */
170+
mem_val &= (~valmask);
171+
mem_val |= val;
172+
*pWord = mem_val; /* carry out the store */
127173
}
128-
129-
ef->a_reg[regno] = val; /* carry out the load */
130-
131-
} else { /* is write */
132-
uint32_t val = ef->a_reg[regno]; /* get value to store from register */
133-
val <<= (excvaddr & 0x3) * 8;
134-
valmask <<= (excvaddr & 0x3) * 8;
135-
val &= valmask;
136-
137-
/* Load, mask out field, and merge */
138-
uint32_t dst_val = (*(uint32_t *)(excvaddr & ~0x3));
139-
dst_val &= (~valmask);
140-
dst_val |= val;
141-
(*(uint32_t *)(excvaddr & ~0x3)) = dst_val; /* carry out the store */
142174
}
143175

144176
ef->epc += 3; /* resume at following instruction */
@@ -201,6 +233,7 @@ static void _set_exception_handler_wrapper(int cause) {
201233
}
202234
}
203235

236+
void install_non32xfer_exception_handler(void) __attribute__((weak));
204237
void install_non32xfer_exception_handler(void) {
205238
if (NULL == old_c_handler) {
206239
// Set the "C" exception handler the wrapper will call

cores/esp8266/exc-c-wrapper-handler.S

+3-1
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ _xtos_c_wrapper_handler:
189189
// Restore special registers
190190
l32i a14, a1, UEXC_sar
191191

192+
// load early - saves two cycles - @mhightower83
193+
movi a0, _xtos_return_from_exc
194+
192195
// For compatibility with Arduino interrupt architecture, we keep interrupts 100%
193196
// disabled.
194197
// /*
@@ -201,7 +204,6 @@ _xtos_c_wrapper_handler:
201204
// rsil a12, 1 // XCHAL_EXCM_LEVEL
202205
rsil a12, 15 // All levels blocked.
203206
wsr a14, SAR
204-
movi a0, _xtos_return_from_exc
205207
jx a0
206208

207209
/* FIXME: what about _GeneralException ? */

0 commit comments

Comments
 (0)