- New src/fault_handler.c + include/fault_handler.h:
- HardFault/MemManage/BusFault/UsageFault naked ISR stubs with
Cortex-M7 stack-frame capture (R0-R3, LR, PC, xPSR, CFSR, HFSR,
MMFAR, BFAR, SP) and NVIC_SystemReset()
- .noinit SRAM capture ring survives soft reset; persisted to flash
sector 7 (0x08060000, 8x64-byte slots) on subsequent boot
- MPU Region 0 stack guard (32 B at __stack_end, no-access) ->
MemManage fault detected as FAULT_STACK_OVF
- Brownout detect via RCC_CSR_BORRSTF on boot -> FAULT_BROWNOUT
- Watchdog reset detection delegates to existing watchdog.c
- LED blink codes on LED2 (PC14, active-low) for 10 s post-recovery:
HARDFAULT=3, WATCHDOG=2, BROWNOUT=1, STACK_OVF=4 fast blinks
- fault_led_tick(), fault_log_read(), fault_log_get_count(),
fault_get_last_type(), fault_log_clear(), FAULT_ASSERT() macro
- jlink.h: add JLINK_CMD_FAULT_LOG_GET (0x0F), JLINK_TLM_FAULT_LOG
(0x86), jlink_tlm_fault_log_t (20 bytes), fault_log_req in JLinkState,
jlink_send_fault_log() declaration
- jlink.c: dispatch JLINK_CMD_FAULT_LOG_GET; implement
jlink_send_fault_log() (26-byte CRC16-XModem framed response)
- main.c: call fault_handler_init() first in main(); send fault log
TLM on boot if prior fault recorded; fault_led_tick() in main loop;
handle fault_log_req flag to respond to Jetson queries
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
458 lines
15 KiB
C
458 lines
15 KiB
C
#include "fault_handler.h"
|
||
#include "config.h"
|
||
#include "pid_flash.h"
|
||
#include "stm32f7xx_hal.h"
|
||
#include <string.h>
|
||
#include <stdio.h>
|
||
|
||
/*
|
||
* fault_handler.c — STM32F7 fault detection and recovery (Issue #565)
|
||
*
|
||
* Recovery flow:
|
||
* Fault ISR (naked) → _capture_and_reset() captures registers into .noinit
|
||
* SRAM → sets FAULT_SRAM_MAGIC → NVIC_SystemReset().
|
||
* On next boot: fault_handler_init() sees FAULT_SRAM_MAGIC → persists to
|
||
* flash log → prints CDC dump → starts LED blink code.
|
||
*
|
||
* No flash writes occur inside fault ISRs. All flash operations happen safely
|
||
* in the normal boot context, well before safety_init() / IWDG start.
|
||
*/
|
||
|
||
/* ---- .noinit SRAM (preserved across NVIC_SystemReset) ---- */
|
||
/*
|
||
* GCC startup code only zeroes .bss and initialises .data. Variables in
|
||
* .noinit are left untouched. The magic word guards against cold-boot garbage.
|
||
*/
|
||
#define FAULT_SRAM_MAGIC 0xFADE5A01u
|
||
#define RESET_COUNT_MAGIC 0x1234ABCDu
|
||
|
||
static __attribute__((section(".noinit"))) volatile uint32_t s_fault_magic;
|
||
static __attribute__((section(".noinit"))) volatile fault_log_entry_t s_fault_sram;
|
||
static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count_magic;
|
||
static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count;
|
||
|
||
/* ---- LED blink sequencer ---- */
/*
 * A blink code is a 16-bit mask played MSB-first: bit 15 is step 0, bit 14
 * is step 1, and so on.  One step lasts period_ms milliseconds.  A set bit
 * lights LED2 (PC14), which is wired active-low.
 *
 * Field order matters: the pattern table initialises entries positionally.
 */
typedef struct {
    uint16_t pattern;    /* step mask, 1 = LED lit during that step      */
    uint8_t  steps;      /* how many steps are played before wrapping    */
    uint16_t period_ms;  /* duration of a single step, in milliseconds   */
} LedBlink;
|
||
|
||
/*
|
||
* Pattern table indexed by FaultType (0..8).
|
||
* NONE = silent
|
||
* HARDFAULT = 1010 1010 1010 1010 (3 fast blinks, 100 ms)
|
||
* WATCHDOG = 1111 0000 1111 0000 (2 slow pulses, 150 ms × 8 steps = 1.2 s)
|
||
* BROWNOUT = 1111 1111 0000 0000 (1 long pulse, 100 ms × 16 = 1.6 s)
|
||
* STACK_OVF = 1110 1110 1110 1110 (4 short bursts, 100 ms)
|
||
* BUS_FAULT = 1010 1111 1100 0000 (3+1 pattern)
|
||
* USAGE_FAULT = 1010 0000 0000 0000 (2 fast blinks)
|
||
* MEM_FAULT = 1010 1010 1000 0000 (3 blinks, slower tail)
|
||
* ASSERT = 1101 1011 0000 0000 (SOS-like)
|
||
*/
|
||
static const LedBlink s_blink_table[] = {
|
||
/* FAULT_NONE */ { 0x0000u, 16, 100 },
|
||
/* FAULT_HARDFAULT */ { 0xAAAAu, 16, 100 },
|
||
/* FAULT_WATCHDOG */ { 0xF0F0u, 16, 150 },
|
||
/* FAULT_BROWNOUT */ { 0xFF00u, 16, 100 },
|
||
/* FAULT_STACK_OVF */ { 0xEEEEu, 16, 100 },
|
||
/* FAULT_BUS_FAULT */ { 0xAFC0u, 16, 100 },
|
||
/* FAULT_USAGE_FAULT */ { 0xA000u, 16, 100 },
|
||
/* FAULT_MEM_FAULT */ { 0xAA80u, 16, 100 },
|
||
/* FAULT_ASSERT */ { 0xDB00u, 16, 100 },
|
||
};
|
||
#define BLINK_TABLE_SIZE (sizeof(s_blink_table) / sizeof(s_blink_table[0]))
|
||
|
||
/* Blink sequencer state; written by _led_start() and fault_led_tick(). */
static FaultType s_led_fault = FAULT_NONE;  /* code being shown, NONE = idle      */
static uint32_t  s_led_start = 0;           /* tick at which the sequence began   */
static uint32_t  s_led_last  = 0;           /* tick of the most recent step       */
static uint8_t   s_led_step  = 0;           /* index of the next step to output   */
|
||
|
||
/* ------------------------------------------------------------------ */
|
||
/* Flash helpers */
|
||
/* ------------------------------------------------------------------ */
|
||
|
||
static uint32_t _slot_addr(uint8_t idx)
|
||
{
|
||
return FAULT_LOG_BASE_ADDR + (uint32_t)idx * FAULT_LOG_ENTRY_SIZE;
|
||
}
|
||
|
||
/*
 * Reports whether log slot `idx` is unused.  Only the first 32-bit word of
 * the slot is inspected: a value of 0xFFFFFFFF (erased flash) means empty.
 */
static bool _slot_empty(uint8_t idx)
{
    const uint32_t first_word = *(const uint32_t *)_slot_addr(idx);

    return first_word == 0xFFFFFFFFu;
}
|
||
|
||
static int _free_slot(void)
|
||
{
|
||
for (uint8_t i = 0; i < FAULT_LOG_MAX_ENTRIES; i++) {
|
||
if (_slot_empty(i)) return (int)i;
|
||
}
|
||
return -1;
|
||
}
|
||
|
||
static bool _erase_sector7(void)
|
||
{
|
||
FLASH_EraseInitTypeDef er = {0};
|
||
er.TypeErase = FLASH_TYPEERASE_SECTORS;
|
||
er.Sector = FLASH_SECTOR_7;
|
||
er.NbSectors = 1;
|
||
er.VoltageRange = FLASH_VOLTAGE_RANGE_3;
|
||
uint32_t err = 0;
|
||
return HAL_FLASHEx_Erase(&er, &err) == HAL_OK;
|
||
}
|
||
|
||
/*
|
||
* Write fault entry to the next free flash slot.
|
||
* When all 8 slots are occupied: erase sector 7, restore PID if valid,
|
||
* then write entry at slot 0. Sector 7 erase stalls CPU ~1 s — only
|
||
* called from fault_handler_init() before IWDG is started.
|
||
*/
|
||
static bool _fault_log_write(const fault_log_entry_t *entry)
|
||
{
|
||
int slot = _free_slot();
|
||
|
||
/* ---- Handle full log: erase sector 7 ---- */
|
||
if (slot < 0) {
|
||
float kp, ki, kd;
|
||
bool pid_ok = pid_flash_load(&kp, &ki, &kd);
|
||
|
||
HAL_FLASH_Unlock();
|
||
bool erased = _erase_sector7();
|
||
HAL_FLASH_Lock();
|
||
|
||
if (!erased) return false;
|
||
|
||
if (pid_ok) {
|
||
/* pid_flash_save() manages its own unlock/lock */
|
||
pid_flash_save(kp, ki, kd);
|
||
}
|
||
slot = 0;
|
||
}
|
||
|
||
/* ---- Write 64 bytes (16 × 32-bit words) to chosen slot ---- */
|
||
uint32_t addr = _slot_addr((uint8_t)slot);
|
||
const uint32_t *words = (const uint32_t *)entry;
|
||
|
||
HAL_FLASH_Unlock();
|
||
bool ok = true;
|
||
for (uint8_t w = 0; w < FAULT_LOG_ENTRY_SIZE / 4u; w++) {
|
||
if (HAL_FLASH_Program(FLASH_TYPEPROGRAM_WORD,
|
||
addr + (uint32_t)w * 4u, words[w]) != HAL_OK) {
|
||
ok = false;
|
||
break;
|
||
}
|
||
}
|
||
HAL_FLASH_Lock();
|
||
return ok;
|
||
}
|
||
|
||
/* ------------------------------------------------------------------ */
|
||
/* LED blink */
|
||
/* ------------------------------------------------------------------ */
|
||
|
||
/*
 * Arms the blink sequencer for `type`: records the current HAL tick as both
 * the sequence start and the last-step time, and rewinds to step 0.
 */
static void _led_start(FaultType type)
{
    const uint32_t now = HAL_GetTick();

    s_led_step  = 0;
    s_led_last  = now;
    s_led_start = now;
    s_led_fault = type;
}
|
||
|
||
/* ------------------------------------------------------------------ */
|
||
/* Public API */
|
||
/* ------------------------------------------------------------------ */
|
||
|
||
void fault_mpu_guard_init(void)
|
||
{
|
||
/*
|
||
* Configure MPU Region 0 as a 32-byte no-access guard page at
|
||
* __stack_end (lowest address of the main stack). The stack grows
|
||
* downward; when it overflows into this region a MemManage fault fires.
|
||
*
|
||
* MPU RASR SIZE field = log2(region_bytes) - 1 = log2(32) - 1 = 4.
|
||
* AP = 0b000 → no access in any mode.
|
||
*/
|
||
extern uint32_t __stack_end; /* defined in linker script */
|
||
|
||
HAL_MPU_Disable();
|
||
|
||
MPU_Region_InitTypeDef r = {0};
|
||
r.Enable = MPU_REGION_ENABLE;
|
||
r.Number = MPU_REGION_NUMBER0;
|
||
r.BaseAddress = (uint32_t)&__stack_end;
|
||
r.Size = MPU_REGION_SIZE_32B;
|
||
r.SubRegionDisable = 0x00u;
|
||
r.TypeExtField = MPU_TEX_LEVEL0;
|
||
r.AccessPermission = MPU_REGION_NO_ACCESS;
|
||
r.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
|
||
r.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
|
||
r.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
|
||
r.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
|
||
HAL_MPU_ConfigRegion(&r);
|
||
|
||
/* Enable MPU with default memory map for privileged access */
|
||
HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
|
||
|
||
/* Enable configurable fault handlers */
|
||
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk
|
||
| SCB_SHCSR_BUSFAULTENA_Msk
|
||
| SCB_SHCSR_USGFAULTENA_Msk;
|
||
}
|
||
|
||
void fault_handler_init(void)
|
||
{
|
||
/* ---- Maintain lifetime reset counter ---- */
|
||
if (s_reset_count_magic != RESET_COUNT_MAGIC) {
|
||
s_reset_count_magic = RESET_COUNT_MAGIC;
|
||
s_reset_count = 0u;
|
||
}
|
||
s_reset_count++;
|
||
|
||
/* ---- Detect brownout via RCC_CSR ---- */
|
||
bool brownout = (RCC->CSR & RCC_CSR_BORRSTF) != 0u;
|
||
if (brownout) {
|
||
printf("[FAULT] Brownout reset detected (reset_count=%lu)\n",
|
||
(unsigned long)s_reset_count);
|
||
fault_log_entry_t e;
|
||
memset(&e, 0, sizeof(e));
|
||
e.magic = FAULT_LOG_MAGIC;
|
||
e.fault_type = (uint8_t)FAULT_BROWNOUT;
|
||
e.reset_count = (uint8_t)(s_reset_count & 0xFFu);
|
||
_fault_log_write(&e);
|
||
_led_start(FAULT_BROWNOUT);
|
||
}
|
||
|
||
/* ---- Clear all RCC reset source flags ---- */
|
||
RCC->CSR |= RCC_CSR_RMVF;
|
||
|
||
/* ---- Check for pending .noinit fault capture ---- */
|
||
if (s_fault_magic == FAULT_SRAM_MAGIC) {
|
||
s_fault_magic = 0u; /* consume once */
|
||
|
||
fault_log_entry_t e;
|
||
memcpy(&e, (const void *)&s_fault_sram, sizeof(e));
|
||
e.reset_count = (uint8_t)(s_reset_count & 0xFFu);
|
||
|
||
/* Print register dump over CDC/UART */
|
||
printf("[FAULT] *** FAULT RECOVERED ***\n");
|
||
printf("[FAULT] type=%u reset_count=%u ts=%lu ms\n",
|
||
e.fault_type, e.reset_count, (unsigned long)e.timestamp_ms);
|
||
printf("[FAULT] PC=0x%08lX LR=0x%08lX SP=0x%08lX\n",
|
||
(unsigned long)e.pc, (unsigned long)e.lr, (unsigned long)e.sp);
|
||
printf("[FAULT] R0=0x%08lX R1=0x%08lX R2=0x%08lX R3=0x%08lX\n",
|
||
(unsigned long)e.r0, (unsigned long)e.r1,
|
||
(unsigned long)e.r2, (unsigned long)e.r3);
|
||
printf("[FAULT] CFSR=0x%08lX HFSR=0x%08lX MMFAR=0x%08lX BFAR=0x%08lX\n",
|
||
(unsigned long)e.cfsr, (unsigned long)e.hfsr,
|
||
(unsigned long)e.mmfar, (unsigned long)e.bfar);
|
||
|
||
_fault_log_write(&e);
|
||
|
||
FaultType ft = (e.fault_type < (uint8_t)BLINK_TABLE_SIZE)
|
||
? (FaultType)e.fault_type : FAULT_HARDFAULT;
|
||
_led_start(ft);
|
||
}
|
||
|
||
/* ---- Install MPU stack guard & enable fault handlers ---- */
|
||
fault_mpu_guard_init();
|
||
}
|
||
|
||
FaultType fault_get_last_type(void)
|
||
{
|
||
for (int i = (int)FAULT_LOG_MAX_ENTRIES - 1; i >= 0; i--) {
|
||
if (_slot_empty((uint8_t)i)) continue;
|
||
const fault_log_entry_t *e =
|
||
(const fault_log_entry_t *)_slot_addr((uint8_t)i);
|
||
if (e->magic == FAULT_LOG_MAGIC)
|
||
return (FaultType)e->fault_type;
|
||
}
|
||
return FAULT_NONE;
|
||
}
|
||
|
||
/*
 * Copies the flash log record in slot `idx` into `*out`.
 *
 * Returns false, leaving `*out` untouched, when `out` is NULL, `idx` is out
 * of range, the slot is empty, or the slot does not carry FAULT_LOG_MAGIC.
 * Returns true after a successful copy.
 */
bool fault_log_read(uint8_t idx, fault_log_entry_t *out)
{
    /* Public entry point: refuse a NULL destination instead of faulting. */
    if (out == NULL) return false;
    if (idx >= FAULT_LOG_MAX_ENTRIES) return false;
    if (_slot_empty(idx)) return false;

    const fault_log_entry_t *e =
        (const fault_log_entry_t *)_slot_addr(idx);
    if (e->magic != FAULT_LOG_MAGIC) return false;

    memcpy(out, e, sizeof(*out));
    return true;
}
|
||
|
||
uint8_t fault_log_get_count(void)
|
||
{
|
||
uint8_t n = 0;
|
||
for (uint8_t i = 0; i < FAULT_LOG_MAX_ENTRIES; i++) {
|
||
if (!_slot_empty(i)) n++;
|
||
}
|
||
return n;
|
||
}
|
||
|
||
/*
 * Wipes the flash fault log by erasing sector 7.  The PID gains are read
 * with pid_flash_load() before the erase and, if that call reported
 * success, written back with pid_flash_save() afterwards.  The result of
 * the erase itself is not checked.
 */
void fault_log_clear(void)
{
    float kp, ki, kd;
    const bool have_pid = pid_flash_load(&kp, &ki, &kd);

    HAL_FLASH_Unlock();
    (void)_erase_sector7();
    HAL_FLASH_Lock();

    if (!have_pid) {
        return;
    }
    pid_flash_save(kp, ki, kd);
}
|
||
|
||
void fault_assert_impl(const char *file, int line)
|
||
{
|
||
(void)file; (void)line;
|
||
s_fault_sram.magic = FAULT_LOG_MAGIC;
|
||
s_fault_sram.fault_type = (uint8_t)FAULT_ASSERT;
|
||
s_fault_sram.timestamp_ms = HAL_GetTick();
|
||
s_fault_sram.pc = (uint32_t)__builtin_return_address(0);
|
||
s_fault_sram.lr = 0u;
|
||
s_fault_sram.r0 = (uint32_t)(uintptr_t)file;
|
||
s_fault_sram.r1 = (uint32_t)line;
|
||
s_fault_sram.cfsr = SCB->CFSR;
|
||
s_fault_sram.hfsr = 0u;
|
||
s_fault_sram.mmfar = 0u;
|
||
s_fault_sram.bfar = 0u;
|
||
s_fault_sram.sp = 0u;
|
||
s_fault_magic = FAULT_SRAM_MAGIC;
|
||
NVIC_SystemReset();
|
||
}
|
||
|
||
void fault_led_tick(uint32_t now_ms)
|
||
{
|
||
if (s_led_fault == FAULT_NONE) return;
|
||
|
||
/* Auto-disable after 10 s */
|
||
if ((now_ms - s_led_start) > 10000u) {
|
||
s_led_fault = FAULT_NONE;
|
||
HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, GPIO_PIN_SET); /* off */
|
||
return;
|
||
}
|
||
|
||
uint8_t fi = (uint8_t)s_led_fault;
|
||
if (fi >= BLINK_TABLE_SIZE) return;
|
||
|
||
const LedBlink *b = &s_blink_table[fi];
|
||
if ((now_ms - s_led_last) >= b->period_ms) {
|
||
s_led_last = now_ms;
|
||
bool on = ((b->pattern >> (15u - s_led_step)) & 1u) != 0u;
|
||
/* LED2 is active-low (GPIO_PIN_RESET = lit) */
|
||
HAL_GPIO_WritePin(LED2_PORT, LED2_PIN,
|
||
on ? GPIO_PIN_RESET : GPIO_PIN_SET);
|
||
s_led_step = (uint8_t)((s_led_step + 1u) % b->steps);
|
||
}
|
||
}
|
||
|
||
/* ================================================================
|
||
* Fault vector hooks
|
||
* ================================================================
|
||
*
|
||
* Naked entry stubs determine whether the auto-saved stack frame is on
|
||
* MSP or PSP (bit 2 of EXC_RETURN in LR), then tail-call the C handler
|
||
* with the frame pointer in R0.
|
||
*
|
||
* Cortex-M auto-pushed stack frame layout (from [SP]):
|
||
* [0] R0 [1] R1 [2] R2 [3] R3
|
||
* [4] R12 [5] LR [6] PC [7] xPSR
|
||
*/
|
||
|
||
/*
 * Common tail of every fault handler: snapshots the faulting context into
 * the .noinit record, marks the record valid, flashes LED1 and resets the
 * MCU.  Does not return.
 *
 * type:  fault classification to store in the record.
 * frame: address of the exception stack frame, laid out as
 *        R0, R1, R2, R3, R12, LR, PC, xPSR (eight words).
 *
 * The stored SP is simply the address just past those eight words.
 * NOTE(review): that ignores any alignment padding word or extended
 * floating-point frame the core may have stacked — confirm whether the
 * logged SP needs to be exact.
 */
static void _capture_and_reset(FaultType type, uint32_t *frame)
{
    /* Word offsets inside the hardware-stacked frame. */
    enum {
        FRAME_R0 = 0, FRAME_R1, FRAME_R2, FRAME_R3,
        FRAME_R12, FRAME_LR, FRAME_PC, FRAME_XPSR,
        FRAME_WORDS
    };

    s_fault_sram.magic        = FAULT_LOG_MAGIC;
    s_fault_sram.fault_type   = (uint8_t)type;
    s_fault_sram.timestamp_ms = HAL_GetTick();

    /* Stacked core registers (R12 and xPSR are not part of the log). */
    s_fault_sram.r0 = frame[FRAME_R0];
    s_fault_sram.r1 = frame[FRAME_R1];
    s_fault_sram.r2 = frame[FRAME_R2];
    s_fault_sram.r3 = frame[FRAME_R3];
    s_fault_sram.lr = frame[FRAME_LR];
    s_fault_sram.pc = frame[FRAME_PC];
    s_fault_sram.sp = (uint32_t)(uintptr_t)&frame[FRAME_WORDS];

    /* Fault status and address registers. */
    s_fault_sram.cfsr  = SCB->CFSR;
    s_fault_sram.hfsr  = SCB->HFSR;
    s_fault_sram.mmfar = SCB->MMFAR;
    s_fault_sram.bfar  = SCB->BFAR;

    /* Written last so a set magic implies a complete record. */
    s_fault_magic = FAULT_SRAM_MAGIC;

    /*
     * Light LED1 and busy-wait so the event is visible on a scope.  The
     * original author estimated about 50 ms at 216 MHz.
     * NOTE(review): with a volatile counter each iteration costs several
     * cycles, so the real delay is probably longer — measure if it matters.
     */
    HAL_GPIO_WritePin(LED1_PORT, LED1_PIN, GPIO_PIN_RESET);  /* on */
    volatile uint32_t spin = 0u;
    while (spin < 10800000u) {
        __NOP();
        spin++;
    }

    NVIC_SystemReset();
}
|
||
|
||
/* Determine if a MemManage is from stack overflow vs other memory fault */
|
||
static FaultType _mem_fault_type(void)
|
||
{
|
||
if ((SCB->CFSR & SCB_CFSR_MMARVALID_Msk) != 0u) {
|
||
extern uint32_t __stack_end;
|
||
uint32_t guard = (uint32_t)&__stack_end;
|
||
if (SCB->MMFAR >= guard && SCB->MMFAR < guard + 32u)
|
||
return FAULT_STACK_OVF;
|
||
}
|
||
return FAULT_MEM_FAULT;
|
||
}
|
||
|
||
/* C-level handlers — called from naked asm stubs */
|
||
void fault_hard_c(uint32_t *frame) { _capture_and_reset(FAULT_HARDFAULT, frame); }
|
||
void fault_mem_c(uint32_t *frame) { _capture_and_reset(_mem_fault_type(), frame); }
|
||
void fault_bus_c(uint32_t *frame) { _capture_and_reset(FAULT_BUS_FAULT, frame); }
|
||
void fault_usage_c(uint32_t *frame) { _capture_and_reset(FAULT_USAGE_FAULT, frame); }
|
||
|
||
/* ---- Naked asm entry stubs ---- */
|
||
|
||
/*
 * HardFault vector entry.  Naked, so no prologue touches the stack: selects
 * MSP or PSP according to bit 2 of EXC_RETURN (held in LR), loads it into
 * R0 and branches to fault_hard_c(frame).
 */
__attribute__((naked)) void HardFault_Handler(void)
{
    __asm volatile (
        "tst   lr, #4        \n\t"  /* EXC_RETURN[2]: 0 = MSP, 1 = PSP */
        "ite   eq            \n\t"
        "mrseq r0, msp       \n\t"  /* frame was stacked on MSP */
        "mrsne r0, psp       \n\t"  /* frame was stacked on PSP */
        "b     fault_hard_c  \n\t"  /* branch; the C handler never returns */
    );
}
|
||
|
||
/*
 * MemManage vector entry.  Naked, so no prologue touches the stack: selects
 * MSP or PSP according to bit 2 of EXC_RETURN (held in LR), loads it into
 * R0 and branches to fault_mem_c(frame).
 */
__attribute__((naked)) void MemManage_Handler(void)
{
    __asm volatile (
        "tst   lr, #4        \n\t"  /* EXC_RETURN[2]: 0 = MSP, 1 = PSP */
        "ite   eq            \n\t"
        "mrseq r0, msp       \n\t"  /* frame was stacked on MSP */
        "mrsne r0, psp       \n\t"  /* frame was stacked on PSP */
        "b     fault_mem_c   \n\t"  /* branch; the C handler never returns */
    );
}
|
||
|
||
/*
 * BusFault vector entry.  Naked, so no prologue touches the stack: selects
 * MSP or PSP according to bit 2 of EXC_RETURN (held in LR), loads it into
 * R0 and branches to fault_bus_c(frame).
 */
__attribute__((naked)) void BusFault_Handler(void)
{
    __asm volatile (
        "tst   lr, #4        \n\t"  /* EXC_RETURN[2]: 0 = MSP, 1 = PSP */
        "ite   eq            \n\t"
        "mrseq r0, msp       \n\t"  /* frame was stacked on MSP */
        "mrsne r0, psp       \n\t"  /* frame was stacked on PSP */
        "b     fault_bus_c   \n\t"  /* branch; the C handler never returns */
    );
}
|
||
|
||
/*
 * UsageFault vector entry.  Naked, so no prologue touches the stack: selects
 * MSP or PSP according to bit 2 of EXC_RETURN (held in LR), loads it into
 * R0 and branches to fault_usage_c(frame).
 */
__attribute__((naked)) void UsageFault_Handler(void)
{
    __asm volatile (
        "tst   lr, #4         \n\t"  /* EXC_RETURN[2]: 0 = MSP, 1 = PSP */
        "ite   eq             \n\t"
        "mrseq r0, msp        \n\t"  /* frame was stacked on MSP */
        "mrsne r0, psp        \n\t"  /* frame was stacked on PSP */
        "b     fault_usage_c  \n\t"  /* branch; the C handler never returns */
    );
}
|