saltylab-firmware/src/fault_handler.c
sl-firmware 8fbe7c0033 feat: STM32 watchdog and fault recovery handler (Issue #565)
- New src/fault_handler.c + include/fault_handler.h:
  - HardFault/MemManage/BusFault/UsageFault naked ISR stubs with
    Cortex-M7 stack-frame capture (R0-R3, LR, PC, xPSR, CFSR, HFSR,
    MMFAR, BFAR, SP) and NVIC_SystemReset()
  - .noinit SRAM capture ring survives soft reset; persisted to flash
    sector 7 (0x08060000, 8x64-byte slots) on subsequent boot
  - MPU Region 0 stack guard (32 B at __stack_end, no-access) ->
    MemManage fault detected as FAULT_STACK_OVF
  - Brownout detect via RCC_CSR_BORRSTF on boot -> FAULT_BROWNOUT
  - Watchdog reset detection delegates to existing watchdog.c
  - LED blink codes on LED2 (PC14, active-low) for 10 s post-recovery:
    HARDFAULT=3, WATCHDOG=2, BROWNOUT=1, STACK_OVF=4 fast blinks
  - fault_led_tick(), fault_log_read(), fault_log_get_count(),
    fault_get_last_type(), fault_log_clear(), FAULT_ASSERT() macro
- jlink.h: add JLINK_CMD_FAULT_LOG_GET (0x0F), JLINK_TLM_FAULT_LOG
  (0x86), jlink_tlm_fault_log_t (20 bytes), fault_log_req in JLinkState,
  jlink_send_fault_log() declaration
- jlink.c: dispatch JLINK_CMD_FAULT_LOG_GET; implement
  jlink_send_fault_log() (26-byte CRC16-XModem framed response)
- main.c: call fault_handler_init() first in main(); send fault log
  TLM on boot if prior fault recorded; fault_led_tick() in main loop;
  handle fault_log_req flag to respond to Jetson queries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-14 13:37:14 -04:00

458 lines
15 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "fault_handler.h"
#include "config.h"
#include "pid_flash.h"
#include "stm32f7xx_hal.h"
#include <string.h>
#include <stdio.h>
/*
* fault_handler.c — STM32F7 fault detection and recovery (Issue #565)
*
* Recovery flow:
* Fault ISR (naked) → _capture_and_reset() captures registers into .noinit
* SRAM → sets FAULT_SRAM_MAGIC → NVIC_SystemReset().
* On next boot: fault_handler_init() sees FAULT_SRAM_MAGIC → persists to
* flash log → prints CDC dump → starts LED blink code.
*
* No flash writes occur inside fault ISRs. All flash operations happen safely
* in the normal boot context, well before safety_init() / IWDG start.
*/
/* ---- .noinit SRAM (preserved across NVIC_SystemReset) ---- */
/*
* GCC startup code only zeroes .bss and initialises .data. Variables in
* .noinit are left untouched. The magic word guards against cold-boot garbage.
*/
/* Stored in s_fault_magic to mark s_fault_sram as holding a capture that has
 * not been persisted yet; fault_handler_init() tests for it and clears it. */
#define FAULT_SRAM_MAGIC 0xFADE5A01u
/* Stored in s_reset_count_magic once s_reset_count has been zeroed, so the
 * counter is only trusted when this value is present. */
#define RESET_COUNT_MAGIC 0x1234ABCDu
/* Equals FAULT_SRAM_MAGIC while a fault capture is waiting to be logged. */
static __attribute__((section(".noinit"))) volatile uint32_t s_fault_magic;
/* Register/status snapshot written by the fault path just before reset. */
static __attribute__((section(".noinit"))) volatile fault_log_entry_t s_fault_sram;
/* Validity marker for s_reset_count (see RESET_COUNT_MAGIC). */
static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count_magic;
/* Incremented once per fault_handler_init() call while the marker stays valid. */
static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count;
/* ---- LED blink sequencer ---- */
/*
* Each pattern is a 16-bit bitmask; bit 15 = first step.
* One step = period_ms milliseconds. LED2 (PC14) is active-low.
*/
/* One entry of the blink pattern table consumed by fault_led_tick(). */
typedef struct {
uint16_t pattern; /* bitmask, 1 = LED on; bit 15 is shown first, then bit 14, ... */
uint8_t steps; /* number of steps before the sequence wraps back to bit 15 */
uint16_t period_ms; /* duration of one step in milliseconds */
} LedBlink;
/*
 * Pattern table indexed by FaultType (0..8); the row order must match the
 * FaultType enum. Bit 15 is displayed first and every entry cycles through
 * all 16 bits before repeating.
 *
 *   NONE        = 0000 0000 0000 0000  (silent)
 *   HARDFAULT   = 1010 1000 0000 0000  (3 fast blinks, then a 1 s pause)
 *   WATCHDOG    = 1111 0000 1111 0000  (2 slow pulses; 150 ms x 16 steps = 2.4 s)
 *   BROWNOUT    = 1111 1111 0000 0000  (1 long pulse; 100 ms x 16 steps = 1.6 s)
 *   STACK_OVF   = 1110 1110 1110 1110  (4 short bursts)
 *   BUS_FAULT   = 1010 1111 1100 0000  (2 short blinks + 1 long pulse)
 *   USAGE_FAULT = 1010 0000 0000 0000  (2 fast blinks)
 *   MEM_FAULT   = 1010 1010 1000 0000  (5 fast blinks)
 *   ASSERT      = 1101 1011 0000 0000  (3 double-width pulses)
 *
 * HARDFAULT previously used 0xAAAA, which toggles on every step and therefore
 * blinks continuously (8 blinks per cycle, no gap) instead of the documented
 * count of 3; 0xA800 produces three blinks followed by a pause so the code
 * can actually be counted.
 */
static const LedBlink s_blink_table[] = {
    /* FAULT_NONE        */ { 0x0000u, 16, 100 },
    /* FAULT_HARDFAULT   */ { 0xA800u, 16, 100 },
    /* FAULT_WATCHDOG    */ { 0xF0F0u, 16, 150 },
    /* FAULT_BROWNOUT    */ { 0xFF00u, 16, 100 },
    /* FAULT_STACK_OVF   */ { 0xEEEEu, 16, 100 },
    /* FAULT_BUS_FAULT   */ { 0xAFC0u, 16, 100 },
    /* FAULT_USAGE_FAULT */ { 0xA000u, 16, 100 },
    /* FAULT_MEM_FAULT   */ { 0xAA80u, 16, 100 },
    /* FAULT_ASSERT      */ { 0xDB00u, 16, 100 },
};
/* Number of rows in s_blink_table; used to bounds-check a FaultType index. */
#define BLINK_TABLE_SIZE (sizeof(s_blink_table) / sizeof(s_blink_table[0]))

/* Sequencer state: active fault code (FAULT_NONE = idle), tick at which the
 * sequence was started, tick of the last step, and current step index. */
static FaultType s_led_fault = FAULT_NONE;
static uint32_t  s_led_start = 0;
static uint32_t  s_led_last  = 0;
static uint8_t   s_led_step  = 0;
/* ------------------------------------------------------------------ */
/* Flash helpers */
/* ------------------------------------------------------------------ */
/*
 * Translate a log slot index into its flash address.
 * Slots are laid out back to back starting at FAULT_LOG_BASE_ADDR, each
 * FAULT_LOG_ENTRY_SIZE bytes long. The index is not range-checked here.
 */
static uint32_t _slot_addr(uint8_t idx)
{
    const uint32_t byte_offset = (uint32_t)idx * FAULT_LOG_ENTRY_SIZE;

    return FAULT_LOG_BASE_ADDR + byte_offset;
}
/*
 * Report whether log slot `idx` is unused.
 * Only the first 32-bit word of the slot is inspected; a value of
 * 0xFFFFFFFF (erased flash) is taken to mean the slot is free.
 */
static bool _slot_empty(uint8_t idx)
{
    const uint32_t first_word = *(const uint32_t *)_slot_addr(idx);

    if (first_word == 0xFFFFFFFFu) {
        return true;
    }
    return false;
}
/*
 * Find the lowest-numbered unused log slot.
 * Returns the slot index, or -1 when all FAULT_LOG_MAX_ENTRIES slots are in use.
 */
static int _free_slot(void)
{
    uint8_t candidate = 0;

    while (candidate < FAULT_LOG_MAX_ENTRIES) {
        if (_slot_empty(candidate)) {
            return (int)candidate;
        }
        candidate++;
    }
    return -1;
}
static bool _erase_sector7(void)
{
FLASH_EraseInitTypeDef er = {0};
er.TypeErase = FLASH_TYPEERASE_SECTORS;
er.Sector = FLASH_SECTOR_7;
er.NbSectors = 1;
er.VoltageRange = FLASH_VOLTAGE_RANGE_3;
uint32_t err = 0;
return HAL_FLASHEx_Erase(&er, &err) == HAL_OK;
}
/*
 * Write one fault entry to the next free flash slot.
 *
 * When every slot is occupied, sector 7 is erased first: the PID gains are
 * read beforehand via pid_flash_load() and, if that succeeded, written back
 * with pid_flash_save() after the erase; the entry then goes to slot 0.
 * A sector erase blocks for a long time, so this is only called from
 * fault_handler_init(), before the IWDG is started.
 *
 * The entry is first copied into a word-aligned staging buffer that is
 * pre-filled with 0xFF (the erased-flash value). This avoids reading the
 * caller's struct through a uint32_t pointer and guarantees that exactly
 * FAULT_LOG_ENTRY_SIZE bytes are available to program even if the struct is
 * shorter than a slot.
 *
 * Returns true only if flash could be unlocked, any required erase succeeded,
 * and every word was programmed without a HAL error.
 */
static bool _fault_log_write(const fault_log_entry_t *entry)
{
    /* ---- Stage the slot image ---- */
    uint32_t image[FAULT_LOG_ENTRY_SIZE / 4u];
    size_t   copy_len = sizeof(*entry);

    if (copy_len > sizeof(image)) {
        copy_len = sizeof(image);   /* never program beyond one slot */
    }
    memset(image, 0xFF, sizeof(image));
    memcpy(image, entry, copy_len);

    int slot = _free_slot();

    /* ---- Handle full log: erase sector 7 ---- */
    if (slot < 0) {
        float kp, ki, kd;
        bool pid_ok = pid_flash_load(&kp, &ki, &kd);

        if (HAL_FLASH_Unlock() != HAL_OK) {
            return false;
        }
        bool erased = _erase_sector7();
        HAL_FLASH_Lock();
        if (!erased) {
            return false;
        }

        if (pid_ok) {
            /* pid_flash_save() manages its own unlock/lock */
            pid_flash_save(kp, ki, kd);
        }
        slot = 0;
    }

    /* ---- Program the staged words into the chosen slot ---- */
    const uint32_t base = _slot_addr((uint8_t)slot);

    if (HAL_FLASH_Unlock() != HAL_OK) {
        return false;
    }

    bool ok = true;
    for (uint32_t w = 0; w < FAULT_LOG_ENTRY_SIZE / 4u; w++) {
        if (HAL_FLASH_Program(FLASH_TYPEPROGRAM_WORD,
                              base + w * 4u, image[w]) != HAL_OK) {
            ok = false;     /* stop at the first failed word */
            break;
        }
    }

    HAL_FLASH_Lock();
    return ok;
}
/* ------------------------------------------------------------------ */
/* LED blink */
/* ------------------------------------------------------------------ */
/*
 * Arm the LED sequencer for the given fault type.
 * Both the sequence start time and the last-step time are set to the current
 * HAL tick, and the step index is rewound to 0.
 */
static void _led_start(FaultType type)
{
    const uint32_t now = HAL_GetTick();

    s_led_step  = 0;
    s_led_last  = now;
    s_led_start = now;
    s_led_fault = type;
}
/* ------------------------------------------------------------------ */
/* Public API */
/* ------------------------------------------------------------------ */
void fault_mpu_guard_init(void)
{
/*
* Configure MPU Region 0 as a 32-byte no-access guard page at
* __stack_end (lowest address of the main stack). The stack grows
* downward; when it overflows into this region a MemManage fault fires.
*
* MPU RASR SIZE field = log2(region_bytes) - 1 = log2(32) - 1 = 4.
* AP = 0b000 → no access in any mode.
*/
extern uint32_t __stack_end; /* defined in linker script */
HAL_MPU_Disable();
MPU_Region_InitTypeDef r = {0};
r.Enable = MPU_REGION_ENABLE;
r.Number = MPU_REGION_NUMBER0;
r.BaseAddress = (uint32_t)&__stack_end;
r.Size = MPU_REGION_SIZE_32B;
r.SubRegionDisable = 0x00u;
r.TypeExtField = MPU_TEX_LEVEL0;
r.AccessPermission = MPU_REGION_NO_ACCESS;
r.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
r.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
r.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
r.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
HAL_MPU_ConfigRegion(&r);
/* Enable MPU with default memory map for privileged access */
HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
/* Enable configurable fault handlers */
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk
| SCB_SHCSR_BUSFAULTENA_Msk
| SCB_SHCSR_USGFAULTENA_Msk;
}
/*
 * Boot-time fault bookkeeping.
 *
 *  1. Maintains a reset counter kept in .noinit SRAM.
 *  2. Logs a FAULT_BROWNOUT entry when RCC_CSR_BORRSTF is set.
 *  3. Clears the RCC reset-source flags.
 *  4. If the previous run left a capture in .noinit SRAM, prints it, persists
 *     it to the flash log and starts the matching LED blink code.
 *  5. Installs the MPU stack guard and enables the fault exceptions.
 *
 * A failed flash write is reported on the console rather than silently
 * dropped: the SRAM capture is consumed before the write, so without the
 * message a lost entry would leave no trace at all.
 */
void fault_handler_init(void)
{
    /* ---- Maintain reset counter ---- */
    if (s_reset_count_magic != RESET_COUNT_MAGIC) {
        /* Marker missing: the counter content cannot be trusted, restart it. */
        s_reset_count_magic = RESET_COUNT_MAGIC;
        s_reset_count = 0u;
    }
    s_reset_count++;

    /* ---- Detect brownout via RCC_CSR ---- */
    /*
     * NOTE(review): the STM32F7 reference manual appears to describe BORRSTF
     * as being set by a POR/PDR reset as well as by a BOR reset. If so, every
     * cold power-up is recorded here as a brownout; confirm, and consider
     * ignoring resets that also report the power-on flag.
     */
    bool brownout = (RCC->CSR & RCC_CSR_BORRSTF) != 0u;
    if (brownout) {
        printf("[FAULT] Brownout reset detected (reset_count=%lu)\n",
               (unsigned long)s_reset_count);

        fault_log_entry_t e;
        memset(&e, 0, sizeof(e));
        e.magic       = FAULT_LOG_MAGIC;
        e.fault_type  = (uint8_t)FAULT_BROWNOUT;
        e.reset_count = (uint8_t)(s_reset_count & 0xFFu);   /* low 8 bits only */

        if (!_fault_log_write(&e)) {
            printf("[FAULT] WARNING: brownout entry could not be written to flash\n");
        }
        _led_start(FAULT_BROWNOUT);
    }

    /* ---- Clear all RCC reset source flags ---- */
    /*
     * NOTE(review): any code that inspects the reset flags later in boot
     * (e.g. watchdog-reset detection elsewhere) will find them already
     * cleared; verify the call order.
     */
    RCC->CSR |= RCC_CSR_RMVF;

    /* ---- Check for pending .noinit fault capture ---- */
    if (s_fault_magic == FAULT_SRAM_MAGIC) {
        s_fault_magic = 0u; /* consume once */

        fault_log_entry_t e;
        memcpy(&e, (const void *)&s_fault_sram, sizeof(e));
        e.reset_count = (uint8_t)(s_reset_count & 0xFFu);

        /* Print register dump over CDC/UART */
        printf("[FAULT] *** FAULT RECOVERED ***\n");
        printf("[FAULT] type=%u reset_count=%u ts=%lu ms\n",
               e.fault_type, e.reset_count, (unsigned long)e.timestamp_ms);
        printf("[FAULT] PC=0x%08lX LR=0x%08lX SP=0x%08lX\n",
               (unsigned long)e.pc, (unsigned long)e.lr, (unsigned long)e.sp);
        printf("[FAULT] R0=0x%08lX R1=0x%08lX R2=0x%08lX R3=0x%08lX\n",
               (unsigned long)e.r0, (unsigned long)e.r1,
               (unsigned long)e.r2, (unsigned long)e.r3);
        printf("[FAULT] CFSR=0x%08lX HFSR=0x%08lX MMFAR=0x%08lX BFAR=0x%08lX\n",
               (unsigned long)e.cfsr, (unsigned long)e.hfsr,
               (unsigned long)e.mmfar, (unsigned long)e.bfar);

        if (!_fault_log_write(&e)) {
            printf("[FAULT] WARNING: fault entry could not be written to flash\n");
        }

        /* An out-of-table type falls back to the HardFault blink code.
         * If a brownout was also seen above, this blink code replaces it. */
        FaultType ft = (e.fault_type < (uint8_t)BLINK_TABLE_SIZE)
                     ? (FaultType)e.fault_type : FAULT_HARDFAULT;
        _led_start(ft);
    }

    /* ---- Install MPU stack guard & enable fault handlers ---- */
    fault_mpu_guard_init();
}
FaultType fault_get_last_type(void)
{
for (int i = (int)FAULT_LOG_MAX_ENTRIES - 1; i >= 0; i--) {
if (_slot_empty((uint8_t)i)) continue;
const fault_log_entry_t *e =
(const fault_log_entry_t *)_slot_addr((uint8_t)i);
if (e->magic == FAULT_LOG_MAGIC)
return (FaultType)e->fault_type;
}
return FAULT_NONE;
}
/*
 * Copy log slot `idx` into `*out`.
 *
 * Returns false, leaving `*out` untouched, when `out` is NULL, `idx` is out
 * of range, the slot is erased, or its magic is not FAULT_LOG_MAGIC.
 * Returns true after copying the whole entry.
 */
bool fault_log_read(uint8_t idx, fault_log_entry_t *out)
{
    /* Public entry point: reject a missing destination instead of faulting. */
    if (out == NULL) return false;
    if (idx >= FAULT_LOG_MAX_ENTRIES) return false;
    if (_slot_empty(idx)) return false;

    const fault_log_entry_t *e =
        (const fault_log_entry_t *)_slot_addr(idx);
    if (e->magic != FAULT_LOG_MAGIC) return false;

    memcpy(out, e, sizeof(*out));
    return true;
}
/*
 * Count the log slots whose first word is not erased.
 * The entry magic is not checked here, so a slot that fault_log_read()
 * would reject is still counted.
 */
uint8_t fault_log_get_count(void)
{
    uint8_t used = 0;
    uint8_t slot = 0;

    while (slot < FAULT_LOG_MAX_ENTRIES) {
        if (_slot_empty(slot) == false) {
            used++;
        }
        slot++;
    }
    return used;
}
/*
 * Wipe the fault log by erasing flash sector 7.
 * The PID gains are read first with pid_flash_load(); when that read
 * succeeded they are written back with pid_flash_save() after the erase.
 * The erase result is not checked, so the write-back is attempted either way.
 */
void fault_log_clear(void)
{
    float p_gain, i_gain, d_gain;
    const bool have_pid = pid_flash_load(&p_gain, &i_gain, &d_gain);

    HAL_FLASH_Unlock();
    (void)_erase_sector7();
    HAL_FLASH_Lock();

    if (!have_pid) {
        return;
    }
    pid_flash_save(p_gain, i_gain, d_gain);
}
/*
 * Record a failed assertion in the .noinit capture area and reset the MCU.
 *
 * There is no exception frame for an assert, so the fields are filled as:
 *   pc     = return address of this call (the assertion site)
 *   r0     = pointer to the file-name string
 *   r1     = line number
 *   cfsr   = current SCB->CFSR
 *   others = 0
 * r2 and r3 are cleared explicitly: the record lives in .noinit SRAM, so any
 * field not written here would otherwise carry stale or power-on garbage
 * into the log and the boot-time register dump.
 *
 * file: source file name of the failed assertion.
 * line: source line of the failed assertion.
 * Ends by calling NVIC_SystemReset().
 */
void fault_assert_impl(const char *file, int line)
{
    s_fault_sram.magic        = FAULT_LOG_MAGIC;
    s_fault_sram.fault_type   = (uint8_t)FAULT_ASSERT;
    s_fault_sram.timestamp_ms = HAL_GetTick();

    s_fault_sram.pc = (uint32_t)(uintptr_t)__builtin_return_address(0);
    s_fault_sram.lr = 0u;
    s_fault_sram.sp = 0u;

    s_fault_sram.r0 = (uint32_t)(uintptr_t)file;
    s_fault_sram.r1 = (uint32_t)line;
    s_fault_sram.r2 = 0u;
    s_fault_sram.r3 = 0u;

    s_fault_sram.cfsr  = SCB->CFSR;
    s_fault_sram.hfsr  = 0u;
    s_fault_sram.mmfar = 0u;
    s_fault_sram.bfar  = 0u;

    /* Mark the capture valid only after every field has been written. */
    s_fault_magic = FAULT_SRAM_MAGIC;
    NVIC_SystemReset();
}
/*
 * Advance the fault blink sequence; intended to be called repeatedly with
 * the current time.
 *
 * now_ms: current time in milliseconds, on the same time base as
 *         HAL_GetTick() (which is what the sequence start was stamped with).
 *
 * Does nothing while no fault code is active. More than 10 s after the
 * sequence was started, the LED is switched off and the sequencer goes idle.
 * Otherwise, once a full step period has elapsed since the previous step, the
 * LED is driven from the current pattern bit and the step index advances,
 * wrapping after `steps` steps.
 */
void fault_led_tick(uint32_t now_ms)
{
    if (s_led_fault == FAULT_NONE) {
        return;
    }

    /* Auto-disable after 10 s */
    const uint32_t running_for = now_ms - s_led_start;
    if (running_for > 10000u) {
        s_led_fault = FAULT_NONE;
        HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, GPIO_PIN_SET); /* off */
        return;
    }

    const uint8_t row = (uint8_t)s_led_fault;
    if (row >= BLINK_TABLE_SIZE) {
        return;
    }
    const LedBlink *blink = &s_blink_table[row];

    /* Not yet time for the next step. */
    if ((now_ms - s_led_last) < blink->period_ms) {
        return;
    }
    s_led_last = now_ms;

    /* Bit 15 corresponds to step 0, bit 14 to step 1, and so on. */
    if (((blink->pattern >> (15u - s_led_step)) & 1u) != 0u) {
        /* LED2 is active-low (GPIO_PIN_RESET = lit) */
        HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, GPIO_PIN_RESET);
    } else {
        HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, GPIO_PIN_SET);
    }

    s_led_step = (uint8_t)((s_led_step + 1u) % blink->steps);
}
/* ================================================================
* Fault vector hooks
* ================================================================
*
* Naked entry stubs determine whether the auto-saved stack frame is on
* MSP or PSP (bit 2 of EXC_RETURN in LR), then tail-call the C handler
* with the frame pointer in R0.
*
* Cortex-M auto-pushed stack frame layout (from [SP]):
* [0] R0 [1] R1 [2] R2 [3] R3
* [4] R12 [5] LR [6] PC [7] xPSR
*/
/*
 * Store a fault snapshot in .noinit SRAM and reset the MCU.
 *
 * type:  fault classification to record.
 * frame: pointer to the hardware-stacked exception frame
 *        ([0] R0 [1] R1 [2] R2 [3] R3 [4] R12 [5] LR [6] PC [7] xPSR).
 *
 * The recorded SP is the stack pointer of the interrupted code, rebuilt from
 * the frame address: 8 words for the basic frame, plus 4 bytes when bit 9 of
 * the stacked xPSR is set, which indicates that the hardware inserted an
 * alignment pad on exception entry.
 * NOTE(review): an extended (floating-point) frame cannot be recognised here
 * because EXC_RETURN is not passed in; if the FPU context is ever stacked,
 * the SP value will be low by the size of that extension.
 *
 * No flash access happens here; the snapshot is persisted on the next boot.
 */
static void _capture_and_reset(FaultType type, uint32_t *frame)
{
    s_fault_sram.magic        = FAULT_LOG_MAGIC;
    s_fault_sram.fault_type   = (uint8_t)type;
    s_fault_sram.timestamp_ms = HAL_GetTick();

    s_fault_sram.r0 = frame[0];
    s_fault_sram.r1 = frame[1];
    s_fault_sram.r2 = frame[2];
    s_fault_sram.r3 = frame[3];
    /* frame[4] = R12 (unused in log), frame[5] = LR, frame[6] = PC */
    s_fault_sram.lr = frame[5];
    s_fault_sram.pc = frame[6];

    /* Pre-exception SP: skip the 8-word frame and any alignment pad. */
    uint32_t sp_before = (uint32_t)(uintptr_t)(frame + 8);
    if ((frame[7] & (1u << 9)) != 0u) {
        sp_before += 4u;
    }
    s_fault_sram.sp = sp_before;

    s_fault_sram.cfsr  = SCB->CFSR;
    s_fault_sram.hfsr  = SCB->HFSR;
    s_fault_sram.mmfar = SCB->MMFAR;
    s_fault_sram.bfar  = SCB->BFAR;

    /* Mark the capture valid only after every field has been written. */
    s_fault_magic = FAULT_SRAM_MAGIC;

    /*
     * Brief LED flash so a scope can catch it. The iteration count equals
     * 50 ms worth of core cycles at 216 MHz; the real delay is presumably
     * longer, since each pass of a volatile-counter loop costs several cycles.
     */
    HAL_GPIO_WritePin(LED1_PORT, LED1_PIN, GPIO_PIN_RESET); /* on */
    for (volatile uint32_t i = 0u; i < 10800000u; i++) __NOP();

    NVIC_SystemReset();
}
/*
 * Classify a MemManage fault.
 * Returns FAULT_STACK_OVF when CFSR reports a valid MMFAR and that address
 * lies within the 32 bytes starting at __stack_end (the stack guard region);
 * returns FAULT_MEM_FAULT in every other case.
 */
static FaultType _mem_fault_type(void)
{
    extern uint32_t __stack_end;

    /* Without a valid fault address the cause cannot be narrowed down. */
    if ((SCB->CFSR & SCB_CFSR_MMARVALID_Msk) == 0u) {
        return FAULT_MEM_FAULT;
    }

    const uint32_t guard_lo = (uint32_t)&__stack_end;
    const uint32_t guard_hi = guard_lo + 32u;   /* exclusive upper bound */

    if (SCB->MMFAR >= guard_lo && SCB->MMFAR < guard_hi) {
        return FAULT_STACK_OVF;
    }
    return FAULT_MEM_FAULT;
}
/* C-level handlers — called from naked asm stubs */
void fault_hard_c(uint32_t *frame) { _capture_and_reset(FAULT_HARDFAULT, frame); }
void fault_mem_c(uint32_t *frame) { _capture_and_reset(_mem_fault_type(), frame); }
void fault_bus_c(uint32_t *frame) { _capture_and_reset(FAULT_BUS_FAULT, frame); }
void fault_usage_c(uint32_t *frame) { _capture_and_reset(FAULT_USAGE_FAULT, frame); }
/* ---- Naked asm entry stubs ---- */
/*
 * HardFault vector entry. Naked, so the body is only the instructions below.
 * Selects the stack pointer that holds the exception frame, places it in R0
 * as the first argument, and branches to fault_hard_c() without linking.
 */
__attribute__((naked)) void HardFault_Handler(void)
{
__asm volatile (
"tst lr, #4 \n" /* test EXC_RETURN bit 2: 0 = frame on MSP, 1 = frame on PSP */
"ite eq \n" /* if-then-else on the result of the test */
"mrseq r0, msp \n" /* bit clear: R0 = MSP */
"mrsne r0, psp \n" /* bit set:   R0 = PSP */
"b fault_hard_c \n" /* plain branch; LR still holds EXC_RETURN */
);
}
/*
 * MemManage vector entry. Naked, so the body is only the instructions below.
 * Selects the stack pointer that holds the exception frame, places it in R0
 * as the first argument, and branches to fault_mem_c() without linking.
 * NOTE(review): the stack pointer is not changed here; if the fault was
 * caused by the stack running into the guard region, any stack use by the C
 * handler would presumably fault again. Confirm the stack-overflow path.
 */
__attribute__((naked)) void MemManage_Handler(void)
{
__asm volatile (
"tst lr, #4 \n" /* test EXC_RETURN bit 2: 0 = frame on MSP, 1 = frame on PSP */
"ite eq \n" /* if-then-else on the result of the test */
"mrseq r0, msp \n" /* bit clear: R0 = MSP */
"mrsne r0, psp \n" /* bit set:   R0 = PSP */
"b fault_mem_c \n" /* plain branch; LR still holds EXC_RETURN */
);
}
/*
 * BusFault vector entry. Naked, so the body is only the instructions below.
 * Selects the stack pointer that holds the exception frame, places it in R0
 * as the first argument, and branches to fault_bus_c() without linking.
 */
__attribute__((naked)) void BusFault_Handler(void)
{
__asm volatile (
"tst lr, #4 \n" /* test EXC_RETURN bit 2: 0 = frame on MSP, 1 = frame on PSP */
"ite eq \n" /* if-then-else on the result of the test */
"mrseq r0, msp \n" /* bit clear: R0 = MSP */
"mrsne r0, psp \n" /* bit set:   R0 = PSP */
"b fault_bus_c \n" /* plain branch; LR still holds EXC_RETURN */
);
}
/*
 * UsageFault vector entry. Naked, so the body is only the instructions below.
 * Selects the stack pointer that holds the exception frame, places it in R0
 * as the first argument, and branches to fault_usage_c() without linking.
 */
__attribute__((naked)) void UsageFault_Handler(void)
{
__asm volatile (
"tst lr, #4 \n" /* test EXC_RETURN bit 2: 0 = frame on MSP, 1 = frame on PSP */
"ite eq \n" /* if-then-else on the result of the test */
"mrseq r0, msp \n" /* bit clear: R0 = MSP */
"mrsne r0, psp \n" /* bit set:   R0 = PSP */
"b fault_usage_c \n" /* plain branch; LR still holds EXC_RETURN */
);
}