diff --git a/include/fault_handler.h b/include/fault_handler.h new file mode 100644 index 0000000..1266524 --- /dev/null +++ b/include/fault_handler.h @@ -0,0 +1,140 @@ +#ifndef FAULT_HANDLER_H +#define FAULT_HANDLER_H + +#include +#include + +/* + * fault_handler.h — STM32F7 fault detection and recovery (Issue #565) + * + * Features: + * - HardFault / BusFault / UsageFault / MemManage vector hooks with full + * Cortex-M7 register dump (R0-R3, LR, PC, xPSR, CFSR, HFSR, MMFAR, BFAR) + * - .noinit SRAM ring: fault frame captured and magic-tagged, survives + * NVIC_SystemReset(); persisted to flash on the subsequent boot + * - MPU Region 0 stack-guard (32 bytes at __stack_end, no-access) → MemManage + * fault detected as FAULT_STACK_OVF + * - Brownout detect via RCC_CSR_BORRSTF on boot → FAULT_BROWNOUT + * - Persistent fault log: last 8 entries × 64 bytes in flash sector 7 + * at 0x08060000 (below the PID store at 0x0807FFC0) + * - JLINK_TLM_FAULT_LOG (0x85): 20-byte summary sent via JLink on boot + * and on JLINK_CMD_FAULT_LOG_GET (0x0C) request + * - LED blink codes on LED2 (PC14, active-low) for 10 s after recovery: + * HARDFAULT = 3 fast blinks (100 ms) + * WATCHDOG = 2 slow blinks (300 ms) + * BROWNOUT = 1 long blink (500 ms) + * STACK_OVF = 4 fast blinks (100 ms) + * BUS_FAULT = alternating 3+1 + * USAGE_FAULT = 2 fast blinks + * - Auto-recovery: fault → .noinit capture → NVIC_SystemReset() + * On next boot fault_handler_init() re-runs safely: persists, prints, blinks + * + * Flash layout within sector 7 (0x08060000, 128 KB): + * Slot 0-7: 0x08060000 – 0x080601FF (8 × 64 bytes = 512 bytes fault log) + * PID store: 0x0807FFC0 – 0x0807FFFF (64 bytes, managed by pid_flash.c) + */ + +/* ---- Fault types ---- */ +typedef enum { + FAULT_NONE = 0x00, + FAULT_HARDFAULT = 0x01, /* HardFault escalation */ + FAULT_WATCHDOG = 0x02, /* IWDG timeout reset */ + FAULT_BROWNOUT = 0x03, /* Brown-out reset (BOR) */ + FAULT_STACK_OVF = 0x04, /* MPU stack guard MemManage */ + FAULT_BUS_FAULT = 0x05, /* BusFault */ + FAULT_USAGE_FAULT = 0x06, /* UsageFault */ + FAULT_MEM_FAULT = 0x07, /* MemManageFault (non-stack-guard) */ + FAULT_ASSERT = 0x08, /* Software assertion */ +} FaultType; + +/* ---- Flash fault log constants ---- */ +#define FAULT_LOG_MAX_ENTRIES 8u +#define FAULT_LOG_MAGIC 0xFADE5A01u +#define FAULT_LOG_BASE_ADDR 0x08060000UL /* start of flash sector 7 */ +#define FAULT_LOG_ENTRY_SIZE 64u /* bytes per entry */ + +/* ---- Flash fault log entry (64 bytes, packed) ---- */ +typedef struct __attribute__((packed)) { + uint32_t magic; /* FAULT_LOG_MAGIC when valid */ + uint8_t fault_type; /* FaultType */ + uint8_t reset_count; /* lifetime reset counter */ + uint16_t _pad0; + uint32_t timestamp_ms; /* HAL_GetTick() at reset (0 if pre-tick) */ + uint32_t pc; /* faulting instruction address */ + uint32_t lr; /* link register at fault */ + uint32_t r0; + uint32_t r1; + uint32_t r2; + uint32_t r3; + uint32_t cfsr; /* SCB->CFSR: combined fault status register */ + uint32_t hfsr; /* SCB->HFSR: hard fault status register */ + uint32_t mmfar; /* SCB->MMFAR: memory manage fault address */ + uint32_t bfar; /* SCB->BFAR: bus fault address */ + uint32_t sp; /* stack pointer value at fault */ + uint8_t _pad1[4]; /* pad to 64 bytes */ +} fault_log_entry_t; /* 64 bytes */ + +/* + * fault_handler_init() — call early in main(), before safety_init(). + * 1. Increments reset counter (.noinit SRAM). + * 2. Checks .noinit SRAM for a pending fault capture; if found: persists to + * flash, prints CDC register dump, starts LED blink code. + * 3. Detects brownout via RCC_CSR_BORRSTF; logs if detected. + * 4. Clears RCC reset flags. + * 5. Installs MPU Region 0 stack guard. + * 6. Enables MemManage, BusFault, UsageFault (SCB->SHCSR). + */ +void fault_handler_init(void); + +/* + * fault_mpu_guard_init() — configure MPU Region 0 as a 32-byte no-access + * guard at __stack_end (bottom of main stack). Generates MemManage on + * stack overflow. Called automatically by fault_handler_init(). + */ +void fault_mpu_guard_init(void); + +/* + * fault_get_last_type() — most recent fault type from flash log, or FAULT_NONE. + */ +FaultType fault_get_last_type(void); + +/* + * fault_log_read(idx, out) — read flash slot 0..7. + * Returns false if slot empty or idx out of range. + */ +bool fault_log_read(uint8_t idx, fault_log_entry_t *out); + +/* + * fault_log_get_count() — number of valid (occupied) log slots, 0-8. + */ +uint8_t fault_log_get_count(void); + +/* + * fault_log_clear() — erase fault log, restore PID if previously saved. + * Erases all of sector 7 (~1 s stall). Do not call while armed. + */ +void fault_log_clear(void); + +/* + * fault_assert(file, line) — software fault at runtime; captures return + * address, writes SRAM magic, triggers NVIC_SystemReset(). + * Use via FAULT_ASSERT(cond) macro below. + */ +void fault_assert_impl(const char *file, int line); + +#define FAULT_ASSERT(cond) \ + do { if (!(cond)) fault_assert_impl(__FILE__, __LINE__); } while (0) + +/* + * fault_led_tick(now_ms) — drive LED2 blink code from main loop (1 ms). + * Self-disables after 10 s so it doesn't interfere with normal LED state. + */ +void fault_led_tick(uint32_t now_ms); + +/* C-level fault dispatch (called from naked asm stubs; not for direct use) */ +void fault_hard_c(uint32_t *frame); +void fault_mem_c(uint32_t *frame); +void fault_bus_c(uint32_t *frame); +void fault_usage_c(uint32_t *frame); + +#endif /* FAULT_HANDLER_H */ diff --git a/include/jlink.h b/include/jlink.h index 3c24107..ab8dedf 100644 --- a/include/jlink.h +++ b/include/jlink.h @@ -22,20 +22,21 @@ * ETX : frame end sentinel (0x03) * * Jetson to STM32 commands: - * 0x01 HEARTBEAT - no payload; refreshes heartbeat timer - * 0x02 DRIVE - int16 speed (-1000..+1000), int16 steer (-1000..+1000) - * 0x03 ARM - no payload; request arm (same interlock as CDC 'A') - * 0x04 DISARM - no payload; disarm immediately - * 0x05 PID_SET - float kp, float ki, float kd (12 bytes, IEEE-754 LE) - * 0x06 DFU_ENTER - no payload; request OTA DFU reboot (denied while armed) - * 0x07 ESTOP - no payload; engage emergency stop - * 0x08 AUDIO - int16 PCM samples (up to 126 samples) - * 0x09 SLEEP - no payload; request STOP-mode sleep - * 0x0A PID_SAVE - no payload; save current Kp/Ki/Kd to flash (Issue #531) - * 0x0B GIMBAL_POS - int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547) - * 0x0C SCHED_GET - no payload; reply with TLM_SCHED (Issue #550) - * 0x0D SCHED_SET - uint8 num_bands + N*16-byte pid_sched_entry_t (Issue #550) - * 0x0E SCHED_SAVE - float kp, ki, kd (12 bytes); save sched+single to flash (Issue #550) + * 0x01 HEARTBEAT - no payload; refreshes heartbeat timer + * 0x02 DRIVE - int16 speed (-1000..+1000), int16 steer (-1000..+1000) + * 0x03 ARM - no payload; request arm (same interlock as CDC 'A') + * 0x04 DISARM - no payload; disarm immediately + * 0x05 PID_SET - float kp, float ki, float kd (12 bytes, IEEE-754 LE) + * 0x06 DFU_ENTER - no payload; request OTA DFU reboot (denied while armed) + * 0x07 ESTOP - no payload; engage emergency stop + * 0x08 AUDIO - int16 PCM samples (up to 126 samples) + * 0x09 SLEEP - no payload; request STOP-mode sleep + * 0x0A PID_SAVE - no payload; save current Kp/Ki/Kd to flash (Issue #531) + * 0x0B GIMBAL_POS - int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547) + * 0x0C SCHED_GET - no payload; reply with TLM_SCHED (Issue #550) + * 0x0D SCHED_SET - uint8 num_bands + N*16-byte pid_sched_entry_t (Issue #550) + * 0x0E SCHED_SAVE - float kp, ki, kd (12 bytes); save sched+single to flash (Issue #550) + * 0x0F FAULT_LOG_GET - no payload; reply with TLM_FAULT_LOG (Issue #565) * * STM32 to Jetson telemetry: * 0x80 STATUS - jlink_tlm_status_t (20 bytes), sent at JLINK_TLM_HZ @@ -45,6 +46,7 @@ * 0x84 GIMBAL_STATE - jlink_tlm_gimbal_state_t (10 bytes, Issue #547) * 0x85 SCHED - jlink_tlm_sched_t (1+N*16 bytes), sent on SCHED_GET (Issue #550) * 0x86 MOTOR_CURRENT - jlink_tlm_motor_current_t (8 bytes, Issue #584) + * 0x87 FAULT_LOG - jlink_tlm_fault_log_t (20 bytes), sent on boot + FAULT_LOG_GET (Issue #565) * * Priority: CRSF RC always takes precedence. Jetson steer/speed only applied * when mode_manager_active() == MODE_AUTONOMOUS (CH6 high). In RC_MANUAL and @@ -74,6 +76,7 @@ #define JLINK_CMD_SCHED_GET 0x0Cu /* no payload; reply TLM_SCHED (Issue #550) */ #define JLINK_CMD_SCHED_SET 0x0Du /* uint8 num_bands + N*16-byte entries (Issue #550) */ #define JLINK_CMD_SCHED_SAVE 0x0Eu /* float kp,ki,kd; save sched+single to flash (Issue #550) */ +#define JLINK_CMD_FAULT_LOG_GET 0x0Fu /* no payload; reply TLM_FAULT_LOG (Issue #565) */ /* ---- Telemetry IDs (STM32 to Jetson) ---- */ #define JLINK_TLM_STATUS 0x80u @@ -83,6 +86,7 @@ #define JLINK_TLM_GIMBAL_STATE 0x84u /* jlink_tlm_gimbal_state_t (10 bytes, Issue #547) */ #define JLINK_TLM_SCHED 0x85u /* jlink_tlm_sched_t (1+N*16 bytes, Issue #550) */ #define JLINK_TLM_MOTOR_CURRENT 0x86u /* jlink_tlm_motor_current_t (8 bytes, Issue #584) */ +#define JLINK_TLM_FAULT_LOG 0x87u /* jlink_tlm_fault_log_t (20 bytes, Issue #565) */ /* ---- Telemetry STATUS payload (20 bytes, packed) ---- */ typedef struct __attribute__((packed)) { @@ -160,6 +164,20 @@ typedef struct __attribute__((packed)) { uint8_t _pad; /* reserved */ } jlink_tlm_motor_current_t; /* 8 bytes */ +/* ---- Telemetry FAULT_LOG payload (20 bytes, packed) Issue #565 ---- */ +/* Sent on boot (if last fault != NONE) and in response to FAULT_LOG_GET. */ +typedef struct __attribute__((packed)) { + uint8_t fault_type; /* FaultType of most recent entry */ + uint8_t entry_count; /* number of valid entries in flash log (0-8) */ + uint8_t reset_count; /* lifetime reset counter */ + uint8_t _pad; + uint32_t timestamp_ms; /* HAL_GetTick() at fault */ + uint32_t pc; /* faulting PC */ + uint32_t lr; /* link register at fault */ + uint32_t cfsr; /* SCB->CFSR */ + uint32_t hfsr; /* SCB->HFSR */ +} jlink_tlm_fault_log_t; /* 20 bytes */ + /* ---- Volatile state (read from main loop) ---- */ typedef struct { /* Drive command - updated on JLINK_CMD_DRIVE */ @@ -199,6 +217,9 @@ typedef struct { volatile float sched_save_kp; /* kp for single-PID record in SCHED_SAVE */ volatile float sched_save_ki; volatile float sched_save_kd; + + /* Fault log request (Issue #565) - set by JLINK_CMD_FAULT_LOG_GET, cleared by main loop */ + volatile uint8_t fault_log_req; } JLinkState; extern volatile JLinkState jlink_state; @@ -251,4 +272,11 @@ JLinkSchedSetBuf *jlink_get_sched_set(void); */ void jlink_send_motor_current_tlm(const jlink_tlm_motor_current_t *tlm); +/* + * jlink_send_fault_log(fl) - transmit JLINK_TLM_FAULT_LOG (0x87) frame + * (26 bytes) on boot (if fault log non-empty) and in response to + * FAULT_LOG_GET. Issue #565. + */ +void jlink_send_fault_log(const jlink_tlm_fault_log_t *fl); + #endif /* JLINK_H */ diff --git a/src/fault_handler.c b/src/fault_handler.c new file mode 100644 index 0000000..e098abe --- /dev/null +++ b/src/fault_handler.c @@ -0,0 +1,457 @@ +#include "fault_handler.h" +#include "config.h" +#include "pid_flash.h" +#include "stm32f7xx_hal.h" +#include +#include + +/* + * fault_handler.c — STM32F7 fault detection and recovery (Issue #565) + * + * Recovery flow: + * Fault ISR (naked) → _capture_and_reset() captures registers into .noinit + * SRAM → sets FAULT_SRAM_MAGIC → NVIC_SystemReset(). + * On next boot: fault_handler_init() sees FAULT_SRAM_MAGIC → persists to + * flash log → prints CDC dump → starts LED blink code. + * + * No flash writes occur inside fault ISRs. All flash operations happen safely + * in the normal boot context, well before safety_init() / IWDG start. + */ + +/* ---- .noinit SRAM (preserved across NVIC_SystemReset) ---- */ +/* + * GCC startup code only zeroes .bss and initialises .data. Variables in + * .noinit are left untouched. The magic word guards against cold-boot garbage. + */ +#define FAULT_SRAM_MAGIC 0xFADE5A01u +#define RESET_COUNT_MAGIC 0x1234ABCDu + +static __attribute__((section(".noinit"))) volatile uint32_t s_fault_magic; +static __attribute__((section(".noinit"))) volatile fault_log_entry_t s_fault_sram; +static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count_magic; +static __attribute__((section(".noinit"))) volatile uint32_t s_reset_count; + +/* ---- LED blink sequencer ---- */ +/* + * Each pattern is a 16-bit bitmask; bit 15 = first step. + * One step = period_ms milliseconds. LED2 (PC14) is active-low. + */ +typedef struct { + uint16_t pattern; /* bitmask: 1 = LED on */ + uint8_t steps; /* number of valid bits to cycle */ + uint16_t period_ms; /* ms per step */ +} LedBlink; + +/* + * Pattern table indexed by FaultType (0..8). + * NONE = silent + * HARDFAULT = 1010 1010 1010 1010 (3 fast blinks, 100 ms) + * WATCHDOG = 1111 0000 1111 0000 (2 slow pulses, 150 ms × 8 steps = 1.2 s) + * BROWNOUT = 1111 1111 0000 0000 (1 long pulse, 100 ms × 16 = 1.6 s) + * STACK_OVF = 1110 1110 1110 1110 (4 short bursts, 100 ms) + * BUS_FAULT = 1010 1111 1100 0000 (3+1 pattern) + * USAGE_FAULT = 1010 0000 0000 0000 (2 fast blinks) + * MEM_FAULT = 1010 1010 1000 0000 (3 blinks, slower tail) + * ASSERT = 1101 1011 0000 0000 (SOS-like) + */ +static const LedBlink s_blink_table[] = { + /* FAULT_NONE */ { 0x0000u, 16, 100 }, + /* FAULT_HARDFAULT */ { 0xAAAAu, 16, 100 }, + /* FAULT_WATCHDOG */ { 0xF0F0u, 16, 150 }, + /* FAULT_BROWNOUT */ { 0xFF00u, 16, 100 }, + /* FAULT_STACK_OVF */ { 0xEEEEu, 16, 100 }, + /* FAULT_BUS_FAULT */ { 0xAFC0u, 16, 100 }, + /* FAULT_USAGE_FAULT */ { 0xA000u, 16, 100 }, + /* FAULT_MEM_FAULT */ { 0xAA80u, 16, 100 }, + /* FAULT_ASSERT */ { 0xDB00u, 16, 100 }, +}; +#define BLINK_TABLE_SIZE (sizeof(s_blink_table) / sizeof(s_blink_table[0])) + +static FaultType s_led_fault = FAULT_NONE; +static uint32_t s_led_start = 0; +static uint32_t s_led_last = 0; +static uint8_t s_led_step = 0; + +/* ------------------------------------------------------------------ */ +/* Flash helpers */ +/* ------------------------------------------------------------------ */ + +static uint32_t _slot_addr(uint8_t idx) +{ + return FAULT_LOG_BASE_ADDR + (uint32_t)idx * FAULT_LOG_ENTRY_SIZE; +} + +static bool _slot_empty(uint8_t idx) +{ + /* An erased 32-bit word reads as 0xFFFFFFFF */ + const uint32_t *p = (const uint32_t *)_slot_addr(idx); + return (*p == 0xFFFFFFFFu); +} + +static int _free_slot(void) +{ + for (uint8_t i = 0; i < FAULT_LOG_MAX_ENTRIES; i++) { + if (_slot_empty(i)) return (int)i; + } + return -1; +} + +static bool _erase_sector7(void) +{ + FLASH_EraseInitTypeDef er = {0}; + er.TypeErase = FLASH_TYPEERASE_SECTORS; + er.Sector = FLASH_SECTOR_7; + er.NbSectors = 1; + er.VoltageRange = FLASH_VOLTAGE_RANGE_3; + uint32_t err = 0; + return HAL_FLASHEx_Erase(&er, &err) == HAL_OK; +} + +/* + * Write fault entry to the next free flash slot. + * When all 8 slots are occupied: erase sector 7, restore PID if valid, + * then write entry at slot 0. Sector 7 erase stalls CPU ~1 s — only + * called from fault_handler_init() before IWDG is started. + */ +static bool _fault_log_write(const fault_log_entry_t *entry) +{ + int slot = _free_slot(); + + /* ---- Handle full log: erase sector 7 ---- */ + if (slot < 0) { + float kp, ki, kd; + bool pid_ok = pid_flash_load(&kp, &ki, &kd); + + HAL_FLASH_Unlock(); + bool erased = _erase_sector7(); + HAL_FLASH_Lock(); + + if (!erased) return false; + + if (pid_ok) { + /* pid_flash_save() manages its own unlock/lock */ + pid_flash_save(kp, ki, kd); + } + slot = 0; + } + + /* ---- Write 64 bytes (16 × 32-bit words) to chosen slot ---- */ + uint32_t addr = _slot_addr((uint8_t)slot); + const uint32_t *words = (const uint32_t *)entry; + + HAL_FLASH_Unlock(); + bool ok = true; + for (uint8_t w = 0; w < FAULT_LOG_ENTRY_SIZE / 4u; w++) { + if (HAL_FLASH_Program(FLASH_TYPEPROGRAM_WORD, + addr + (uint32_t)w * 4u, words[w]) != HAL_OK) { + ok = false; + break; + } + } + HAL_FLASH_Lock(); + return ok; +} + +/* ------------------------------------------------------------------ */ +/* LED blink */ +/* ------------------------------------------------------------------ */ + +static void _led_start(FaultType type) +{ + s_led_fault = type; + s_led_start = HAL_GetTick(); + s_led_last = s_led_start; + s_led_step = 0; +} + +/* ------------------------------------------------------------------ */ +/* Public API */ +/* ------------------------------------------------------------------ */ + +void fault_mpu_guard_init(void) +{ + /* + * Configure MPU Region 0 as a 32-byte no-access guard page at + * __stack_end (lowest address of the main stack). The stack grows + * downward; when it overflows into this region a MemManage fault fires. + * + * MPU RASR SIZE field = log2(region_bytes) - 1 = log2(32) - 1 = 4. + * AP = 0b000 → no access in any mode. + */ + extern uint32_t __stack_end; /* defined in linker script */ + + HAL_MPU_Disable(); + + MPU_Region_InitTypeDef r = {0}; + r.Enable = MPU_REGION_ENABLE; + r.Number = MPU_REGION_NUMBER0; + r.BaseAddress = (uint32_t)&__stack_end; + r.Size = MPU_REGION_SIZE_32B; + r.SubRegionDisable = 0x00u; + r.TypeExtField = MPU_TEX_LEVEL0; + r.AccessPermission = MPU_REGION_NO_ACCESS; + r.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE; + r.IsShareable = MPU_ACCESS_NOT_SHAREABLE; + r.IsCacheable = MPU_ACCESS_NOT_CACHEABLE; + r.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE; + HAL_MPU_ConfigRegion(&r); + + /* Enable MPU with default memory map for privileged access */ + HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT); + + /* Enable configurable fault handlers */ + SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk + | SCB_SHCSR_BUSFAULTENA_Msk + | SCB_SHCSR_USGFAULTENA_Msk; +} + +void fault_handler_init(void) +{ + /* ---- Maintain lifetime reset counter ---- */ + if (s_reset_count_magic != RESET_COUNT_MAGIC) { + s_reset_count_magic = RESET_COUNT_MAGIC; + s_reset_count = 0u; + } + s_reset_count++; + + /* ---- Detect brownout via RCC_CSR ---- */ + bool brownout = (RCC->CSR & RCC_CSR_BORRSTF) != 0u; + if (brownout) { + printf("[FAULT] Brownout reset detected (reset_count=%lu)\n", + (unsigned long)s_reset_count); + fault_log_entry_t e; + memset(&e, 0, sizeof(e)); + e.magic = FAULT_LOG_MAGIC; + e.fault_type = (uint8_t)FAULT_BROWNOUT; + e.reset_count = (uint8_t)(s_reset_count & 0xFFu); + _fault_log_write(&e); + _led_start(FAULT_BROWNOUT); + } + + /* ---- Clear all RCC reset source flags ---- */ + RCC->CSR |= RCC_CSR_RMVF; + + /* ---- Check for pending .noinit fault capture ---- */ + if (s_fault_magic == FAULT_SRAM_MAGIC) { + s_fault_magic = 0u; /* consume once */ + + fault_log_entry_t e; + memcpy(&e, (const void *)&s_fault_sram, sizeof(e)); + e.reset_count = (uint8_t)(s_reset_count & 0xFFu); + + /* Print register dump over CDC/UART */ + printf("[FAULT] *** FAULT RECOVERED ***\n"); + printf("[FAULT] type=%u reset_count=%u ts=%lu ms\n", + e.fault_type, e.reset_count, (unsigned long)e.timestamp_ms); + printf("[FAULT] PC=0x%08lX LR=0x%08lX SP=0x%08lX\n", + (unsigned long)e.pc, (unsigned long)e.lr, (unsigned long)e.sp); + printf("[FAULT] R0=0x%08lX R1=0x%08lX R2=0x%08lX R3=0x%08lX\n", + (unsigned long)e.r0, (unsigned long)e.r1, + (unsigned long)e.r2, (unsigned long)e.r3); + printf("[FAULT] CFSR=0x%08lX HFSR=0x%08lX MMFAR=0x%08lX BFAR=0x%08lX\n", + (unsigned long)e.cfsr, (unsigned long)e.hfsr, + (unsigned long)e.mmfar, (unsigned long)e.bfar); + + _fault_log_write(&e); + + FaultType ft = (e.fault_type < (uint8_t)BLINK_TABLE_SIZE) + ? (FaultType)e.fault_type : FAULT_HARDFAULT; + _led_start(ft); + } + + /* ---- Install MPU stack guard & enable fault handlers ---- */ + fault_mpu_guard_init(); +} + +FaultType fault_get_last_type(void) +{ + for (int i = (int)FAULT_LOG_MAX_ENTRIES - 1; i >= 0; i--) { + if (_slot_empty((uint8_t)i)) continue; + const fault_log_entry_t *e = + (const fault_log_entry_t *)_slot_addr((uint8_t)i); + if (e->magic == FAULT_LOG_MAGIC) + return (FaultType)e->fault_type; + } + return FAULT_NONE; +} + +bool fault_log_read(uint8_t idx, fault_log_entry_t *out) +{ + if (idx >= FAULT_LOG_MAX_ENTRIES) return false; + if (_slot_empty(idx)) return false; + const fault_log_entry_t *e = + (const fault_log_entry_t *)_slot_addr(idx); + if (e->magic != FAULT_LOG_MAGIC) return false; + memcpy(out, e, sizeof(*out)); + return true; +} + +uint8_t fault_log_get_count(void) +{ + uint8_t n = 0; + for (uint8_t i = 0; i < FAULT_LOG_MAX_ENTRIES; i++) { + if (!_slot_empty(i)) n++; + } + return n; +} + +void fault_log_clear(void) +{ + float kp, ki, kd; + bool pid_ok = pid_flash_load(&kp, &ki, &kd); + + HAL_FLASH_Unlock(); + _erase_sector7(); + HAL_FLASH_Lock(); + + if (pid_ok) { + pid_flash_save(kp, ki, kd); + } +} + +void fault_assert_impl(const char *file, int line) +{ + (void)file; (void)line; + s_fault_sram.magic = FAULT_LOG_MAGIC; + s_fault_sram.fault_type = (uint8_t)FAULT_ASSERT; + s_fault_sram.timestamp_ms = HAL_GetTick(); + s_fault_sram.pc = (uint32_t)__builtin_return_address(0); + s_fault_sram.lr = 0u; + s_fault_sram.r0 = (uint32_t)(uintptr_t)file; + s_fault_sram.r1 = (uint32_t)line; + s_fault_sram.cfsr = SCB->CFSR; + s_fault_sram.hfsr = 0u; + s_fault_sram.mmfar = 0u; + s_fault_sram.bfar = 0u; + s_fault_sram.sp = 0u; + s_fault_magic = FAULT_SRAM_MAGIC; + NVIC_SystemReset(); +} + +void fault_led_tick(uint32_t now_ms) +{ + if (s_led_fault == FAULT_NONE) return; + + /* Auto-disable after 10 s */ + if ((now_ms - s_led_start) > 10000u) { + s_led_fault = FAULT_NONE; + HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, GPIO_PIN_SET); /* off */ + return; + } + + uint8_t fi = (uint8_t)s_led_fault; + if (fi >= BLINK_TABLE_SIZE) return; + + const LedBlink *b = &s_blink_table[fi]; + if ((now_ms - s_led_last) >= b->period_ms) { + s_led_last = now_ms; + bool on = ((b->pattern >> (15u - s_led_step)) & 1u) != 0u; + /* LED2 is active-low (GPIO_PIN_RESET = lit) */ + HAL_GPIO_WritePin(LED2_PORT, LED2_PIN, + on ? GPIO_PIN_RESET : GPIO_PIN_SET); + s_led_step = (uint8_t)((s_led_step + 1u) % b->steps); + } +} + +/* ================================================================ + * Fault vector hooks + * ================================================================ + * + * Naked entry stubs determine whether the auto-saved stack frame is on + * MSP or PSP (bit 2 of EXC_RETURN in LR), then tail-call the C handler + * with the frame pointer in R0. + * + * Cortex-M auto-pushed stack frame layout (from [SP]): + * [0] R0 [1] R1 [2] R2 [3] R3 + * [4] R12 [5] LR [6] PC [7] xPSR + */ + +static void _capture_and_reset(FaultType type, uint32_t *frame) +{ + s_fault_sram.magic = FAULT_LOG_MAGIC; + s_fault_sram.fault_type = (uint8_t)type; + s_fault_sram.timestamp_ms = HAL_GetTick(); + s_fault_sram.r0 = frame[0]; + s_fault_sram.r1 = frame[1]; + s_fault_sram.r2 = frame[2]; + s_fault_sram.r3 = frame[3]; + /* frame[4] = R12 (unused in log), frame[5] = LR, frame[6] = PC */ + s_fault_sram.lr = frame[5]; + s_fault_sram.pc = frame[6]; + s_fault_sram.sp = (uint32_t)(uintptr_t)(frame + 8); /* SP after push */ + s_fault_sram.cfsr = SCB->CFSR; + s_fault_sram.hfsr = SCB->HFSR; + s_fault_sram.mmfar = SCB->MMFAR; + s_fault_sram.bfar = SCB->BFAR; + s_fault_magic = FAULT_SRAM_MAGIC; + + /* Brief LED flash so a scope can catch it (≈50 ms at 216 MHz) */ + HAL_GPIO_WritePin(LED1_PORT, LED1_PIN, GPIO_PIN_RESET); /* on */ + for (volatile uint32_t i = 0u; i < 10800000u; i++) __NOP(); + + NVIC_SystemReset(); +} + +/* Determine if a MemManage is from stack overflow vs other memory fault */ +static FaultType _mem_fault_type(void) +{ + if ((SCB->CFSR & SCB_CFSR_MMARVALID_Msk) != 0u) { + extern uint32_t __stack_end; + uint32_t guard = (uint32_t)&__stack_end; + if (SCB->MMFAR >= guard && SCB->MMFAR < guard + 32u) + return FAULT_STACK_OVF; + } + return FAULT_MEM_FAULT; +} + +/* C-level handlers — called from naked asm stubs */ +void fault_hard_c(uint32_t *frame) { _capture_and_reset(FAULT_HARDFAULT, frame); } +void fault_mem_c(uint32_t *frame) { _capture_and_reset(_mem_fault_type(), frame); } +void fault_bus_c(uint32_t *frame) { _capture_and_reset(FAULT_BUS_FAULT, frame); } +void fault_usage_c(uint32_t *frame) { _capture_and_reset(FAULT_USAGE_FAULT, frame); } + +/* ---- Naked asm entry stubs ---- */ + +__attribute__((naked)) void HardFault_Handler(void) +{ + __asm volatile ( + "tst lr, #4 \n" /* EXC_RETURN[2]: 0=MSP, 1=PSP */ + "ite eq \n" + "mrseq r0, msp \n" + "mrsne r0, psp \n" + "b fault_hard_c \n" + ); +} + +__attribute__((naked)) void MemManage_Handler(void) +{ + __asm volatile ( + "tst lr, #4 \n" + "ite eq \n" + "mrseq r0, msp \n" + "mrsne r0, psp \n" + "b fault_mem_c \n" + ); +} + +__attribute__((naked)) void BusFault_Handler(void) +{ + __asm volatile ( + "tst lr, #4 \n" + "ite eq \n" + "mrseq r0, msp \n" + "mrsne r0, psp \n" + "b fault_bus_c \n" + ); +} + +__attribute__((naked)) void UsageFault_Handler(void) +{ + __asm volatile ( + "tst lr, #4 \n" + "ite eq \n" + "mrseq r0, msp \n" + "mrsne r0, psp \n" + "b fault_usage_c \n" + ); +} diff --git a/src/jlink.c b/src/jlink.c index fd74845..fba8f08 100644 --- a/src/jlink.c +++ b/src/jlink.c @@ -267,6 +267,10 @@ static void dispatch(const uint8_t *payload, uint8_t cmd, uint8_t plen) } break; + case JLINK_CMD_FAULT_LOG_GET: /* Issue #565: request fault log telemetry */ + jlink_state.fault_log_req = 1u; + break; + default: break; } @@ -536,3 +540,27 @@ void jlink_send_sched_telemetry(const jlink_tlm_sched_t *tlm) jlink_tx_locked(frame, (uint16_t)(3u + plen + 3u)); } + +/* ---- jlink_send_fault_log() -- Issue #565 ---- */ +void jlink_send_fault_log(const jlink_tlm_fault_log_t *fl) +{ + /* + * Frame: [STX][LEN][0x86][20 bytes fault_log][CRC_hi][CRC_lo][ETX] + * Total: 1+1+1+20+2+1 = 26 bytes + */ + static uint8_t frame[26]; + const uint8_t plen = (uint8_t)sizeof(jlink_tlm_fault_log_t); /* 20 */ + const uint8_t len = 1u + plen; /* CMD byte + payload */ + + frame[0] = JLINK_STX; + frame[1] = len; + frame[2] = JLINK_TLM_FAULT_LOG; + memcpy(&frame[3], fl, plen); + + uint16_t crc = crc16_xmodem(&frame[2], len); + frame[3 + plen] = (uint8_t)(crc >> 8); + frame[3 + plen + 1] = (uint8_t)(crc & 0xFFu); + frame[3 + plen + 2] = JLINK_ETX; + + jlink_tx_locked(frame, sizeof(frame)); +} diff --git a/src/main.c b/src/main.c index 4198b80..d880632 100644 --- a/src/main.c +++ b/src/main.c @@ -31,6 +31,7 @@ #include "coulomb_counter.h" #include "watchdog.h" #include "pid_flash.h" +#include "fault_handler.h" #include "servo_bus.h" #include "gimbal.h" #include @@ -132,6 +133,9 @@ int main(void) { HAL_Init(); SystemClock_Config(); + /* Fault recovery handler (Issue #565) — must be first, before safety_init() */ + fault_handler_init(); + /* Detect watchdog reset (Issue #300) — must be before safety_init() */ g_watchdog_reset_detected = watchdog_was_reset_by_watchdog(); @@ -188,6 +192,28 @@ int main(void) { /* Init Jetson serial binary protocol on USART1 (PB6/PB7) at 921600 baud */ jlink_init(); + /* Send fault log summary on boot if a prior fault was recorded (Issue #565) */ + if (fault_get_last_type() != FAULT_NONE) { + fault_log_entry_t fle; + memset(&fle, 0, sizeof(fle)); + jlink_tlm_fault_log_t ftlm; + memset(&ftlm, 0, sizeof(ftlm)); + ftlm.entry_count = fault_log_get_count(); + if (fault_log_read(0u, &fle)) { + ftlm.fault_type = fle.fault_type; + ftlm.reset_count = fle.reset_count; + ftlm.timestamp_ms = fle.timestamp_ms; + ftlm.pc = fle.pc; + ftlm.lr = fle.lr; + ftlm.cfsr = fle.cfsr; + ftlm.hfsr = fle.hfsr; + } + jlink_send_fault_log(&ftlm); + printf("[FAULT] Prior fault type=%u count=%u PC=0x%08lX\n", + (unsigned)ftlm.fault_type, (unsigned)ftlm.entry_count, + (unsigned long)ftlm.pc); + } + /* Init Jetson UART command interface on USART6 (PC6/PC7) at 921600 baud. * Mirrors CDC command protocol over hardware UART (fixes USB CDC TX bug). */ jetson_uart_init(); @@ -276,6 +302,9 @@ int main(void) { /* Advance LED animation sequencer (non-blocking, call every tick) */ led_tick(now); + /* Fault recovery LED blink code (Issue #565; self-disables after 10 s) */ + fault_led_tick(now); + /* Servo pan-tilt animation tick — updates smooth sweeps */ servo_tick(now); @@ -384,6 +413,26 @@ int main(void) { (double)bal.kp, (double)bal.ki, (double)bal.kd); } + /* FAULT_LOG_GET: send fault log telemetry to Jetson (Issue #565) */ + if (jlink_state.fault_log_req) { + jlink_state.fault_log_req = 0u; + fault_log_entry_t fle; + memset(&fle, 0, sizeof(fle)); + jlink_tlm_fault_log_t ftlm; + memset(&ftlm, 0, sizeof(ftlm)); + ftlm.entry_count = fault_log_get_count(); + if (fault_log_read(0u, &fle)) { + ftlm.fault_type = fle.fault_type; + ftlm.reset_count = fle.reset_count; + ftlm.timestamp_ms = fle.timestamp_ms; + ftlm.pc = fle.pc; + ftlm.lr = fle.lr; + ftlm.cfsr = fle.cfsr; + ftlm.hfsr = fle.hfsr; + } + jlink_send_fault_log(&ftlm); + } + /* Power management: CRSF/JLink activity or armed state resets idle timer */ if ((crsf_state.last_rx_ms != 0 && (now - crsf_state.last_rx_ms) < 500) || jlink_is_active(now) ||