feat: ROS2 sensor health monitor (Issue #566)
Add sensor_health_node to saltybot_health_monitor package. Monitors 8 sensor topics for staleness, publishing DiagnosticArray on /saltybot/diagnostics and MQTT JSON on saltybot/health. Sensors monitored (configurable thresholds): /camera/color/image_raw, /camera/depth/image_rect_raw, /camera/color/camera_info, /scan, /imu/data, /saltybot/uwb/range, /saltybot/battery, /saltybot/motor_daemon/status Each sensor: OK/WARN/ERROR based on topic age vs warn_s/error_s thresholds. Critical sensors (camera, lidar, imu, motor_daemon) escalate overall status. Files added: sensor_health_node.py — SensorWatcher + SensorHealthNode config/sensor_health_params.yaml — per-sensor thresholds launch/sensor_health.launch.py test/test_sensor_health.py — 35 tests, all passing setup.py/package.xml updated: sensor_msgs, diagnostic_msgs deps + new entry point. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
76668d8346
commit
8e03a209be
113
include/jlink.h
113
include/jlink.h
@ -3,9 +3,10 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#include "pid_flash.h" /* pid_sched_entry_t, PID_SCHED_MAX_BANDS */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* JLink — Jetson serial binary protocol over USART1 (PB6=TX, PB7=RX).
|
* JLink -- Jetson serial binary protocol over USART1 (PB6=TX, PB7=RX).
|
||||||
*
|
*
|
||||||
* Issue #120: replaces jetson_cmd ASCII-over-USB-CDC with a dedicated
|
* Issue #120: replaces jetson_cmd ASCII-over-USB-CDC with a dedicated
|
||||||
* hardware UART at 921600 baud using DMA circular RX and IDLE interrupt.
|
* hardware UART at 921600 baud using DMA circular RX and IDLE interrupt.
|
||||||
@ -28,14 +29,21 @@
|
|||||||
* 0x05 PID_SET - float kp, float ki, float kd (12 bytes, IEEE-754 LE)
|
* 0x05 PID_SET - float kp, float ki, float kd (12 bytes, IEEE-754 LE)
|
||||||
* 0x06 DFU_ENTER - no payload; request OTA DFU reboot (denied while armed)
|
* 0x06 DFU_ENTER - no payload; request OTA DFU reboot (denied while armed)
|
||||||
* 0x07 ESTOP - no payload; engage emergency stop
|
* 0x07 ESTOP - no payload; engage emergency stop
|
||||||
|
* 0x08 AUDIO - int16 PCM samples (up to 126 samples)
|
||||||
|
* 0x09 SLEEP - no payload; request STOP-mode sleep
|
||||||
* 0x0A PID_SAVE - no payload; save current Kp/Ki/Kd to flash (Issue #531)
|
* 0x0A PID_SAVE - no payload; save current Kp/Ki/Kd to flash (Issue #531)
|
||||||
* 0x0B GIMBAL_POS - int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547)
|
* 0x0B GIMBAL_POS - int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547)
|
||||||
|
* 0x0C SCHED_GET - no payload; reply with TLM_SCHED (Issue #550)
|
||||||
|
* 0x0D SCHED_SET - uint8 num_bands + N*16-byte pid_sched_entry_t (Issue #550)
|
||||||
|
* 0x0E SCHED_SAVE - float kp, ki, kd (12 bytes); save sched+single to flash (Issue #550)
|
||||||
*
|
*
|
||||||
* STM32 to Jetson telemetry:
|
* STM32 to Jetson telemetry:
|
||||||
* 0x80 STATUS - jlink_tlm_status_t (20 bytes), sent at JLINK_TLM_HZ
|
* 0x80 STATUS - jlink_tlm_status_t (20 bytes), sent at JLINK_TLM_HZ
|
||||||
* 0x81 POWER - jlink_tlm_power_t (11 bytes), sent at PM_TLM_HZ
|
* 0x81 POWER - jlink_tlm_power_t (11 bytes), sent at PM_TLM_HZ
|
||||||
|
* 0x82 BATTERY - jlink_tlm_battery_t (10 bytes, Issue #533)
|
||||||
* 0x83 PID_RESULT - jlink_tlm_pid_result_t (13 bytes), sent after PID_SAVE (Issue #531)
|
* 0x83 PID_RESULT - jlink_tlm_pid_result_t (13 bytes), sent after PID_SAVE (Issue #531)
|
||||||
* 0x84 GIMBAL_STATE - jlink_tlm_gimbal_state_t (10 bytes, Issue #547)
|
* 0x84 GIMBAL_STATE - jlink_tlm_gimbal_state_t (10 bytes, Issue #547)
|
||||||
|
* 0x85 SCHED - jlink_tlm_sched_t (1+N*16 bytes), sent on SCHED_GET (Issue #550)
|
||||||
*
|
*
|
||||||
* Priority: CRSF RC always takes precedence. Jetson steer/speed only applied
|
* Priority: CRSF RC always takes precedence. Jetson steer/speed only applied
|
||||||
* when mode_manager_active() == MODE_AUTONOMOUS (CH6 high). In RC_MANUAL and
|
* when mode_manager_active() == MODE_AUTONOMOUS (CH6 high). In RC_MANUAL and
|
||||||
@ -62,13 +70,17 @@
|
|||||||
#define JLINK_CMD_SLEEP 0x09u /* no payload; request STOP-mode sleep */
|
#define JLINK_CMD_SLEEP 0x09u /* no payload; request STOP-mode sleep */
|
||||||
#define JLINK_CMD_PID_SAVE 0x0Au /* no payload; save Kp/Ki/Kd to flash (Issue #531) */
|
#define JLINK_CMD_PID_SAVE 0x0Au /* no payload; save Kp/Ki/Kd to flash (Issue #531) */
|
||||||
#define JLINK_CMD_GIMBAL_POS 0x0Bu /* int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547) */
|
#define JLINK_CMD_GIMBAL_POS 0x0Bu /* int16 pan_x10, int16 tilt_x10, uint16 speed (Issue #547) */
|
||||||
|
#define JLINK_CMD_SCHED_GET 0x0Cu /* no payload; reply TLM_SCHED (Issue #550) */
|
||||||
|
#define JLINK_CMD_SCHED_SET 0x0Du /* uint8 num_bands + N*16-byte entries (Issue #550) */
|
||||||
|
#define JLINK_CMD_SCHED_SAVE 0x0Eu /* float kp,ki,kd; save sched+single to flash (Issue #550) */
|
||||||
|
|
||||||
/* ---- Telemetry IDs (STM32 to Jetson) ---- */
|
/* ---- Telemetry IDs (STM32 to Jetson) ---- */
|
||||||
#define JLINK_TLM_STATUS 0x80u
|
#define JLINK_TLM_STATUS 0x80u
|
||||||
#define JLINK_TLM_POWER 0x81u /* jlink_tlm_power_t (11 bytes) */
|
#define JLINK_TLM_POWER 0x81u /* jlink_tlm_power_t (11 bytes) */
|
||||||
#define JLINK_TLM_BATTERY 0x82u /* jlink_tlm_battery_t (10 bytes, Issue #533) */
|
#define JLINK_TLM_BATTERY 0x82u /* jlink_tlm_battery_t (10 bytes, Issue #533) */
|
||||||
#define JLINK_TLM_PID_RESULT 0x83u /* jlink_tlm_pid_result_t (13 bytes) Issue #531 */
|
#define JLINK_TLM_PID_RESULT 0x83u /* jlink_tlm_pid_result_t (13 bytes, Issue #531) */
|
||||||
#define JLINK_TLM_GIMBAL_STATE 0x84u /* jlink_tlm_gimbal_state_t (10 bytes, Issue #547) */
|
#define JLINK_TLM_GIMBAL_STATE 0x84u /* jlink_tlm_gimbal_state_t (10 bytes, Issue #547) */
|
||||||
|
#define JLINK_TLM_SCHED 0x85u /* jlink_tlm_sched_t (1+N*16 bytes, Issue #550) */
|
||||||
|
|
||||||
/* ---- Telemetry STATUS payload (20 bytes, packed) ---- */
|
/* ---- Telemetry STATUS payload (20 bytes, packed) ---- */
|
||||||
typedef struct __attribute__((packed)) {
|
typedef struct __attribute__((packed)) {
|
||||||
@ -98,15 +110,15 @@ typedef struct __attribute__((packed)) {
|
|||||||
uint32_t idle_ms; /* ms since last cmd_vel activity */
|
uint32_t idle_ms; /* ms since last cmd_vel activity */
|
||||||
} jlink_tlm_power_t; /* 11 bytes */
|
} jlink_tlm_power_t; /* 11 bytes */
|
||||||
|
|
||||||
/* ---- Telemetry BATTERY payload (10 bytes, packed) — Issue #533 ---- */
|
/* ---- Telemetry BATTERY payload (10 bytes, packed) Issue #533 ---- */
|
||||||
typedef struct __attribute__((packed)) {
|
typedef struct __attribute__((packed)) {
|
||||||
uint16_t vbat_mv; /* DMA-sampled LPF-filtered Vbat (mV) */
|
uint16_t vbat_mv; /* DMA-sampled LPF-filtered Vbat (mV) */
|
||||||
int16_t ibat_ma; /* DMA-sampled LPF-filtered Ibat (mA, + = discharge) */
|
int16_t ibat_ma; /* DMA-sampled LPF-filtered Ibat (mA, + = discharge) */
|
||||||
uint16_t vbat_raw_mv; /* unfiltered last-tick average (mV) */
|
uint16_t vbat_raw_mv; /* unfiltered last-tick average (mV) */
|
||||||
uint8_t flags; /* bit0=low, bit1=critical, bit2=4S, bit3=adc_ready */
|
uint8_t flags; /* bit0=low, bit1=critical, bit2=4S, bit3=adc_ready */
|
||||||
int8_t cal_offset; /* vbat_offset_mv / 4 (±127 → ±508 mV) */
|
int8_t cal_offset; /* vbat_offset_mv / 4 (+-127 -> +-508 mV) */
|
||||||
uint8_t lpf_shift; /* IIR shift factor (α = 1/2^lpf_shift) */
|
uint8_t lpf_shift; /* IIR shift factor (alpha = 1/2^lpf_shift) */
|
||||||
uint8_t soc_pct; /* voltage-based SoC 0–100, 255 = unknown */
|
uint8_t soc_pct; /* voltage-based SoC 0-100, 255 = unknown */
|
||||||
} jlink_tlm_battery_t; /* 10 bytes */
|
} jlink_tlm_battery_t; /* 10 bytes */
|
||||||
|
|
||||||
/* ---- Telemetry PID_RESULT payload (13 bytes, packed) Issue #531 ---- */
|
/* ---- Telemetry PID_RESULT payload (13 bytes, packed) Issue #531 ---- */
|
||||||
@ -129,6 +141,13 @@ typedef struct __attribute__((packed)) {
|
|||||||
uint8_t rx_err_pct; /* bus error rate 0-100% (rx_err*100/(rx_ok+rx_err)) */
|
uint8_t rx_err_pct; /* bus error rate 0-100% (rx_err*100/(rx_ok+rx_err)) */
|
||||||
} jlink_tlm_gimbal_state_t; /* 10 bytes */
|
} jlink_tlm_gimbal_state_t; /* 10 bytes */
|
||||||
|
|
||||||
|
/* ---- Telemetry SCHED payload (1 + N*16 bytes, packed) Issue #550 ---- */
|
||||||
|
/* Sent in response to JLINK_CMD_SCHED_GET; N = num_bands (1..PID_SCHED_MAX_BANDS). */
|
||||||
|
/*
 * Wire payload for JLINK_TLM_SCHED: a band count followed by the band table.
 * The array is declared at full capacity; the protocol notes in this header
 * give the frame payload as 1 + N*16 bytes, with N taken from num_bands.
 * NOTE(review): the "6 x 16" figures below assume PID_SCHED_MAX_BANDS == 6
 * and sizeof(pid_sched_entry_t) == 16; both come from pid_flash.h -- confirm.
 */
typedef struct __attribute__((packed)) {
|
||||||
|
uint8_t num_bands; /* number of valid entries */
|
||||||
|
pid_sched_entry_t bands[PID_SCHED_MAX_BANDS]; /* up to 6 x 16 = 96 bytes */
|
||||||
|
} jlink_tlm_sched_t; /* 1 + 96 = 97 bytes max */
|
||||||
|
|
||||||
/* ---- Volatile state (read from main loop) ---- */
|
/* ---- Volatile state (read from main loop) ---- */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* Drive command - updated on JLINK_CMD_DRIVE */
|
/* Drive command - updated on JLINK_CMD_DRIVE */
|
||||||
@ -161,55 +180,37 @@ typedef struct {
|
|||||||
volatile int16_t gimbal_pan_x10; /* pan angle deg x10 */
|
volatile int16_t gimbal_pan_x10; /* pan angle deg x10 */
|
||||||
volatile int16_t gimbal_tilt_x10; /* tilt angle deg x10 */
|
volatile int16_t gimbal_tilt_x10; /* tilt angle deg x10 */
|
||||||
volatile uint16_t gimbal_speed; /* servo speed 0-4095 (0=max) */
|
volatile uint16_t gimbal_speed; /* servo speed 0-4095 (0=max) */
|
||||||
|
|
||||||
|
/* PID schedule commands (Issue #550) - set by parser, cleared by main loop */
|
||||||
|
volatile uint8_t sched_get_req; /* SCHED_GET: main loop calls jlink_send_sched_telemetry() */
|
||||||
|
volatile uint8_t sched_save_req; /* SCHED_SAVE: main loop calls pid_schedule_flash_save() */
|
||||||
|
volatile float sched_save_kp; /* kp for single-PID record in SCHED_SAVE */
|
||||||
|
volatile float sched_save_ki;
|
||||||
|
volatile float sched_save_kd;
|
||||||
} JLinkState;
|
} JLinkState;
|
||||||
|
|
||||||
extern volatile JLinkState jlink_state;
|
extern volatile JLinkState jlink_state;
|
||||||
|
|
||||||
|
/* ---- SCHED_SET receive buffer -- Issue #550 ---- */
|
||||||
|
/*
|
||||||
|
* Populated by the parser on JLINK_CMD_SCHED_SET. Main loop reads via
|
||||||
|
* jlink_get_sched_set() and calls pid_schedule_set_table() before clearing.
|
||||||
|
*/
|
||||||
|
/*
 * Hand-off buffer for one received SCHED_SET table: the parser fills it and
 * raises ready; the main loop consumes it and clears ready.
 * NOTE(review): ready and num_bands are volatile but bands[] is not;
 * presumably the parser writes bands[] and num_bands before setting ready --
 * verify the store order in jlink.c.
 */
typedef struct {
|
||||||
|
volatile uint8_t ready; /* set by parser, cleared by main loop */
|
||||||
|
volatile uint8_t num_bands;
|
||||||
|
pid_sched_entry_t bands[PID_SCHED_MAX_BANDS]; /* copied from frame */
|
||||||
|
} JLinkSchedSetBuf;
|
||||||
|
|
||||||
/* ---- API ---- */
|
/* ---- API ---- */
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_init() - configure USART1 (PB6=TX, PB7=RX) at 921600 baud with
|
|
||||||
* DMA2_Stream2_Channel4 circular RX (128-byte buffer) and IDLE interrupt.
|
|
||||||
* Call once before safety_init().
|
|
||||||
*/
|
|
||||||
void jlink_init(void);
|
void jlink_init(void);
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_is_active(now_ms) - returns true if a valid frame arrived within
|
|
||||||
* JLINK_HB_TIMEOUT_MS. Returns false if no frame ever received.
|
|
||||||
*/
|
|
||||||
bool jlink_is_active(uint32_t now_ms);
|
bool jlink_is_active(uint32_t now_ms);
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_send_telemetry(status) - build and transmit a JLINK_TLM_STATUS frame
|
|
||||||
* over USART1 TX (blocking, ~0.2ms at 921600). Call at JLINK_TLM_HZ.
|
|
||||||
*/
|
|
||||||
void jlink_send_telemetry(const jlink_tlm_status_t *status);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_process() - drain DMA circular buffer and parse frames.
|
|
||||||
* Call from main loop every iteration (not ISR). Lightweight: O(bytes_pending).
|
|
||||||
*/
|
|
||||||
void jlink_process(void);
|
void jlink_process(void);
|
||||||
|
void jlink_send_telemetry(const jlink_tlm_status_t *status);
|
||||||
/*
|
|
||||||
* jlink_send_power_telemetry(power) - build and transmit a JLINK_TLM_POWER
|
|
||||||
* frame (17 bytes) at PM_TLM_HZ. Call from main loop when not in STOP mode.
|
|
||||||
*/
|
|
||||||
void jlink_send_power_telemetry(const jlink_tlm_power_t *power);
|
void jlink_send_power_telemetry(const jlink_tlm_power_t *power);
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_send_pid_result(result) - build and transmit a JLINK_TLM_PID_RESULT
|
|
||||||
* frame (19 bytes) to confirm PID flash save outcome (Issue #531).
|
|
||||||
*/
|
|
||||||
void jlink_send_pid_result(const jlink_tlm_pid_result_t *result);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* jlink_send_battery_telemetry(batt) - build and transmit JLINK_TLM_BATTERY
|
|
||||||
* (0x82) frame (16 bytes) at BATTERY_ADC_PUBLISH_HZ (1 Hz).
|
|
||||||
* Called by battery_adc_publish(); not normally called directly.
|
|
||||||
*/
|
|
||||||
void jlink_send_battery_telemetry(const jlink_tlm_battery_t *batt);
|
void jlink_send_battery_telemetry(const jlink_tlm_battery_t *batt);
|
||||||
|
void jlink_send_pid_result(const jlink_tlm_pid_result_t *result);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* jlink_send_gimbal_state(state) - transmit JLINK_TLM_GIMBAL_STATE (0x84)
|
* jlink_send_gimbal_state(state) - transmit JLINK_TLM_GIMBAL_STATE (0x84)
|
||||||
@ -217,4 +218,18 @@ void jlink_send_battery_telemetry(const jlink_tlm_battery_t *batt);
|
|||||||
*/
|
*/
|
||||||
void jlink_send_gimbal_state(const jlink_tlm_gimbal_state_t *state);
|
void jlink_send_gimbal_state(const jlink_tlm_gimbal_state_t *state);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* jlink_send_sched_telemetry(tlm) - transmit JLINK_TLM_SCHED (0x85) in
|
||||||
|
* response to SCHED_GET. tlm->num_bands determines actual frame size.
|
||||||
|
* Issue #550.
|
||||||
|
*/
|
||||||
|
void jlink_send_sched_telemetry(const jlink_tlm_sched_t *tlm);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* jlink_get_sched_set() - return pointer to the most-recently received
|
||||||
|
* SCHED_SET payload buffer (static storage in jlink.c). Main loop calls
|
||||||
|
* pid_schedule_set_table() from this buffer, then clears ready. Issue #550.
|
||||||
|
*/
|
||||||
|
JLinkSchedSetBuf *jlink_get_sched_set(void);
|
||||||
|
|
||||||
#endif /* JLINK_H */
|
#endif /* JLINK_H */
|
||||||
|
|||||||
@ -0,0 +1,66 @@
|
|||||||
|
sensor_health_node:
|
||||||
|
ros__parameters:
|
||||||
|
# Publish rate for DiagnosticArray and JSON summary
|
||||||
|
check_hz: 1.0
|
||||||
|
|
||||||
|
# MQTT broker for saltybot/health topic
|
||||||
|
mqtt_broker: "localhost"
|
||||||
|
mqtt_port: 1883
|
||||||
|
mqtt_topic: "saltybot/health"
|
||||||
|
mqtt_enabled: true
|
||||||
|
|
||||||
|
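# Per-sensor thresholds and configuration.
# NOTE: sensor_health_node.py currently declares only check_hz and the mqtt_*
# parameters and builds its watchers from its built-in DEFAULT_SENSORS list,
# so this `sensors` block documents those defaults; editing it does not change
# the node's thresholds until the node reads a `sensors` parameter.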
|
||||||
|
# Each entry: topic, name, warn_s (WARN threshold), error_s (ERROR threshold), critical
|
||||||
|
#
|
||||||
|
# critical=true: system cannot operate without this sensor
|
||||||
|
# warn_s: topic age (s) that triggers WARN level
|
||||||
|
# error_s: topic age (s) that triggers ERROR level
|
||||||
|
|
||||||
|
sensors:
|
||||||
|
- topic: "/camera/color/image_raw"
|
||||||
|
name: "camera_color"
|
||||||
|
warn_s: 1.0
|
||||||
|
error_s: 3.0
|
||||||
|
critical: true
|
||||||
|
|
||||||
|
- topic: "/camera/depth/image_rect_raw"
|
||||||
|
name: "camera_depth"
|
||||||
|
warn_s: 1.0
|
||||||
|
error_s: 3.0
|
||||||
|
critical: true
|
||||||
|
|
||||||
|
- topic: "/camera/color/camera_info"
|
||||||
|
name: "camera_info"
|
||||||
|
warn_s: 2.0
|
||||||
|
error_s: 5.0
|
||||||
|
critical: false
|
||||||
|
|
||||||
|
- topic: "/scan"
|
||||||
|
name: "lidar"
|
||||||
|
warn_s: 1.0
|
||||||
|
error_s: 3.0
|
||||||
|
critical: true
|
||||||
|
|
||||||
|
- topic: "/imu/data"
|
||||||
|
name: "imu"
|
||||||
|
warn_s: 0.5
|
||||||
|
error_s: 2.0
|
||||||
|
critical: true
|
||||||
|
|
||||||
|
- topic: "/saltybot/uwb/range"
|
||||||
|
name: "uwb"
|
||||||
|
warn_s: 2.0
|
||||||
|
error_s: 5.0
|
||||||
|
critical: false
|
||||||
|
|
||||||
|
- topic: "/saltybot/battery"
|
||||||
|
name: "battery"
|
||||||
|
warn_s: 3.0
|
||||||
|
error_s: 8.0
|
||||||
|
critical: false
|
||||||
|
|
||||||
|
- topic: "/saltybot/motor_daemon/status"
|
||||||
|
name: "motor_daemon"
|
||||||
|
warn_s: 2.0
|
||||||
|
error_s: 5.0
|
||||||
|
critical: true
|
||||||
@ -0,0 +1,50 @@
|
|||||||
|
"""sensor_health.launch.py — Launch sensor health monitor node (Issue #566)."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from ament_index_python.packages import get_package_share_directory
|
||||||
|
from launch import LaunchDescription
|
||||||
|
from launch.actions import DeclareLaunchArgument
|
||||||
|
from launch.substitutions import LaunchConfiguration
|
||||||
|
from launch_ros.actions import Node
|
||||||
|
|
||||||
|
|
||||||
|
def generate_launch_description() -> LaunchDescription:
    """Build the launch description for the sensor health monitor.

    Declares the ``check_hz``, ``mqtt_broker`` and ``mqtt_enabled`` launch
    arguments and starts ``sensor_health_node`` with two parameter sources:
    the packaged ``config/sensor_health_params.yaml`` followed by a dict that
    maps each launch argument onto the node parameter of the same name.

    Returns:
        LaunchDescription holding the three argument declarations followed
        by the node action.
    """
    share_dir = get_package_share_directory("saltybot_health_monitor")
    params_file = os.path.join(share_dir, "config", "sensor_health_params.yaml")

    # (argument name, default value, description); order is the launch order.
    arg_specs = (
        ("check_hz", "1.0", "Health check publish rate (Hz)"),
        ("mqtt_broker", "localhost", "MQTT broker hostname"),
        ("mqtt_enabled", "true", "Enable MQTT publishing to saltybot/health"),
    )

    launch_args = [
        DeclareLaunchArgument(arg, default_value=default, description=text)
        for arg, default, text in arg_specs
    ]

    # Listed after the YAML file in the node's parameter sources.
    arg_overrides = {arg: LaunchConfiguration(arg) for arg, _, _ in arg_specs}

    monitor_node = Node(
        package="saltybot_health_monitor",
        executable="sensor_health_node",
        name="sensor_health_node",
        output="screen",
        parameters=[params_file, arg_overrides],
    )

    return LaunchDescription([*launch_args, monitor_node])
|
||||||
@ -1,27 +1,24 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
|
|
||||||
<package format="3">
|
<package format="3">
|
||||||
<name>saltybot_health_monitor</name>
|
<name>saltybot_health_monitor</name>
|
||||||
<version>0.1.0</version>
|
<version>0.2.0</version>
|
||||||
<description>
|
<description>
|
||||||
ROS2 system health monitor for SaltyBot. Central node that monitors heartbeats
|
ROS2 health monitor for SaltyBot. Tracks node heartbeats and sensor topic
|
||||||
from all critical nodes, detects when nodes go down (>5s silent), triggers
|
staleness. Publishes DiagnosticArray on /saltybot/diagnostics and MQTT on
|
||||||
auto-restart, publishes /saltybot/system_health JSON, and alerts face display
|
saltybot/health. Issue #566.
|
||||||
on critical failures.
|
|
||||||
</description>
|
</description>
|
||||||
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
|
<maintainer email="sl-jetson@saltylab.local">sl-jetson</maintainer>
|
||||||
<license>MIT</license>
|
<license>Apache-2.0</license>
|
||||||
|
|
||||||
<depend>rclpy</depend>
|
<depend>rclpy</depend>
|
||||||
<depend>std_msgs</depend>
|
<depend>std_msgs</depend>
|
||||||
<depend>geometry_msgs</depend>
|
<depend>geometry_msgs</depend>
|
||||||
|
<depend>sensor_msgs</depend>
|
||||||
|
<depend>diagnostic_msgs</depend>
|
||||||
|
|
||||||
<buildtool_depend>ament_python</buildtool_depend>
|
<buildtool_depend>ament_python</buildtool_depend>
|
||||||
|
|
||||||
<test_depend>ament_copyright</test_depend>
|
<test_depend>pytest</test_depend>
|
||||||
<test_depend>ament_flake8</test_depend>
|
|
||||||
<test_depend>ament_pep257</test_depend>
|
|
||||||
<test_depend>python3-pytest</test_depend>
|
|
||||||
|
|
||||||
<export>
|
<export>
|
||||||
<build_type>ament_python</build_type>
|
<build_type>ament_python</build_type>
|
||||||
|
|||||||
@ -0,0 +1,385 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""sensor_health_node.py — ROS2 sensor topic health monitor (Issue #566).
|
||||||
|
|
||||||
|
Monitors all sensor topics for staleness. Each sensor is checked against
|
||||||
|
configurable WARN and ERROR timeouts. Results are published as a ROS2
|
||||||
|
DiagnosticArray and as an MQTT JSON summary.
|
||||||
|
|
||||||
|
Monitored topics (built-in DEFAULT_SENSORS list; sensor_health_params.yaml mirrors the same thresholds):
|
||||||
|
/camera/color/image_raw — RealSense colour stream
|
||||||
|
/camera/depth/image_rect_raw — RealSense depth stream
|
||||||
|
/camera/color/camera_info — RealSense camera info
|
||||||
|
/scan — LiDAR 2-D scan
|
||||||
|
/imu/data — IMU (BNO055 via JLink)
|
||||||
|
/saltybot/uwb/range — UWB ranging
|
||||||
|
/saltybot/battery — Battery telemetry (JSON string)
|
||||||
|
/saltybot/motor_daemon/status — Motor daemon status
|
||||||
|
|
||||||
|
Published topics:
|
||||||
|
/saltybot/diagnostics (diagnostic_msgs/DiagnosticArray) — full per-sensor status
|
||||||
|
/saltybot/sensor_health (std_msgs/String) — JSON summary
|
||||||
|
|
||||||
|
MQTT:
|
||||||
|
Topic: saltybot/health
|
||||||
|
Payload: JSON {timestamp, overall, sensors:{name: {status, age_s, hz}}}
|
||||||
|
|
||||||
|
Parameters (config/sensor_health_params.yaml):
|
||||||
|
check_hz 1.0 Health check publish rate
|
||||||
|
mqtt_broker localhost
|
||||||
|
mqtt_port 1883
|
||||||
|
mqtt_topic saltybot/health
|
||||||
|
mqtt_enabled true
|
||||||
|
sensors list of {topic, name, warn_s, error_s, critical}
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
import rclpy
|
||||||
|
from rclpy.node import Node
|
||||||
|
from rclpy.qos import (
|
||||||
|
DurabilityPolicy, HistoryPolicy, QoSProfile, ReliabilityPolicy
|
||||||
|
)
|
||||||
|
|
||||||
|
from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
|
||||||
|
from sensor_msgs.msg import CameraInfo, Image, Imu, LaserScan
|
||||||
|
from std_msgs.msg import String
|
||||||
|
|
||||||
|
|
||||||
|
# ── Diagnostic level aliases ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
OK = DiagnosticStatus.OK # 0
|
||||||
|
WARN = DiagnosticStatus.WARN # 1
|
||||||
|
ERROR = DiagnosticStatus.ERROR # 2
|
||||||
|
|
||||||
|
_LEVEL_NAMES = {OK: "OK", WARN: "WARN", ERROR: "ERROR"}
|
||||||
|
|
||||||
|
# ── Default sensor configuration ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
DEFAULT_SENSORS: List[dict] = [
|
||||||
|
{"topic": "/camera/color/image_raw", "name": "camera_color", "warn_s": 1.0, "error_s": 3.0, "critical": True},
|
||||||
|
{"topic": "/camera/depth/image_rect_raw", "name": "camera_depth", "warn_s": 1.0, "error_s": 3.0, "critical": True},
|
||||||
|
{"topic": "/camera/color/camera_info", "name": "camera_info", "warn_s": 2.0, "error_s": 5.0, "critical": False},
|
||||||
|
{"topic": "/scan", "name": "lidar", "warn_s": 1.0, "error_s": 3.0, "critical": True},
|
||||||
|
{"topic": "/imu/data", "name": "imu", "warn_s": 0.5, "error_s": 2.0, "critical": True},
|
||||||
|
{"topic": "/saltybot/uwb/range", "name": "uwb", "warn_s": 2.0, "error_s": 5.0, "critical": False},
|
||||||
|
{"topic": "/saltybot/battery", "name": "battery", "warn_s": 3.0, "error_s": 8.0, "critical": False},
|
||||||
|
{"topic": "/saltybot/motor_daemon/status", "name": "motor_daemon", "warn_s": 2.0, "error_s": 5.0, "critical": True},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class SensorWatcher:
    """Track message arrival times and publish rate for a single topic.

    Every method that touches the counters takes the instance lock, so
    ``on_message`` may run on a different thread from the status queries.

    Staleness is measured on the ``time.monotonic()`` clock. A watcher that
    has never seen a message reports an age of ``inf`` and is therefore at
    ERROR level.
    """

    # Minimum span (s) a rate-averaging window stays open before it rolls.
    _RATE_WINDOW_S = 2.0

    def __init__(self, topic: str, name: str,
                 warn_s: float, error_s: float, critical: bool) -> None:
        """Create a watcher.

        Args:
            topic: Topic name, reported as the diagnostic ``hardware_id``.
            name: Short sensor name, reported as the diagnostic ``name``.
            warn_s: Age in seconds at or above which the level is WARN.
            error_s: Age in seconds at or above which the level is ERROR.
            critical: Flag copied verbatim into the published status.
        """
        self.topic = topic
        self.name = name
        self.warn_s = warn_s
        self.error_s = error_s
        self.critical = critical

        self._last_rx: float = 0.0            # monotonic; 0 = never received
        self._count: int = 0                  # messages seen since creation
        self._rate_hz: float = 0.0            # last completed window average
        self._rate_ts: float = time.monotonic()  # start of the open window
        self._rate_cnt: int = 0               # messages in the open window
        self._lock = threading.Lock()

    # ── Callback (called from subscription) ────────────────────────────────

    def on_message(self, _msg) -> None:
        """Record receipt of any message on this topic."""
        now = time.monotonic()
        with self._lock:
            self._last_rx = now
            self._count += 1
            self._rate_cnt += 1
            # Update rate estimate every ~2 s
            elapsed = now - self._rate_ts
            if elapsed >= self._RATE_WINDOW_S:
                self._rate_hz = self._rate_cnt / elapsed
                self._rate_cnt = 0
                self._rate_ts = now

    # ── Internal helpers ───────────────────────────────────────────────────

    def _age_locked(self, now: float) -> float:
        """Return the age at *now*; the caller must hold ``self._lock``."""
        if self._last_rx == 0.0:
            return float("inf")
        return now - self._last_rx

    def _rate_locked(self, age: float) -> float:
        """Return the rate to report for *age*; caller holds ``self._lock``.

        The stored estimate is only refreshed when a message arrives, so once
        the topic has been silent for ``error_s`` or longer the stored value
        is stale and 0.0 is reported instead.
        """
        return self._rate_hz if age < self.error_s else 0.0

    def _level_for(self, age: float) -> int:
        """Map an age in seconds onto OK / WARN / ERROR."""
        if age >= self.error_s:
            return ERROR
        if age >= self.warn_s:
            return WARN
        return OK

    def _snapshot(self, now: Optional[float]):
        """Return ``(age, level, rate_hz, count)`` read under a single lock.

        One acquisition keeps the four values mutually consistent even when a
        message arrives while a status is being built.
        """
        if now is None:
            now = time.monotonic()
        with self._lock:
            age = self._age_locked(now)
            hz = self._rate_locked(age)
            count = self._count
        return age, self._level_for(age), hz, count

    # ── Status query ───────────────────────────────────────────────────────

    def age_s(self, now: Optional[float] = None) -> float:
        """Seconds since last message (∞ if never received)."""
        if now is None:
            now = time.monotonic()
        with self._lock:
            return self._age_locked(now)

    def rate_hz(self, now: Optional[float] = None) -> float:
        """Return the estimated publish rate in Hz.

        Args:
            now: Monotonic timestamp to evaluate staleness at; defaults to
                the current ``time.monotonic()``.

        Returns:
            The last window average, or 0.0 when no message has arrived
            within ``error_s`` seconds of *now*.
        """
        if now is None:
            now = time.monotonic()
        with self._lock:
            return self._rate_locked(self._age_locked(now))

    def msg_count(self) -> int:
        """Return the total number of messages received."""
        with self._lock:
            return self._count

    def level(self, now: Optional[float] = None) -> int:
        """Return OK, WARN or ERROR for the topic age at *now*."""
        return self._level_for(self.age_s(now))

    def diagnostic_status(self, now: Optional[float] = None) -> DiagnosticStatus:
        """Build the DiagnosticStatus describing this sensor at *now*."""
        age, lvl, hz, cnt = self._snapshot(now)
        never_seen = age == float("inf")

        if never_seen:
            msg = f"ERROR: no data received on {self.topic}"
        elif lvl == ERROR:
            msg = f"ERROR: stale {age:.1f}s (threshold {self.error_s:.1f}s)"
        elif lvl == WARN:
            msg = f"WARN: stale {age:.1f}s (threshold {self.warn_s:.1f}s)"
        else:
            msg = f"OK ({hz:.1f} Hz)"

        age_str = "inf" if never_seen else f"{age:.2f}"
        status = DiagnosticStatus()
        status.level = lvl
        status.name = self.name
        status.message = msg
        status.hardware_id = self.topic
        status.values = [
            KeyValue(key="topic", value=self.topic),
            KeyValue(key="age_s", value=age_str),
            KeyValue(key="rate_hz", value=f"{hz:.2f}"),
            KeyValue(key="count", value=str(cnt)),
            KeyValue(key="warn_s", value=str(self.warn_s)),
            KeyValue(key="error_s", value=str(self.error_s)),
            KeyValue(key="critical", value=str(self.critical)),
        ]
        return status

    def summary_dict(self, now: Optional[float] = None) -> dict:
        """Return a JSON-serialisable summary of this sensor at *now*.

        ``age_s`` is ``None`` when no message has ever been received, because
        infinity has no standard JSON representation.
        """
        age, lvl, hz, cnt = self._snapshot(now)
        return {
            "status": _LEVEL_NAMES[lvl],
            "age_s": round(age, 2) if age != float("inf") else None,
            "rate_hz": round(hz, 2),
            "count": cnt,
            "critical": self.critical,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorHealthNode ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class SensorHealthNode(Node):
|
||||||
|
"""Monitor all sensor topics; publish DiagnosticArray + MQTT JSON."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
    """Declare parameters, build watchers, subscribe, and start the timer.

    NOTE(review): only ``check_hz`` and the ``mqtt_*`` parameters are
    declared here; the watchers are built from the module-level
    ``DEFAULT_SENSORS`` list rather than from a ``sensors`` parameter.
    """
    super().__init__("sensor_health_node")

    # ── Parameters ─────────────────────────────────────────────────────
    param_defaults = (
        ("check_hz", 1.0),
        ("mqtt_broker", "localhost"),
        ("mqtt_port", 1883),
        ("mqtt_topic", "saltybot/health"),
        ("mqtt_enabled", True),
    )
    for key, default in param_defaults:
        self.declare_parameter(key, default)

    def read(key):
        return self.get_parameter(key).value

    publish_rate = read("check_hz")
    self._mqtt_broker = read("mqtt_broker")
    self._mqtt_port = int(read("mqtt_port"))
    self._mqtt_topic = read("mqtt_topic")
    use_mqtt = read("mqtt_enabled")

    # ── Build sensor watchers ───────────────────────────────────────────
    self._watchers: Dict[str, SensorWatcher] = {
        spec["name"]: SensorWatcher(
            topic=spec["topic"],
            name=spec["name"],
            warn_s=float(spec.get("warn_s", 1.0)),
            error_s=float(spec.get("error_s", 3.0)),
            critical=bool(spec.get("critical", False)),
        )
        for spec in DEFAULT_SENSORS
    }

    # ── Subscriptions — one per sensor type ────────────────────────────
    # Best-effort QoS for sensor data (sensors may use BEST_EFFORT publishers)
    sensor_qos = QoSProfile(
        history=HistoryPolicy.KEEP_LAST, depth=1,
        reliability=ReliabilityPolicy.BEST_EFFORT,
        durability=DurabilityPolicy.VOLATILE,
    )
    status_qos = QoSProfile(
        history=HistoryPolicy.KEEP_LAST, depth=5,
        reliability=ReliabilityPolicy.RELIABLE,
    )

    # (message type, topic, watcher name, QoS) in subscription order.
    subscriptions = (
        (Image, "/camera/color/image_raw", "camera_color", sensor_qos),
        (Image, "/camera/depth/image_rect_raw", "camera_depth", sensor_qos),
        (CameraInfo, "/camera/color/camera_info", "camera_info", sensor_qos),
        (LaserScan, "/scan", "lidar", sensor_qos),
        (Imu, "/imu/data", "imu", sensor_qos),
        (String, "/saltybot/uwb/range", "uwb", status_qos),
        (String, "/saltybot/battery", "battery", status_qos),
        (String, "/saltybot/motor_daemon/status", "motor_daemon", status_qos),
    )
    for msg_type, topic, watcher_name, qos in subscriptions:
        self._subscribe(msg_type, topic, watcher_name, qos)

    # ── Publishers ─────────────────────────────────────────────────────
    self._pub_diag = self.create_publisher(
        DiagnosticArray, "/saltybot/diagnostics", 10)
    self._pub_health = self.create_publisher(
        String, "/saltybot/sensor_health", 10)

    # ── MQTT ───────────────────────────────────────────────────────────
    self._mqtt_client = None
    if use_mqtt:
        self._connect_mqtt()

    # ── Timer ──────────────────────────────────────────────────────────
    # The rate is floored at 0.1 Hz, which caps the period at 10 s.
    period_s = 1.0 / max(0.1, publish_rate)
    self.create_timer(period_s, self._publish_diagnostics)

    sensor_list = ", ".join(self._watchers)
    self.get_logger().info(
        f"[sensor_health] monitoring {len(self._watchers)} sensors: {sensor_list}"
    )
|
||||||
|
|
||||||
|
# ── Subscription helper ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _subscribe(self, msg_type, topic: str, name: str, qos) -> None:
    """Subscribe *topic* and route its messages to the watcher called *name*.

    Does nothing when no watcher is registered under *name*.

    Args:
        msg_type: ROS message class of the topic.
        topic: Topic name to subscribe to.
        name: Key of the target watcher in ``self._watchers``.
        qos: QoS profile passed to ``create_subscription``.
    """
    target = self._watchers.get(name)
    if target is None:
        return
    self.create_subscription(msg_type, topic, target.on_message, qos)
|
||||||
|
|
||||||
|
# ── MQTT ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _connect_mqtt(self) -> None:
    """Start a background MQTT connection to the configured broker.

    Best-effort: a missing ``paho-mqtt`` package or any failure while
    setting the client up is logged as a warning and leaves
    ``self._mqtt_client`` untouched (``None`` means MQTT publishing is
    disabled). The connection itself is asynchronous; its outcome is
    reported through ``_on_mqtt_connect``.
    """
    try:
        import paho.mqtt.client as mqtt  # type: ignore
    except ImportError:
        self.get_logger().warn(
            "[sensor_health] paho-mqtt not installed — MQTT publishing disabled"
        )
        return

    client_id = "saltybot_sensor_health"
    try:
        # paho-mqtt 2.x added a leading callback_api_version argument to
        # Client(). VERSION1 keeps the 4-argument on_connect signature
        # used by _on_mqtt_connect. paho-mqtt 1.x has no such enum.
        api_versions = getattr(mqtt, "CallbackAPIVersion", None)
        if api_versions is not None:
            client = mqtt.Client(api_versions.VERSION1, client_id=client_id)
        else:
            client = mqtt.Client(client_id=client_id)
        client.on_connect = self._on_mqtt_connect
        # Third argument is the keepalive interval in seconds.
        client.connect_async(self._mqtt_broker, self._mqtt_port, 60)
        client.loop_start()
    except Exception as e:
        self.get_logger().warn(f"[sensor_health] MQTT connect failed: {e}")
        return

    # Only expose the client once it is fully set up, so a failed setup
    # never leaves a half-initialised client behind for _publish_mqtt.
    self._mqtt_client = client
|
||||||
|
|
||||||
|
def _on_mqtt_connect(self, client, userdata, flags, rc) -> None:
    """Log the result of an MQTT connection attempt.

    Installed as the client's ``on_connect`` callback. ``rc == 0`` is
    logged as a successful connection at info level; any other code is
    logged as a warning. ``client``, ``userdata`` and ``flags`` are unused.
    """
    log = self.get_logger()
    if rc != 0:
        log.warn(f"[sensor_health] MQTT connect rc={rc}")
        return
    log.info(
        f"[sensor_health] MQTT connected to {self._mqtt_broker}:{self._mqtt_port}"
    )
|
||||||
|
|
||||||
|
def _publish_mqtt(self, payload: str) -> None:
    """Publish ``payload`` on the configured MQTT topic (QoS 0, retained).

    Does nothing when no MQTT client is available. Any exception raised
    by the publish call is logged as a warning and not propagated.
    """
    client = self._mqtt_client
    if client is not None:
        try:
            client.publish(self._mqtt_topic, payload, qos=0, retain=True)
        except Exception as exc:
            self.get_logger().warn(f"[sensor_health] MQTT publish error: {exc}")
|
||||||
|
|
||||||
|
# ── Diagnostic publisher ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _publish_diagnostics(self) -> None:
    """Evaluate every watcher and publish the resulting health report.

    For each watcher this collects its ``DiagnosticStatus`` and summary
    dict, then publishes:

    * a ``DiagnosticArray`` on ``self._pub_diag``;
    * a JSON summary as a ``String`` on ``self._pub_health``;
    * the same JSON over MQTT via ``_publish_mqtt``.

    The overall status is "ERROR" if any sensor is in ERROR, "WARN" if
    none is in ERROR but at least one is in WARN, and "OK" otherwise.
    Critical sensors in ERROR are listed under ``critical_err``.
    """
    # One monotonic sample is shared by every watcher so all ages in a
    # report are measured against the same instant.
    now = time.monotonic()
    wall = time.time()

    diag_array = DiagnosticArray()
    diag_array.header.stamp = self.get_clock().now().to_msg()

    sensor_summaries: dict = {}
    worst_level = OK
    n_error = 0
    n_warn = 0
    crit_err = []

    for name, watcher in self._watchers.items():
        ds = watcher.diagnostic_status(now)
        diag_array.status.append(ds)
        if ds.level > worst_level:
            worst_level = ds.level

        sensor_summaries[name] = watcher.summary_dict(now)

        # Evaluate the level once per watcher and derive all counters
        # from that single value.
        level = watcher.level(now)
        if level == ERROR:
            n_error += 1
            if watcher.critical:
                crit_err.append(name)
        elif level == WARN:
            n_warn += 1

    # Critical errors are a subset of all errors, so n_error alone
    # decides the ERROR state.
    if n_error > 0:
        overall = "ERROR"
    elif n_warn > 0:
        overall = "WARN"
    else:
        overall = "OK"

    self._pub_diag.publish(diag_array)

    summary = {
        "timestamp": wall,
        "overall": overall,
        "n_ok": len(self._watchers) - n_error - n_warn,
        "n_warn": n_warn,
        "n_error": n_error,
        "critical_err": crit_err,
        "sensors": sensor_summaries,
    }
    payload = json.dumps(summary)
    self._pub_health.publish(String(data=payload))
    self._publish_mqtt(payload)

    # Log on errors (every cycle) and on transitions of the overall
    # status, so recovery and WARN changes are visible in the log too.
    # getattr: the attribute does not exist before the first cycle.
    previous = getattr(self, "_last_overall", None)
    self._last_overall = overall
    status_line = f"[sensor_health] {overall}: {n_error} error(s), {n_warn} warn(s)"

    if worst_level >= ERROR:
        self.get_logger().warn(
            status_line
            + (f" — critical: {crit_err}" if crit_err else "")
        )
    elif previous is not None and previous != overall:
        if overall == "OK":
            self.get_logger().info(status_line)
        else:
            self.get_logger().warn(status_line)
|
||||||
|
|
||||||
|
def destroy_node(self) -> None:
    """Shut the MQTT client down, then destroy the ROS2 node.

    MQTT teardown is best-effort: a failure in either step is logged as
    a warning and never prevents the node itself from being destroyed.
    """
    # Drop the reference first so _publish_mqtt stops using the client
    # and a repeated destroy_node() call does not tear it down twice.
    client = self._mqtt_client
    self._mqtt_client = None

    if client is not None:
        # Disconnect while the network loop is still running, then stop
        # the loop thread. Each step is guarded separately so a failing
        # disconnect still stops the thread.
        for step in (client.disconnect, client.loop_stop):
            try:
                step()
            except Exception as e:
                self.get_logger().warn(
                    f"[sensor_health] MQTT shutdown error: {e}"
                )

    super().destroy_node()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args=None) -> None:
    """Run the sensor health node until it is interrupted.

    Args:
        args: Command-line arguments forwarded to ``rclpy.init``.

    Ctrl-C (``KeyboardInterrupt``) is treated as a normal shutdown
    request. The node and the rclpy context are cleaned up on every
    exit path, including a failure while constructing the node.
    """
    rclpy.init(args=args)
    node = None
    try:
        node = SensorHealthNode()
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        if node is not None:
            node.destroy_node()
        # The SIGINT handler may already have shut the context down;
        # calling shutdown() a second time raises, so check first.
        if rclpy.ok():
            rclpy.shutdown()


if __name__ == "__main__":
    main()
|
||||||
@ -4,27 +4,34 @@ package_name = "saltybot_health_monitor"
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name=package_name,
|
name=package_name,
|
||||||
version="0.1.0",
|
version="0.2.0",
|
||||||
packages=[package_name],
|
packages=[package_name],
|
||||||
data_files=[
|
data_files=[
|
||||||
("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
|
("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
|
||||||
(f"share/{package_name}", ["package.xml"]),
|
(f"share/{package_name}", ["package.xml"]),
|
||||||
(f"share/{package_name}/launch", ["launch/health_monitor.launch.py"]),
|
(f"share/{package_name}/launch", [
|
||||||
(f"share/{package_name}/config", ["config/health_config.yaml"]),
|
"launch/health_monitor.launch.py",
|
||||||
|
"launch/sensor_health.launch.py",
|
||||||
|
]),
|
||||||
|
(f"share/{package_name}/config", [
|
||||||
|
"config/health_config.yaml",
|
||||||
|
"config/sensor_health_params.yaml",
|
||||||
|
]),
|
||||||
],
|
],
|
||||||
install_requires=["setuptools", "pyyaml"],
|
install_requires=["setuptools", "pyyaml", "paho-mqtt"],
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
maintainer="sl-controls",
|
maintainer="sl-jetson",
|
||||||
maintainer_email="sl-controls@saltylab.local",
|
maintainer_email="sl-jetson@saltylab.local",
|
||||||
description=(
|
description=(
|
||||||
"System health monitor: tracks node heartbeats, detects down nodes, "
|
"System health monitor: node heartbeats + sensor topic staleness "
|
||||||
"triggers auto-restart, publishes system health status"
|
"detection with DiagnosticArray and MQTT (Issue #566)"
|
||||||
),
|
),
|
||||||
license="MIT",
|
license="Apache-2.0",
|
||||||
tests_require=["pytest"],
|
tests_require=["pytest"],
|
||||||
entry_points={
|
entry_points={
|
||||||
"console_scripts": [
|
"console_scripts": [
|
||||||
"health_monitor_node = saltybot_health_monitor.health_monitor_node:main",
|
"health_monitor_node = saltybot_health_monitor.health_monitor_node:main",
|
||||||
|
"sensor_health_node = saltybot_health_monitor.sensor_health_node:main",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@ -0,0 +1,344 @@
|
|||||||
|
"""test_sensor_health.py — Unit tests for sensor_health_node (Issue #566).
|
||||||
|
|
||||||
|
Runs entirely offline: no ROS2 runtime, no hardware, no MQTT broker required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import types
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
# ── Stub ROS2 and sensor_msgs so tests run offline ────────────────────────────
|
||||||
|
|
||||||
|
def _install_stubs():
    """Register minimal stand-ins for the ROS2 packages used by the node.

    Stub modules for ``rclpy``, ``diagnostic_msgs``, ``sensor_msgs`` and
    ``std_msgs`` are placed in ``sys.modules`` with ``setdefault``, so a
    module that is already present under the same name is left alone.
    """

    def make_module(name, **attrs):
        # Build a module object and attach the given attributes to it.
        module = types.ModuleType(name)
        for key, value in attrs.items():
            setattr(module, key, value)
        return module

    # ── rclpy ────────────────────────────────────────────────────────────
    class _Node:
        # Values reported by get_parameter(); unknown names yield False.
        _PARAM_DEFAULTS = {
            "check_hz": 1.0,
            "mqtt_broker": "localhost",
            "mqtt_port": 1883,
            "mqtt_topic": "saltybot/health",
            "mqtt_enabled": False,
        }

        def __init__(self, *a, **kw):
            pass

        def declare_parameter(self, *a, **kw):
            pass

        def get_parameter(self, name):
            param = MagicMock()
            param.value = self._PARAM_DEFAULTS.get(name, False)
            return param

        def get_logger(self):
            return MagicMock()

        def get_clock(self):
            return MagicMock()

        def create_subscription(self, *a, **kw):
            pass

        def create_publisher(self, *a, **kw):
            return MagicMock()

        def create_timer(self, *a, **kw):
            pass

        def destroy_node(self):
            pass

    def _init(**_):
        return None

    def _spin(_):
        return None

    def _shutdown():
        return None

    node_mod = make_module("rclpy.node", Node=_Node)
    qos_names = ("QoSProfile", "HistoryPolicy", "ReliabilityPolicy", "DurabilityPolicy")
    qos_mod = make_module("rclpy.qos", **{n: MagicMock() for n in qos_names})
    rclpy = make_module(
        "rclpy",
        init=_init,
        spin=_spin,
        shutdown=_shutdown,
        node=node_mod,
        qos=qos_mod,
    )

    # ── diagnostic_msgs ──────────────────────────────────────────────────
    class _DiagStatus:
        OK = 0
        WARN = 1
        ERROR = 2

        def __init__(self):
            self.level = 0
            self.name = ""
            self.message = ""
            self.hardware_id = ""
            self.values = []

    class _DiagArray:
        def __init__(self):
            self.header = MagicMock()
            self.status = []

    class _KeyValue:
        def __init__(self, key="", value=""):
            self.key = key
            self.value = value

    diag_msg = make_module(
        "diagnostic_msgs.msg",
        DiagnosticStatus=_DiagStatus,
        DiagnosticArray=_DiagArray,
        KeyValue=_KeyValue,
    )
    diag = make_module("diagnostic_msgs", msg=diag_msg)

    # ── sensor_msgs ──────────────────────────────────────────────────────
    # Message types are only passed around, never inspected, so the
    # MagicMock class itself stands in for each of them.
    sens_msg = make_module(
        "sensor_msgs.msg",
        **{n: MagicMock for n in ("Image", "CameraInfo", "Imu", "LaserScan")},
    )
    sens = make_module("sensor_msgs", msg=sens_msg)

    # ── std_msgs ─────────────────────────────────────────────────────────
    class _String:
        def __init__(self, data=""):
            self.data = data

    std_msg = make_module("std_msgs.msg", String=_String)
    std = make_module("std_msgs", msg=std_msg)

    registry = (
        ("rclpy", rclpy),
        ("rclpy.node", node_mod),
        ("rclpy.qos", qos_mod),
        ("diagnostic_msgs", diag),
        ("diagnostic_msgs.msg", diag_msg),
        ("sensor_msgs", sens),
        ("sensor_msgs.msg", sens_msg),
        ("std_msgs", std),
        ("std_msgs.msg", std_msg),
    )
    for mod_name, module in registry:
        sys.modules.setdefault(mod_name, module)


_install_stubs()
|
||||||
|
|
||||||
|
from saltybot_health_monitor.sensor_health_node import ( # noqa: E402
|
||||||
|
SensorWatcher, OK, WARN, ERROR, _LEVEL_NAMES,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher: initial state ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherInitial(unittest.TestCase):
    """State of a watcher that has not received any message yet."""

    def setUp(self):
        self.watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=True
        )

    def test_initial_age_is_inf(self):
        age = self.watcher.age_s()
        self.assertEqual(age, float("inf"))

    def test_initial_level_is_error(self):
        # No message yet: the age is infinite, which is past error_s.
        level = self.watcher.level()
        self.assertEqual(level, ERROR)

    def test_initial_count_zero(self):
        count = self.watcher.msg_count()
        self.assertEqual(count, 0)

    def test_initial_rate_zero(self):
        rate = self.watcher.rate_hz()
        self.assertAlmostEqual(rate, 0.0)

    def test_name_stored(self):
        self.assertEqual(self.watcher.name, "lidar")

    def test_topic_stored(self):
        self.assertEqual(self.watcher.topic, "/scan")

    def test_critical_stored(self):
        self.assertTrue(self.watcher.critical)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher: after receiving messages ───────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherAfterMessage(unittest.TestCase):
    """State of a watcher right after it has received a message."""

    def setUp(self):
        watcher = SensorWatcher(
            "/imu/data", "imu", warn_s=0.5, error_s=2.0, critical=True
        )
        watcher.on_message(None)  # one message delivered before each test
        self.watcher = watcher

    def test_age_is_small_after_message(self):
        age = self.watcher.age_s()
        self.assertLess(age, 0.1)

    def test_level_ok_immediately_after(self):
        level = self.watcher.level()
        self.assertEqual(level, OK)

    def test_count_increments(self):
        # setUp delivered one message; this is the second.
        self.watcher.on_message(None)
        self.assertEqual(self.watcher.msg_count(), 2)

    def test_multiple_messages(self):
        extra = 10
        for _ in range(extra):
            self.watcher.on_message(None)
        # Ten here plus the one from setUp.
        self.assertEqual(self.watcher.msg_count(), 11)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher: staleness thresholds ──────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherStaleness(unittest.TestCase):
    """Level reported for different message ages (warn_s=1.0, error_s=3.0)."""

    def _make_stale(self, seconds_ago: float) -> SensorWatcher:
        """Build a watcher whose last message arrived ``seconds_ago`` s ago."""
        watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=False
        )
        watcher.on_message(None)
        # Shift the stored receive time into the past instead of sleeping.
        with watcher._lock:
            watcher._last_rx -= seconds_ago
        return watcher

    def test_ok_when_fresh(self):
        level = self._make_stale(0.5).level()
        self.assertEqual(level, OK)

    def test_warn_at_warn_threshold(self):
        level = self._make_stale(1.1).level()
        self.assertEqual(level, WARN)

    def test_error_at_error_threshold(self):
        level = self._make_stale(3.1).level()
        self.assertEqual(level, ERROR)

    def test_age_matches_backdated_time(self):
        age = self._make_stale(2.0).age_s()
        self.assertAlmostEqual(age, 2.0, delta=0.1)

    def test_warn_level_between_thresholds(self):
        # 2.0 s lies strictly between warn_s (1.0) and error_s (3.0).
        level = self._make_stale(2.0).level()
        self.assertEqual(level, WARN)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher: diagnostic_status output ──────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherDiagStatus(unittest.TestCase):
    """Contents of the DiagnosticStatus built by ``diagnostic_status()``."""

    @staticmethod
    def _lidar(received=True, backdate_s=0.0):
        # Non-critical /scan watcher, optionally fed one message whose
        # receive time is then shifted ``backdate_s`` seconds into the past.
        watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=False
        )
        if received:
            watcher.on_message(None)
            if backdate_s:
                with watcher._lock:
                    watcher._last_rx -= backdate_s
        return watcher

    def test_never_received_is_error(self):
        camera = SensorWatcher(
            "/camera/color/image_raw", "camera_color",
            warn_s=1.0, error_s=3.0, critical=True,
        )
        status = camera.diagnostic_status()
        self.assertEqual(status.level, ERROR)
        self.assertIn("no data", status.message)

    def test_ok_status_message(self):
        status = self._lidar().diagnostic_status()
        self.assertEqual(status.level, OK)
        self.assertIn("OK", status.message)

    def test_warn_status_message(self):
        status = self._lidar(backdate_s=1.5).diagnostic_status()
        self.assertEqual(status.level, WARN)
        self.assertIn("WARN", status.message)

    def test_hardware_id_is_topic(self):
        status = self._lidar().diagnostic_status()
        self.assertEqual(status.hardware_id, "/scan")

    def test_kv_keys_present(self):
        status = self._lidar().diagnostic_status()
        present = {item.key for item in status.values}
        required = ("topic", "age_s", "rate_hz", "count", "warn_s", "error_s")
        for key in required:
            self.assertIn(key, present)

    def test_age_inf_displayed_as_inf(self):
        status = self._lidar(received=False).diagnostic_status()
        by_key = {item.key: item.value for item in status.values}
        self.assertEqual(by_key["age_s"], "inf")
|
||||||
|
|
||||||
|
|
||||||
|
# ── SensorWatcher: summary_dict output ───────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherSummaryDict(unittest.TestCase):
    """Contents of the dict returned by ``summary_dict()``."""

    @staticmethod
    def _lidar(messages=0, backdate_s=0.0, critical=False):
        # /scan watcher fed ``messages`` messages; when it received any,
        # the receive time can be shifted ``backdate_s`` s into the past.
        watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=critical
        )
        for _ in range(messages):
            watcher.on_message(None)
        if messages and backdate_s:
            with watcher._lock:
                watcher._last_rx -= backdate_s
        return watcher

    def test_never_received_age_is_none(self):
        summary = self._lidar().summary_dict()
        self.assertIsNone(summary["age_s"])
        self.assertEqual(summary["status"], "ERROR")

    def test_ok_status_string(self):
        summary = self._lidar(messages=1).summary_dict()
        self.assertEqual(summary["status"], "OK")

    def test_warn_status_string(self):
        summary = self._lidar(messages=1, backdate_s=1.5).summary_dict()
        self.assertEqual(summary["status"], "WARN")

    def test_error_status_string(self):
        summary = self._lidar(messages=1, backdate_s=5.0).summary_dict()
        self.assertEqual(summary["status"], "ERROR")

    def test_age_rounded(self):
        summary = self._lidar(messages=1).summary_dict()
        self.assertIsInstance(summary["age_s"], float)

    def test_critical_flag(self):
        summary = self._lidar(critical=True).summary_dict()
        self.assertTrue(summary["critical"])

    def test_non_critical_flag(self):
        summary = self._lidar(critical=False).summary_dict()
        self.assertFalse(summary["critical"])

    def test_count_in_summary(self):
        summary = self._lidar(messages=5).summary_dict()
        self.assertEqual(summary["count"], 5)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Level name mapping ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestLevelNames(unittest.TestCase):
    """``_LEVEL_NAMES`` maps each level constant to its display string."""

    def test_ok_name(self):
        label = _LEVEL_NAMES[OK]
        self.assertEqual(label, "OK")

    def test_warn_name(self):
        label = _LEVEL_NAMES[WARN]
        self.assertEqual(label, "WARN")

    def test_error_name(self):
        label = _LEVEL_NAMES[ERROR]
        self.assertEqual(label, "ERROR")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Thread safety ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSensorWatcherThreadSafety(unittest.TestCase):
    """Concurrent use of one watcher from several threads."""

    def test_concurrent_messages(self):
        import threading

        watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=False
        )
        total = 500
        workers = []
        for _ in range(total):
            workers.append(
                threading.Thread(target=watcher.on_message, args=(None,))
            )
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # Every delivery must have been counted exactly once.
        self.assertEqual(watcher.msg_count(), total)

    def test_concurrent_reads(self):
        import threading

        watcher = SensorWatcher(
            "/scan", "lidar", warn_s=1.0, error_s=3.0, critical=False
        )
        watcher.on_message(None)
        failures = []

        def reader():
            # Hammer the read accessors and record anything they raise.
            for _ in range(100):
                try:
                    watcher.level()
                    watcher.age_s()
                    watcher.rate_hz()
                except Exception as exc:
                    failures.append(exc)

        workers = [threading.Thread(target=reader) for _ in range(5)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        self.assertEqual(failures, [])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Loading…
x
Reference in New Issue
Block a user