Merge pull request 'feat: system health monitor (Issue #408)' (#439) from sl-firmware/issue-408-health-monitor into main

This commit is contained in:
sl-jetson 2026-03-05 09:00:34 -05:00
commit d657696840
11 changed files with 655 additions and 0 deletions

View File

@ -0,0 +1,118 @@
# SaltyBot Health Monitor
Central system health monitor for SaltyBot. Tracks heartbeats from all critical nodes, detects failures, triggers auto-restart, and publishes system health status.
## Features
- **Heartbeat Monitoring**: Subscribes to heartbeat signals from all tracked nodes
- **Automatic Dead Node Detection**: Marks nodes as DOWN if silent >5 seconds
- **Auto-Restart Capability**: Queues `ros2 launch` restart commands for dead nodes by appending them to a restart command file (see "Restart Behavior" below)
- **System Health Publishing**: Publishes `/saltybot/system_health` JSON with full status
- **Face Alerts**: Triggers visual alerts on robot face display for critical failures
- **Configurable**: YAML-based node list and timeout parameters
## Topics
### Subscribed
- `/saltybot/<node_name>/heartbeat` (std_msgs/String): Heartbeat from each monitored node
### Published
- `/saltybot/system_health` (std_msgs/String): System health status as JSON
- `/saltybot/face/alert` (std_msgs/String): Critical alerts for face display
## Configuration
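Edit `config/health_config.yaml` to configure the monitor. Note that the node currently reads only `monitored_nodes` from this file; the timeout, check rate, and auto-restart switch are taken from ROS parameters (for example via launch arguments). The file documents: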
- **monitored_nodes**: List of all nodes to track
- **heartbeat_timeout_s**: Seconds before node is marked DOWN (default: 5s)
- **check_frequency_hz**: Health check rate (default: 1Hz)
- **enable_auto_restart**: Enable automatic restart attempts (default: true)
- **critical_nodes**: Nodes that trigger face alerts when down
## Launch
```bash
# Default launch with built-in config
ros2 launch saltybot_health_monitor health_monitor.launch.py
# Custom config
ros2 launch saltybot_health_monitor health_monitor.launch.py \
config_file:=/path/to/custom_config.yaml
# Disable auto-restart
ros2 launch saltybot_health_monitor health_monitor.launch.py \
enable_auto_restart:=false
```
## Health Status JSON
The `/saltybot/system_health` topic publishes:
```json
{
"timestamp": "2025-03-05T10:00:00.123456",
"uptime_s": 3600.5,
"nodes": {
"rover_driver": {
"status": "UP",
"time_since_heartbeat_s": 0.5,
"heartbeat_count": 1200,
"restart_count": 0,
"expected": true
},
"slam_node": {
"status": "DOWN",
"time_since_heartbeat_s": 6.0,
"heartbeat_count": 500,
"restart_count": 1,
"expected": true
}
},
"critical_down": ["slam_node"],
"system_healthy": false
}
```
## Node Integration
Each node should publish heartbeats periodically (e.g., every 1-2 seconds):
```python
# In your ROS2 node
heartbeat_pub = self.create_publisher(String, "/saltybot/node_name/heartbeat", 10)
heartbeat_pub.publish(String(data="node_name:alive"))
```
## Restart Behavior
When a node is detected as DOWN:
1. Health monitor logs a warning
2. If `enable_auto_restart: true`, queues a restart command
3. Node status changes to "RESTARTING"
4. Restart count is incremented
5. Face alert is published for critical nodes
The actual restart mechanism can be:
- Direct ROS2 launch subprocess
- Systemd service restart
- Custom restart script
- Manual restart via external monitor
## Debugging
Check health status:
```bash
ros2 topic echo /saltybot/system_health
```
Simulate a node heartbeat:
```bash
ros2 topic pub /saltybot/test_node/heartbeat std_msgs/String '{data: "test_node:alive"}'
```
View monitor logs:
```bash
ros2 launch saltybot_health_monitor health_monitor.launch.py | grep health
```

View File

@ -0,0 +1,76 @@
# Health Monitor Configuration
# Lists all critical nodes that should be monitored for heartbeats
monitored_nodes:
# Core drivers and hardware interfaces
- rover_driver
- camera_driver
- lidar_driver
- imu_driver
- uwb_driver
# SLAM and localization
- slam_node
- odom_fusion
- visual_odom
# Navigation
- nav2_bringup
- planner_server
- controller_server
# Perception
- person_detector
- object_tracker
# Control and decision making
- follower
- cmd_vel_bridge
- emergency_handler
# Communication
- rosbridge_websocket
- cellular_link
# Utilities
- bag_recorder
- remote_monitor
# Health check parameters
health_check:
# Node is considered DOWN if heartbeat hasn't been received in this many seconds
heartbeat_timeout_s: 5
# How often to check node health (Hz)
check_frequency_hz: 1
# Whether to attempt automatic restart of dead nodes
enable_auto_restart: true
# Alert cooldown to avoid spam (seconds)
alert_cooldown_s: 5
# Restart behavior
restart:
# Command file to write restart commands to
command_file: /tmp/saltybot_restart_queue.sh
# Maximum consecutive restarts before giving up
max_restart_attempts: 3
# Alert settings
alerting:
# Publish alerts to this topic
alert_topic: /saltybot/face/alert
# Nodes that are critical (system won't operate without them)
critical_nodes:
- rover_driver
- cmd_vel_bridge
- emergency_handler
# Nodes that are important but not critical
important_nodes:
- slam_node
- person_detector
- nav2_bringup

View File

@ -0,0 +1,57 @@
"""Launch health monitor node."""
import os
from ament_index_python.packages import get_package_share_directory
from launch import LaunchDescription
from launch.actions import DeclareLaunchArgument
from launch.substitutions import LaunchConfiguration
from launch_ros.actions import Node
def generate_launch_description():
    """Generate launch description for health monitor.

    Launch arguments:
        config_file: Path to the health monitor YAML file. Defaults to
            ``config/health_config.yaml`` in this package's share directory.
        heartbeat_timeout: Seconds of silence before a node is marked DOWN.
        check_frequency: Health check rate in Hz.
        enable_auto_restart: Whether dead nodes get a restart queued.

    Returns:
        LaunchDescription declaring the arguments above and starting the
        ``health_monitor_node`` executable with them as parameters.
    """
    # Imported here so this function is self-contained with respect to the
    # module's existing import block.
    from launch_ros.parameter_descriptions import ParameterValue

    package_dir = get_package_share_directory("saltybot_health_monitor")
    config_dir = os.path.join(package_dir, "config")

    # Launch arguments
    config_file_arg = DeclareLaunchArgument(
        "config_file",
        default_value=os.path.join(config_dir, "health_config.yaml"),
        description="Path to health monitor configuration YAML file",
    )
    heartbeat_timeout_arg = DeclareLaunchArgument(
        "heartbeat_timeout",
        default_value="5.0",
        description="Heartbeat timeout in seconds (node marked DOWN if silent longer)",
    )
    check_frequency_arg = DeclareLaunchArgument(
        "check_frequency",
        default_value="1.0",
        description="Health check rate in Hz",
    )
    enable_auto_restart_arg = DeclareLaunchArgument(
        "enable_auto_restart",
        default_value="true",
        description="Enable automatic restart of dead nodes",
    )

    # Health monitor node
    #
    # Launch argument values are text. Pinning each parameter's type keeps an
    # override such as `heartbeat_timeout:=5` from being passed to the node as
    # an integer when the node declares the parameter as a double.
    health_monitor_node = Node(
        package="saltybot_health_monitor",
        executable="health_monitor_node",
        name="health_monitor",
        output="screen",
        parameters=[
            {
                "config_file": ParameterValue(
                    LaunchConfiguration("config_file"), value_type=str
                ),
                "heartbeat_timeout": ParameterValue(
                    LaunchConfiguration("heartbeat_timeout"), value_type=float
                ),
                "enable_auto_restart": ParameterValue(
                    LaunchConfiguration("enable_auto_restart"), value_type=bool
                ),
                "check_frequency": ParameterValue(
                    LaunchConfiguration("check_frequency"), value_type=float
                ),
            }
        ],
    )

    return LaunchDescription([
        config_file_arg,
        heartbeat_timeout_arg,
        check_frequency_arg,
        enable_auto_restart_arg,
        health_monitor_node,
    ])

View File

@ -0,0 +1,29 @@
<?xml version="1.0"?>
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
<package format="3">
<name>saltybot_health_monitor</name>
<version>0.1.0</version>
<description>
ROS2 system health monitor for SaltyBot. Central node that monitors heartbeats
from all critical nodes, detects when nodes go down (>5s silent), triggers
auto-restart, publishes /saltybot/system_health JSON, and alerts face display
on critical failures.
</description>
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
<license>MIT</license>
<depend>rclpy</depend>
<depend>std_msgs</depend>
<depend>geometry_msgs</depend>
<buildtool_depend>ament_python</buildtool_depend>
<test_depend>ament_copyright</test_depend>
<test_depend>ament_flake8</test_depend>
<test_depend>ament_pep257</test_depend>
<test_depend>python3-pytest</test_depend>
<export>
<build_type>ament_python</build_type>
</export>
</package>

View File

@ -0,0 +1 @@
# Health monitor package

View File

@ -0,0 +1,265 @@
#!/usr/bin/env python3
"""System health monitor for SaltyBot.
Central node that monitors heartbeats from all critical nodes. Tracks expected
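nodes from YAML config, marks nodes DOWN if silent longer than the heartbeat
timeout (default 5s), queues `ros2 launch` restart commands to a command file,
publishes /saltybot/system_health JSON, and triggers face alerts.
Published topics:
/saltybot/system_health (std_msgs/String) - JSON system health status
/saltybot/face/alert (std_msgs/String) - JSON alert while expected nodes are down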
Subscribed topics:
/saltybot/<node_name>/heartbeat (std_msgs/String) - Node heartbeat signals
"""
import json
import shlex
import time
from pathlib import Path
from typing import Dict, Optional
from dataclasses import dataclass, asdict
from datetime import datetime

import yaml
import rclpy
from rclpy.node import Node
from rclpy.timer import Timer
from std_msgs.msg import String
@dataclass
class NodeHealth:
    """Health status of a single node.

    The monitor keeps one instance per tracked node, keyed by ``name``, and
    serialises these fields into the ``/saltybot/system_health`` JSON.
    """

    name: str  # Node name as it appears in the config / heartbeat payload
    # "UNKNOWN" (loaded from config, initial value), "UP", "DOWN", "RESTARTING"
    status: str
    last_heartbeat: float  # time.time() timestamp of last received heartbeat
    heartbeat_count: int = 0  # Incremented for every heartbeat received
    restart_count: int = 0  # Incremented each time a restart is triggered
    expected: bool = True  # False for nodes first seen via a heartbeat, not the config
class HealthMonitorNode(Node):
    """ROS2 node for system health monitoring.

    Subscribes to ``/saltybot/<node_name>/heartbeat`` for every node listed
    under ``monitored_nodes`` in the YAML config, marks a node DOWN once it
    has been silent longer than ``heartbeat_timeout``, optionally queues
    restart commands, and publishes a JSON summary on
    ``/saltybot/system_health`` plus alerts on ``/saltybot/face/alert``.

    Node status values: "UNKNOWN" (configured, no heartbeat seen yet), "UP",
    "DOWN" and "RESTARTING" (a restart command has been queued).

    Every node loaded from the config counts as "expected", and any expected
    node that is down is reported in ``critical_down``. Only
    ``monitored_nodes`` is read from the YAML file; all other settings are
    ROS parameters.

    ROS parameters:
        config_file: YAML file listing ``monitored_nodes``.
        heartbeat_timeout: Seconds of silence before a node is DOWN.
        check_frequency: Health check rate in Hz.
        enable_auto_restart: Queue restart commands for DOWN nodes.
        restart_command_file: File that restart commands are appended to.
        max_restart_attempts: Consecutive restart attempts per node before
            giving up; 0 or a negative value means no limit.
    """

    def __init__(self):
        super().__init__("health_monitor")

        # Load configuration
        self.declare_parameter("config_file", "health_config.yaml")
        config_path = self.get_parameter("config_file").value

        self.node_health: Dict[str, NodeHealth] = {}
        self.startup_time = time.time()
        self.last_critical_alert = 0.0
        self.alert_cooldown = 5.0  # Seconds between critical alerts

        # Restart bookkeeping, keyed by node name. Both entries are cleared
        # when the node sends a heartbeat again.
        self._consecutive_restarts: Dict[str, int] = {}
        self._last_restart_time: Dict[str, float] = {}

        # Load node configuration
        self._load_config(config_path)

        # Parameters
        self.declare_parameter("heartbeat_timeout", 5.0)  # Seconds
        self.declare_parameter("check_frequency", 1.0)  # Hz
        self.declare_parameter("enable_auto_restart", True)
        self.declare_parameter("restart_command_file", "/tmp/restart_node.sh")
        self.declare_parameter("max_restart_attempts", 3)

        self.heartbeat_timeout = self.get_parameter("heartbeat_timeout").value
        check_frequency = self.get_parameter("check_frequency").value
        self.enable_auto_restart = self.get_parameter("enable_auto_restart").value
        self.restart_cmd_file = self.get_parameter("restart_command_file").value
        self.max_restart_attempts = self.get_parameter("max_restart_attempts").value

        # Subscribe to heartbeats from all expected nodes
        self._setup_subscriptions()

        # Publisher for system health
        self.pub_health = self.create_publisher(String, "/saltybot/system_health", 1)
        self.pub_face_alert = self.create_publisher(String, "/saltybot/face/alert", 1)

        # Health check timer. A zero or negative rate cannot be turned into a
        # timer period, so fall back to the default instead of crashing.
        if not check_frequency or check_frequency <= 0:
            self.get_logger().warn(
                f"Invalid check_frequency {check_frequency!r}; using 1.0 Hz"
            )
            check_frequency = 1.0
        period = 1.0 / check_frequency
        self.timer: Timer = self.create_timer(period, self._check_health)

        self.get_logger().info(
            f"Health monitor initialized with {len(self.node_health)} tracked nodes. "
            f"Timeout: {self.heartbeat_timeout}s, Auto-restart: {self.enable_auto_restart}"
        )

    def _load_config(self, config_file: str) -> None:
        """Load the list of monitored nodes from a YAML file.

        Only the top-level ``monitored_nodes`` list is read. Each entry gets a
        ``NodeHealth`` record with status "UNKNOWN" and ``last_heartbeat`` set
        to the load time, so the timeout is measured from startup.

        A missing, unreadable or malformed file is logged and leaves
        ``self.node_health`` empty rather than aborting node startup.

        Args:
            config_file: Path to the YAML file. If it does not exist as given,
                it is looked up in a ``config`` directory next to this
                module's parent directory.
        """
        path = Path(config_file)
        if not path.exists():
            # NOTE(review): this fallback is relative to the module location,
            # which may not be the installed share directory - confirm.
            path = Path(__file__).parent.parent / "config" / config_file

        try:
            with open(path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
        except FileNotFoundError:
            self.get_logger().warn(
                f"Config file not found: {path}. "
                "No nodes loaded, so no heartbeat subscriptions will be created."
            )
            return
        except (OSError, yaml.YAMLError) as e:
            self.get_logger().error(
                f"Could not read config file {path}: {e}. "
                "No nodes loaded, so no heartbeat subscriptions will be created."
            )
            return

        monitored_nodes = config.get("monitored_nodes") if isinstance(config, dict) else None
        if monitored_nodes is None:
            monitored_nodes = []
        if not isinstance(monitored_nodes, list):
            self.get_logger().error(
                f"'monitored_nodes' in {path} must be a list; no nodes loaded."
            )
            return

        loaded_at = time.time()
        for entry in monitored_nodes:
            # Heartbeat payloads are parsed as strings, so the keys must be
            # strings too (YAML would turn a bare `123` into an int).
            node_name = str(entry).strip()
            if not node_name:
                continue
            self.node_health[node_name] = NodeHealth(
                name=node_name, status="UNKNOWN", last_heartbeat=loaded_at
            )

        self.get_logger().info(f"Loaded {len(self.node_health)} nodes from config")

    def _setup_subscriptions(self) -> None:
        """Create one heartbeat subscription per node loaded from the config."""
        for node_name in self.node_health:
            topic = f"/saltybot/{node_name}/heartbeat"
            self.create_subscription(String, topic, self._on_heartbeat, 10)

    def _on_heartbeat(self, msg: String) -> None:
        """Handle incoming heartbeat from a node.

        The payload format is ``"node_name:data"``; only the part before the
        first colon is used. A name that is not tracked yet is added with
        ``expected=False``.

        Args:
            msg: Heartbeat message received on any subscribed heartbeat topic.
        """
        # Subscription callback: log and drop a bad message rather than let
        # an exception propagate out of the callback.
        try:
            node_name = msg.data.split(":", 1)[0].strip()
            if not node_name:
                self.get_logger().warn(
                    f"Ignoring heartbeat with empty node name: {msg.data!r}"
                )
                return

            now = time.time()

            # Create node entry if not yet tracked
            node = self.node_health.get(node_name)
            if node is None:
                node = NodeHealth(
                    name=node_name, status="UP", last_heartbeat=now, expected=False
                )
                self.node_health[node_name] = node

            # Update heartbeat
            node.last_heartbeat = now
            node.heartbeat_count += 1

            # The node is alive again, so any restart sequence is over.
            self._consecutive_restarts.pop(node_name, None)
            self._last_restart_time.pop(node_name, None)

            if node.status != "UP":
                node.status = "UP"
                self.get_logger().info(f"Node {node_name} is UP")

        except Exception as e:
            self.get_logger().error(f"Error processing heartbeat: {e}")

    def _check_health(self) -> None:
        """Periodically check health of all nodes and publish status.

        A node whose last heartbeat is older than ``heartbeat_timeout`` is
        marked DOWN (logged once per outage), added to ``critical_down`` if it
        is an expected node, and handed to the restart logic when auto-restart
        is enabled. Nodes within the timeout keep their current status: only a
        received heartbeat marks a node UP, so a configured node that has
        never been heard from stays "UNKNOWN" until it speaks or times out.
        """
        now = time.time()
        critical_down = []

        for node_name, node in self.node_health.items():
            time_since_heartbeat = now - node.last_heartbeat
            if time_since_heartbeat <= self.heartbeat_timeout:
                continue

            # Log the transition only; "RESTARTING" is still a down node.
            if node.status not in ("DOWN", "RESTARTING"):
                self.get_logger().warn(
                    f"Node {node_name} DOWN (silent for {time_since_heartbeat:.1f}s)"
                )
                node.status = "DOWN"

            # Track critical (expected) nodes
            if node.expected:
                critical_down.append(node_name)

            # Attempt auto-restart
            if self.enable_auto_restart:
                self._maybe_restart(node_name, now)

        # Publish system health
        self._publish_health(critical_down)

        # Alert face if critical nodes are down
        if critical_down:
            self._alert_critical(critical_down, now)

    def _maybe_restart(self, node_name: str, now: float) -> None:
        """Queue a restart for a down node unless one is pending or the cap is hit.

        Attempts for the same node are spaced at least ``heartbeat_timeout``
        seconds apart so a queued restart has time to produce a heartbeat.
        After ``max_restart_attempts`` consecutive attempts without a
        heartbeat the node is left in status "DOWN".

        Args:
            node_name: Name of the stale node.
            now: Current ``time.time()`` value of this health check.
        """
        last_attempt = self._last_restart_time.get(node_name)
        if last_attempt is not None and now - last_attempt < self.heartbeat_timeout:
            return

        attempts = self._consecutive_restarts.get(node_name, 0)
        if 0 < self.max_restart_attempts <= attempts:
            node = self.node_health[node_name]
            if node.status != "DOWN":
                node.status = "DOWN"
                self.get_logger().error(
                    f"Giving up on {node_name} after {attempts} restart attempts"
                )
            return

        self._trigger_restart(node_name)

    def _trigger_restart(self, node_name: str) -> None:
        """Queue a restart command for a dead node.

        Increments the node's restart counters, sets its status to
        "RESTARTING" and appends a ``ros2 launch saltybot_bringup
        <node_name>.launch.py`` snippet to ``self.restart_cmd_file``. Nothing
        in this class executes that file.

        Args:
            node_name: Name of a node present in ``self.node_health``.

        Raises:
            KeyError: If ``node_name`` is not tracked.
        """
        node = self.node_health[node_name]
        node.restart_count += 1
        self._consecutive_restarts[node_name] = (
            self._consecutive_restarts.get(node_name, 0) + 1
        )
        self._last_restart_time[node_name] = time.time()

        self.get_logger().warn(
            f"Attempting auto-restart for {node_name} (attempt #{node.restart_count})"
        )

        # Update status
        node.status = "RESTARTING"

        # Node names can come from heartbeat payloads, so quote the launch
        # file name before it is written into a shell script. Names made of
        # letters, digits and underscores are left unchanged by quoting.
        launch_file = shlex.quote(f"{node_name}.launch.py")
        restart_script = (
            f"#!/bin/bash\n"
            f"# Auto-restart triggered at {datetime.now().isoformat()}\n"
            f"ros2 launch saltybot_bringup {launch_file} &\n"
        )

        try:
            with open(self.restart_cmd_file, "a", encoding="utf-8") as f:
                f.write(restart_script)
        except OSError as e:
            self.get_logger().error(f"Failed to queue restart for {node_name}: {e}")
        else:
            self.get_logger().info(f"Restart command queued for {node_name}")

    def _publish_health(self, critical_down: list) -> None:
        """Publish system health status as JSON.

        Args:
            critical_down: Names of expected nodes that are currently down;
                ``system_healthy`` is true only when this list is empty.
        """
        # One timestamp so uptime and every per-node age share a reference.
        now = time.time()
        health_data = {
            "timestamp": datetime.now().isoformat(),
            "uptime_s": now - self.startup_time,
            "nodes": {
                node.name: {
                    "status": node.status,
                    "time_since_heartbeat_s": now - node.last_heartbeat,
                    "heartbeat_count": node.heartbeat_count,
                    "restart_count": node.restart_count,
                    "expected": node.expected,
                }
                for node in self.node_health.values()
            },
            "critical_down": critical_down,
            "system_healthy": len(critical_down) == 0,
        }

        msg = String(data=json.dumps(health_data))
        self.pub_health.publish(msg)

    def _alert_critical(self, critical_nodes: list, now: float) -> None:
        """Alert face display of critical node failures.

        Args:
            critical_nodes: Names of the expected nodes that are down.
            now: Current ``time.time()`` value, used for rate limiting.
        """
        # Rate-limit alerts to avoid spam
        if now - self.last_critical_alert < self.alert_cooldown:
            return

        self.last_critical_alert = now

        alert_msg = {
            "type": "system_alert",
            "severity": "critical",
            "message": f"System critical: {', '.join(critical_nodes)} down",
            "nodes": critical_nodes,
            "timestamp": datetime.now().isoformat(),
        }

        msg = String(data=json.dumps(alert_msg))
        self.pub_face_alert.publish(msg)

        self.get_logger().warn(
            f"CRITICAL ALERT: {len(critical_nodes)} expected node(s) down: {critical_nodes}"
        )
def main(args=None):
    """Run the health monitor node until it is interrupted.

    Args:
        args: Command-line arguments forwarded to ``rclpy.init``.
    """
    rclpy.init(args=args)
    node = None
    try:
        node = HealthMonitorNode()
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        # The node may not exist if construction failed.
        if node is not None:
            node.destroy_node()
        # Only shut down a context that is still valid, so an earlier
        # shutdown does not turn a clean exit into an error.
        if rclpy.ok():
            rclpy.shutdown()


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,2 @@
[develop]
script-dir=$base/lib/saltybot_health_monitor

View File

@ -0,0 +1,30 @@
from setuptools import setup

package_name = "saltybot_health_monitor"

# Files installed outside the Python package: the ament resource marker, the
# package manifest, the launch file and the default configuration.
share_dir = f"share/{package_name}"
installed_files = [
    ("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
    (share_dir, ["package.xml"]),
    (f"{share_dir}/launch", ["launch/health_monitor.launch.py"]),
    (f"{share_dir}/config", ["config/health_config.yaml"]),
]

# Executables exposed on the command line.
console_scripts = [
    "health_monitor_node = saltybot_health_monitor.health_monitor_node:main",
]

summary = (
    "System health monitor: tracks node heartbeats, detects down nodes, "
    "triggers auto-restart, publishes system health status"
)

setup(
    name=package_name,
    version="0.1.0",
    description=summary,
    license="MIT",
    maintainer="sl-controls",
    maintainer_email="sl-controls@saltylab.local",
    packages=[package_name],
    data_files=installed_files,
    install_requires=["setuptools", "pyyaml"],
    tests_require=["pytest"],
    zip_safe=True,
    entry_points={"console_scripts": console_scripts},
)

View File

@ -0,0 +1 @@
# Test module

View File

@ -0,0 +1,76 @@
"""Unit tests for health monitor."""
import unittest
import time
from std_msgs.msg import String
class TestHealthMonitor(unittest.TestCase):
    """Checks of the heartbeat format, timeout rule and health JSON layout.

    These tests work on plain values only; they do not instantiate the
    monitor node.
    """

    def test_heartbeat_parsing(self):
        """The node name is the text before the first colon of a heartbeat."""
        # Heartbeat payload ("node_name:data") -> node name expected from it
        expected_name_for = {
            "rover_driver:alive": "rover_driver",
            "slam_node:map_ready": "slam_node",
            "nav2_bringup:planning": "nav2_bringup",
        }

        for payload, expected_name in expected_name_for.items():
            name_field, *_ = payload.split(":", 1)
            self.assertEqual(name_field.strip(), expected_name)

    def test_timeout_detection(self):
        """A 1s-old heartbeat is within a 5s timeout; a 10s-old one is not."""
        timeout_s = 5.0
        now = time.time()

        # Fresh heartbeat
        fresh_stamp = now - 1.0
        self.assertLess(now - fresh_stamp, timeout_s)

        # Stale heartbeat
        stale_stamp = now - 10.0
        self.assertGreater(now - stale_stamp, timeout_s)

    def test_health_status_generation(self):
        """A health status document survives a JSON round trip unchanged."""
        import json

        def node_entry(status, silent_s, beats, restarts):
            # Per-node record in the shape used by the health JSON.
            return {
                "status": status,
                "time_since_heartbeat_s": silent_s,
                "heartbeat_count": beats,
                "restart_count": restarts,
                "expected": True,
            }

        document = {
            "timestamp": "2025-03-05T10:00:00",
            "uptime_s": 3600,
            "nodes": {
                "rover_driver": node_entry("UP", 0.5, 100, 0),
                "slam_node": node_entry("DOWN", 6.0, 50, 1),
            },
            "critical_down": ["slam_node"],
            "system_healthy": False,
        }

        # Should be serializable to JSON
        round_tripped = json.loads(json.dumps(document))

        self.assertEqual(round_tripped["system_healthy"], False)
        self.assertIn("slam_node", round_tripped["critical_down"])
        self.assertEqual(round_tripped["nodes"]["rover_driver"]["status"], "UP")


if __name__ == "__main__":
    unittest.main()