Merge pull request 'feat: system health monitor (Issue #408)' (#439) from sl-firmware/issue-408-health-monitor into main

2026-03-05 09:00:34 -05:00 · 2026-03-05 09:00:34 -05:00 · d657696840
commit d657696840
parent 27bf0efd94 9683fd3685
11 changed files with 655 additions and 0 deletions
--- a/jetson/ros2_ws/src/saltybot_health_monitor/README.md
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/README.md
@ -0,0 +1,118 @@
+# SaltyBot Health Monitor
+
+Central system health monitor for SaltyBot. Tracks heartbeats from all critical nodes, detects failures, triggers auto-restart, and publishes system health status.
+
+## Features
+
+- **Heartbeat Monitoring**: Subscribes to heartbeat signals from all tracked nodes
+- **Automatic Dead Node Detection**: Marks nodes as DOWN if silent >5 seconds
+- **Auto-Restart Capability**: Queues a `ros2 launch` restart command for each dead node (see [Restart Behavior](#restart-behavior))
+- **System Health Publishing**: Publishes `/saltybot/system_health` JSON with full status
+- **Face Alerts**: Triggers visual alerts on robot face display for critical failures
+- **Configurable**: YAML-based node list and timeout parameters
+
+## Topics
+
+### Subscribed
+- `/saltybot/<node_name>/heartbeat` (std_msgs/String): Heartbeat from each monitored node
+
+### Published
+- `/saltybot/system_health` (std_msgs/String): System health status as JSON
+- `/saltybot/face/alert` (std_msgs/String): Critical alerts for face display
+
+## Configuration
+
+Edit `config/health_config.yaml` to configure:
+
+- **monitored_nodes**: List of all nodes to track
+- **heartbeat_timeout_s**: Seconds before node is marked DOWN (default: 5s)
+- **check_frequency_hz**: Health check rate (default: 1Hz)
+- **enable_auto_restart**: Enable automatic restart attempts (default: true)
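+- **critical_nodes**: Nodes that trigger face alerts when down
+
+> **Note:** `health_monitor_node` currently reads only `monitored_nodes` from this file, and every node listed there is reported in `critical_down` and triggers face alerts when it goes silent. The timeout, check rate, and auto-restart switch are taken from the `heartbeat_timeout`, `check_frequency`, and `enable_auto_restart` ROS parameters instead (see [Launch](#launch)).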
+
+## Launch
+
+```bash
+# Default launch with built-in config
+ros2 launch saltybot_health_monitor health_monitor.launch.py
+
+# Custom config
+ros2 launch saltybot_health_monitor health_monitor.launch.py \
+  config_file:=/path/to/custom_config.yaml
+
+# Disable auto-restart
+ros2 launch saltybot_health_monitor health_monitor.launch.py \
+  enable_auto_restart:=false
+```
+
+## Health Status JSON
+
+The `/saltybot/system_health` topic publishes:
+
+```json
+{
+  "timestamp": "2025-03-05T10:00:00.123456",
+  "uptime_s": 3600.5,
+  "nodes": {
+    "rover_driver": {
+      "status": "UP",
+      "time_since_heartbeat_s": 0.5,
+      "heartbeat_count": 1200,
+      "restart_count": 0,
+      "expected": true
+    },
+    "slam_node": {
+      "status": "DOWN",
+      "time_since_heartbeat_s": 6.0,
+      "heartbeat_count": 500,
+      "restart_count": 1,
+      "expected": true
+    }
+  },
+  "critical_down": ["slam_node"],
+  "system_healthy": false
+}
+```
+
+## Node Integration
+
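+Each node should publish heartbeats periodically (e.g., every 1-2 seconds). The monitor identifies the sender from the message payload, so the text before the first `:` must match the node's name in `monitored_nodes`: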
+
+```python
+# In your ROS2 node
+heartbeat_pub = self.create_publisher(String, "/saltybot/node_name/heartbeat", 10)
+heartbeat_pub.publish(String(data="node_name:alive"))
+```
+
+## Restart Behavior
+
+When a node is detected as DOWN:
+
+1. Health monitor logs a warning
+2. If `enable_auto_restart: true`, queues a restart command
+3. Node status changes to "RESTARTING"
+4. Restart count is incremented
+5. Face alert is published for critical nodes
+
+The actual restart mechanism can be:
+- Direct ROS2 launch subprocess
+- Systemd service restart
+- Custom restart script
+- Manual restart via external monitor
+
+## Debugging
+
+Check health status:
+```bash
+ros2 topic echo /saltybot/system_health
+```
+
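+Simulate a heartbeat for a monitored node (the monitor only subscribes to the heartbeat topics of nodes listed in `monitored_nodes`):
+```bash
+ros2 topic pub /saltybot/rover_driver/heartbeat std_msgs/String '{data: "rover_driver:alive"}'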
+```
+
+View monitor logs:
+```bash
+ros2 launch saltybot_health_monitor health_monitor.launch.py | grep health
+```
--- a/jetson/ros2_ws/src/saltybot_health_monitor/config/health_config.yaml
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/config/health_config.yaml
@ -0,0 +1,76 @@
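+# Health Monitor Configuration
+# Lists all critical nodes that should be monitored for heartbeats.
+# NOTE: health_monitor_node currently reads only `monitored_nodes` from this
+# file; the health_check, restart, and alerting sections below are not read by it.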
+
+monitored_nodes:
+  # Core drivers and hardware interfaces
+  - rover_driver
+  - camera_driver
+  - lidar_driver
+  - imu_driver
+  - uwb_driver
+
+  # SLAM and localization
+  - slam_node
+  - odom_fusion
+  - visual_odom
+
+  # Navigation
+  - nav2_bringup
+  - planner_server
+  - controller_server
+
+  # Perception
+  - person_detector
+  - object_tracker
+
+  # Control and decision making
+  - follower
+  - cmd_vel_bridge
+  - emergency_handler
+
+  # Communication
+  - rosbridge_websocket
+  - cellular_link
+
+  # Utilities
+  - bag_recorder
+  - remote_monitor
+
+# Health check parameters
+health_check:
+  # Node is considered DOWN if heartbeat hasn't been received in this many seconds
+  heartbeat_timeout_s: 5
+
+  # How often to check node health (Hz)
+  check_frequency_hz: 1
+
+  # Whether to attempt automatic restart of dead nodes
+  enable_auto_restart: true
+
+  # Alert cooldown to avoid spam (seconds)
+  alert_cooldown_s: 5
+
+# Restart behavior
+restart:
+  # Command file to write restart commands to
+  command_file: /tmp/saltybot_restart_queue.sh
+
+  # Maximum consecutive restarts before giving up
+  max_restart_attempts: 3
+
+# Alert settings
+alerting:
+  # Publish alerts to this topic
+  alert_topic: /saltybot/face/alert
+
+  # Nodes that are critical (system won't operate without them)
+  critical_nodes:
+    - rover_driver
+    - cmd_vel_bridge
+    - emergency_handler
+
+  # Nodes that are important but not critical
+  important_nodes:
+    - slam_node
+    - person_detector
+    - nav2_bringup
--- a/jetson/ros2_ws/src/saltybot_health_monitor/launch/health_monitor.launch.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/launch/health_monitor.launch.py
@ -0,0 +1,57 @@
+"""Launch health monitor node."""
+
+import os
+from ament_index_python.packages import get_package_share_directory
+from launch import LaunchDescription
+from launch.actions import DeclareLaunchArgument
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import Node
+
+
+def generate_launch_description():
+    """Generate launch description for health monitor."""
+
+    package_dir = get_package_share_directory("saltybot_health_monitor")
+    config_dir = os.path.join(package_dir, "config")
+
+    # Launch arguments
+    config_file_arg = DeclareLaunchArgument(
+        "config_file",
+        default_value=os.path.join(config_dir, "health_config.yaml"),
+        description="Path to health monitor configuration YAML file",
+    )
+
+    heartbeat_timeout_arg = DeclareLaunchArgument(
+        "heartbeat_timeout",
+        default_value="5.0",
+        description="Heartbeat timeout in seconds (node marked DOWN if silent longer)",
+    )
+
+    enable_auto_restart_arg = DeclareLaunchArgument(
+        "enable_auto_restart",
+        default_value="true",
+        description="Enable automatic restart of dead nodes",
+    )
+
+    # Health monitor node
+    health_monitor_node = Node(
+        package="saltybot_health_monitor",
+        executable="health_monitor_node",
+        name="health_monitor",
+        output="screen",
+        parameters=[
+            {
+                "config_file": LaunchConfiguration("config_file"),
+                "heartbeat_timeout": LaunchConfiguration("heartbeat_timeout"),
+                "enable_auto_restart": LaunchConfiguration("enable_auto_restart"),
+                "check_frequency": 1.0,  # Hz
+            }
+        ],
+    )
+
+    return LaunchDescription([
+        config_file_arg,
+        heartbeat_timeout_arg,
+        enable_auto_restart_arg,
+        health_monitor_node,
+    ])
--- a/jetson/ros2_ws/src/saltybot_health_monitor/package.xml
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/package.xml
@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>saltybot_health_monitor</name>
+  <version>0.1.0</version>
+  <description>
+    ROS2 system health monitor for SaltyBot. Central node that monitors heartbeats
+    from all critical nodes, detects when nodes go down (>5s silent), triggers
+    auto-restart, publishes /saltybot/system_health JSON, and alerts face display
+    on critical failures.
+  </description>
+  <maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
+  <license>MIT</license>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>geometry_msgs</depend>
+
+  <!-- Imported at runtime by health_monitor_node.py (yaml) and by
+       health_monitor.launch.py (ament_index_python, launch, launch_ros) -->
+  <exec_depend>python3-yaml</exec_depend>
+  <exec_depend>ament_index_python</exec_depend>
+  <exec_depend>launch</exec_depend>
+  <exec_depend>launch_ros</exec_depend>
+
+  <buildtool_depend>ament_python</buildtool_depend>
+
+  <test_depend>ament_copyright</test_depend>
+  <test_depend>ament_flake8</test_depend>
+  <test_depend>ament_pep257</test_depend>
+  <test_depend>python3-pytest</test_depend>
+
+  <export>
+    <build_type>ament_python</build_type>
+  </export>
+</package>
--- a/jetson/ros2_ws/src/saltybot_health_monitor/resource/saltybot_health_monitor
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/resource/saltybot_health_monitor
--- a/jetson/ros2_ws/src/saltybot_health_monitor/saltybot_health_monitor/__init__.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/saltybot_health_monitor/__init__.py
@ -0,0 +1 @@
+# Health monitor package
--- a/jetson/ros2_ws/src/saltybot_health_monitor/saltybot_health_monitor/health_monitor_node.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/saltybot_health_monitor/health_monitor_node.py
@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""System health monitor for SaltyBot.
+
+Central node that monitors heartbeats from all critical nodes. Tracks expected
+nodes from YAML config, marks nodes DOWN if silent longer than the heartbeat
+timeout (default 5s), queues `ros2 launch` restart commands for them, publishes
+/saltybot/system_health JSON, and triggers face alerts.
+
+Published topics:
+  /saltybot/system_health (std_msgs/String) - JSON system health status
+  /saltybot/face/alert (std_msgs/String) - JSON alert for the face display
+
+Subscribed topics:
+  /saltybot/<node_name>/heartbeat (std_msgs/String) - Node heartbeat signals
+"""
+
+import json
+import time
+from pathlib import Path
+from typing import Dict, Optional
+from dataclasses import dataclass, asdict
+from datetime import datetime
+
+import yaml
+import rclpy
+from rclpy.node import Node
+from rclpy.timer import Timer
+from std_msgs.msg import String
+
+
+@dataclass
+class NodeHealth:
+    """Health status of a single node."""
+
+    name: str
+    status: str  # "UP", "DOWN", "RESTARTING"
+    last_heartbeat: float  # Timestamp of last received heartbeat
+    heartbeat_count: int = 0
+    restart_count: int = 0
+    expected: bool = True
+
+
+class HealthMonitorNode(Node):
+    """ROS2 node for system health monitoring."""
+
+    def __init__(self):
+        super().__init__("health_monitor")
+
+        # Load configuration
+        self.declare_parameter("config_file", "health_config.yaml")
+        config_path = self.get_parameter("config_file").value
+
+        self.node_health: Dict[str, NodeHealth] = {}
+        self.startup_time = time.time()
+        self.last_critical_alert = 0.0
+        self.alert_cooldown = 5.0  # Seconds between critical alerts
+
+        # Load node configuration
+        self._load_config(config_path)
+
+        # Parameters
+        self.declare_parameter("heartbeat_timeout", 5.0)  # Seconds
+        self.declare_parameter("check_frequency", 1.0)  # Hz
+        self.declare_parameter("enable_auto_restart", True)
+        self.declare_parameter("restart_command_file", "/tmp/restart_node.sh")
+
+        self.heartbeat_timeout = self.get_parameter("heartbeat_timeout").value
+        check_frequency = self.get_parameter("check_frequency").value
+        self.enable_auto_restart = self.get_parameter("enable_auto_restart").value
+        self.restart_cmd_file = self.get_parameter("restart_command_file").value
+
+        # Subscribe to heartbeats from all expected nodes
+        self._setup_subscriptions()
+
+        # Publisher for system health
+        self.pub_health = self.create_publisher(String, "/saltybot/system_health", 1)
+        self.pub_face_alert = self.create_publisher(String, "/saltybot/face/alert", 1)
+
+        # Health check timer
+        period = 1.0 / check_frequency
+        self.timer: Timer = self.create_timer(period, self._check_health)
+
+        self.get_logger().info(
+            f"Health monitor initialized with {len(self.node_health)} tracked nodes. "
+            f"Timeout: {self.heartbeat_timeout}s, Auto-restart: {self.enable_auto_restart}"
+        )
+
+    def _load_config(self, config_file: str) -> None:
+        """Load node configuration from YAML file."""
+        try:
+            # Try to find config in share directory
+            if not Path(config_file).exists():
+                # Look in package share directory
+                share_dir = Path(__file__).parent.parent / "config"
+                config_file = str(share_dir / config_file)
+
+            with open(config_file, "r") as f:
+                config = yaml.safe_load(f) or {}
+
+            monitored_nodes = config.get("monitored_nodes", [])
+            for node_name in monitored_nodes:
+                self.node_health[node_name] = NodeHealth(
+                    name=node_name, status="UNKNOWN", last_heartbeat=time.time()
+                )
+
+            self.get_logger().info(f"Loaded {len(monitored_nodes)} nodes from config")
+        except FileNotFoundError:
+            self.get_logger().warn(
+                f"Config file not found: {config_file}. "
+                "Will monitor nodes as they send heartbeats."
+            )
+
+    def _setup_subscriptions(self) -> None:
+        """Create subscriptions for all expected nodes."""
+        for node_name in self.node_health.keys():
+            topic = f"/saltybot/{node_name}/heartbeat"
+            self.create_subscription(String, topic, self._on_heartbeat, 10)
+
+    def _on_heartbeat(self, msg: String) -> None:
+        """Handle incoming heartbeat from a node."""
+        # Parse heartbeat message (expected format: "node_name:data")
+        try:
+            parts = msg.data.split(":", 1)
+            node_name = parts[0].strip()
+            data = parts[1].strip() if len(parts) > 1 else ""
+
+            # Create node entry if not yet tracked
+            if node_name not in self.node_health:
+                self.node_health[node_name] = NodeHealth(
+                    name=node_name, status="UP", last_heartbeat=time.time(), expected=False
+                )
+
+            # Update heartbeat
+            node = self.node_health[node_name]
+            node.last_heartbeat = time.time()
+            node.heartbeat_count += 1
+            if node.status != "UP":
+                node.status = "UP"
+                self.get_logger().info(f"Node {node_name} is UP")
+
+        except Exception as e:
+            self.get_logger().error(f"Error processing heartbeat: {e}")
+
+    def _check_health(self) -> None:
+        """Periodically check health of all nodes and publish status."""
+        now = time.time()
+        critical_down = []
+
+        for node_name, node in self.node_health.items():
+            # Check if heartbeat is stale
+            time_since_heartbeat = now - node.last_heartbeat
+
+            if time_since_heartbeat > self.heartbeat_timeout:
+                if node.status != "DOWN":
+                    self.get_logger().warn(
+                        f"Node {node_name} DOWN (silent for {time_since_heartbeat:.1f}s)"
+                    )
+                    node.status = "DOWN"
+
+                # Track critical (expected) nodes
+                if node.expected:
+                    critical_down.append(node_name)
+
+                # Attempt auto-restart
+                if self.enable_auto_restart and node.status == "DOWN":
+                    self._trigger_restart(node_name)
+            else:
+                # Node is healthy
+                if node.status != "UP":
+                    node.status = "UP"
+
+        # Publish system health
+        self._publish_health(critical_down)
+
+        # Alert face if critical nodes are down
+        if critical_down:
+            self._alert_critical(critical_down, now)
+
+    def _trigger_restart(self, node_name: str) -> None:
+        """Trigger restart of a dead node via launch system."""
+        node = self.node_health[node_name]
+        node.restart_count += 1
+
+        self.get_logger().warn(
+            f"Attempting auto-restart for {node_name} (attempt #{node.restart_count})"
+        )
+
+        # Update status
+        node.status = "RESTARTING"
+
+        # In a real implementation, this would trigger ros2 launch or systemd service restart
+        # For now, log the attempt
+        try:
+            # Example: restart via launch system
+            # This would need to be configured based on actual launch setup
+            restart_script = (
+                f"#!/bin/bash\n"
+                f"# Auto-restart triggered at {datetime.now().isoformat()}\n"
+                f"ros2 launch saltybot_bringup {node_name}.launch.py &\n"
+            )
+            with open(self.restart_cmd_file, "a") as f:
+                f.write(restart_script)
+
+            self.get_logger().info(f"Restart command queued for {node_name}")
+        except Exception as e:
+            self.get_logger().error(f"Failed to queue restart for {node_name}: {e}")
+
+    def _publish_health(self, critical_down: list) -> None:
+        """Publish system health status as JSON."""
+        health_data = {
+            "timestamp": datetime.now().isoformat(),
+            "uptime_s": time.time() - self.startup_time,
+            "nodes": {
+                node.name: {
+                    "status": node.status,
+                    "time_since_heartbeat_s": time.time() - node.last_heartbeat,
+                    "heartbeat_count": node.heartbeat_count,
+                    "restart_count": node.restart_count,
+                    "expected": node.expected,
+                }
+                for node in self.node_health.values()
+            },
+            "critical_down": critical_down,
+            "system_healthy": len(critical_down) == 0,
+        }
+
+        msg = String(data=json.dumps(health_data))
+        self.pub_health.publish(msg)
+
+    def _alert_critical(self, critical_nodes: list, now: float) -> None:
+        """Alert face display of critical node failures."""
+        # Rate-limit alerts to avoid spam
+        if now - self.last_critical_alert < self.alert_cooldown:
+            return
+
+        self.last_critical_alert = now
+
+        alert_msg = {
+            "type": "system_alert",
+            "severity": "critical",
+            "message": f"System critical: {', '.join(critical_nodes)} down",
+            "nodes": critical_nodes,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+        msg = String(data=json.dumps(alert_msg))
+        self.pub_face_alert.publish(msg)
+
+        self.get_logger().warn(
+            f"CRITICAL ALERT: {len(critical_nodes)} expected node(s) down: {critical_nodes}"
+        )
+
+
+def main(args=None):
+    rclpy.init(args=args)
+    node = HealthMonitorNode()
+    try:
+        rclpy.spin(node)
+    except KeyboardInterrupt:
+        pass
+    finally:
+        node.destroy_node()
+        rclpy.shutdown()
+
+
+if __name__ == "__main__":
+    main()
--- a/jetson/ros2_ws/src/saltybot_health_monitor/setup.cfg
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/setup.cfg
@ -0,0 +1,2 @@
+[develop]
+script-dir=$base/lib/saltybot_health_monitor
--- a/jetson/ros2_ws/src/saltybot_health_monitor/setup.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/setup.py
@ -0,0 +1,30 @@
+from setuptools import setup
+
+package_name = "saltybot_health_monitor"
+
+setup(
+    name=package_name,
+    version="0.1.0",
+    packages=[package_name],
+    data_files=[
+        ("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
+        (f"share/{package_name}", ["package.xml"]),
+        (f"share/{package_name}/launch", ["launch/health_monitor.launch.py"]),
+        (f"share/{package_name}/config", ["config/health_config.yaml"]),
+    ],
+    install_requires=["setuptools", "pyyaml"],
+    zip_safe=True,
+    maintainer="sl-controls",
+    maintainer_email="sl-controls@saltylab.local",
+    description=(
+        "System health monitor: tracks node heartbeats, detects down nodes, "
+        "triggers auto-restart, publishes system health status"
+    ),
+    license="MIT",
+    tests_require=["pytest"],
+    entry_points={
+        "console_scripts": [
+            "health_monitor_node = saltybot_health_monitor.health_monitor_node:main",
+        ],
+    },
+)
--- a/jetson/ros2_ws/src/saltybot_health_monitor/test/__init__.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/test/__init__.py
@ -0,0 +1 @@
+# Test module
--- a/jetson/ros2_ws/src/saltybot_health_monitor/test/test_health_monitor.py
+++ b/jetson/ros2_ws/src/saltybot_health_monitor/test/test_health_monitor.py
@ -0,0 +1,76 @@
+"""Unit tests for health monitor."""
+
+import unittest
+import time
+from std_msgs.msg import String
+
+
+class TestHealthMonitor(unittest.TestCase):
+    """Test cases for health monitor node."""
+
+    def test_heartbeat_parsing(self):
+        """Test parsing of heartbeat messages."""
+        # Test message format: "node_name:data"
+        test_cases = [
+            ("rover_driver:alive", "rover_driver"),
+            ("slam_node:map_ready", "slam_node"),
+            ("nav2_bringup:planning", "nav2_bringup"),
+        ]
+
+        for heartbeat, expected_node in test_cases:
+            parts = heartbeat.split(":", 1)
+            node_name = parts[0].strip()
+            self.assertEqual(node_name, expected_node)
+
+    def test_timeout_detection(self):
+        """Test detection of stale heartbeats."""
+        heartbeat_timeout = 5.0
+        current_time = time.time()
+
+        # Fresh heartbeat
+        time_since_heartbeat = current_time - (current_time - 1.0)
+        self.assertLess(time_since_heartbeat, heartbeat_timeout)
+
+        # Stale heartbeat
+        stale_time = current_time - 10.0
+        time_since_heartbeat = current_time - stale_time
+        self.assertGreater(time_since_heartbeat, heartbeat_timeout)
+
+    def test_health_status_generation(self):
+        """Test generation of health status JSON."""
+        import json
+
+        health_data = {
+            "timestamp": "2025-03-05T10:00:00",
+            "uptime_s": 3600,
+            "nodes": {
+                "rover_driver": {
+                    "status": "UP",
+                    "time_since_heartbeat_s": 0.5,
+                    "heartbeat_count": 100,
+                    "restart_count": 0,
+                    "expected": True,
+                },
+                "slam_node": {
+                    "status": "DOWN",
+                    "time_since_heartbeat_s": 6.0,
+                    "heartbeat_count": 50,
+                    "restart_count": 1,
+                    "expected": True,
+                },
+            },
+            "critical_down": ["slam_node"],
+            "system_healthy": False,
+        }
+
+        # Should be serializable to JSON
+        json_str = json.dumps(health_data)
+        parsed = json.loads(json_str)
+
+        self.assertEqual(parsed["system_healthy"], False)
+        self.assertIn("slam_node", parsed["critical_down"])
+        self.assertEqual(parsed["nodes"]["rover_driver"]["status"], "UP")
+
+
+if __name__ == "__main__":
+    unittest.main()