diff --git a/jetson/ros2_ws/src/saltybot_health_monitor/README.md b/jetson/ros2_ws/src/saltybot_health_monitor/README.md new file mode 100644 index 0000000..54bd888 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_health_monitor/README.md @@ -0,0 +1,118 @@ +# SaltyBot Health Monitor + +Central system health monitor for SaltyBot. Tracks heartbeats from all critical nodes, detects failures, triggers auto-restart, and publishes system health status. + +## Features + +- **Heartbeat Monitoring**: Subscribes to heartbeat signals from all tracked nodes +- **Automatic Dead Node Detection**: Marks nodes as DOWN if silent >5 seconds +- **Auto-Restart Capability**: Attempts to restart dead nodes via ROS2 launch +- **System Health Publishing**: Publishes `/saltybot/system_health` JSON with full status +- **Face Alerts**: Triggers visual alerts on robot face display for critical failures +- **Configurable**: YAML-based node list and timeout parameters + +## Topics + +### Subscribed +- `/saltybot//heartbeat` (std_msgs/String): Heartbeat from each monitored node + +### Published +- `/saltybot/system_health` (std_msgs/String): System health status as JSON +- `/saltybot/face/alert` (std_msgs/String): Critical alerts for face display + +## Configuration + +Edit `config/health_config.yaml` to configure: + +- **monitored_nodes**: List of all nodes to track +- **heartbeat_timeout_s**: Seconds before node is marked DOWN (default: 5s) +- **check_frequency_hz**: Health check rate (default: 1Hz) +- **enable_auto_restart**: Enable automatic restart attempts (default: true) +- **critical_nodes**: Nodes that trigger face alerts when down + +## Launch + +```bash +# Default launch with built-in config +ros2 launch saltybot_health_monitor health_monitor.launch.py + +# Custom config +ros2 launch saltybot_health_monitor health_monitor.launch.py \ + config_file:=/path/to/custom_config.yaml + +# Disable auto-restart +ros2 launch saltybot_health_monitor health_monitor.launch.py \ + 
enable_auto_restart:=false +``` + +## Health Status JSON + +The `/saltybot/system_health` topic publishes: + +```json +{ + "timestamp": "2025-03-05T10:00:00.123456", + "uptime_s": 3600.5, + "nodes": { + "rover_driver": { + "status": "UP", + "time_since_heartbeat_s": 0.5, + "heartbeat_count": 1200, + "restart_count": 0, + "expected": true + }, + "slam_node": { + "status": "DOWN", + "time_since_heartbeat_s": 6.0, + "heartbeat_count": 500, + "restart_count": 1, + "expected": true + } + }, + "critical_down": ["slam_node"], + "system_healthy": false +} +``` + +## Node Integration + +Each node should publish heartbeats periodically (e.g., every 1-2 seconds): + +```python +# In your ROS2 node +heartbeat_pub = self.create_publisher(String, "/saltybot/node_name/heartbeat", 10) +heartbeat_pub.publish(String(data="node_name:alive")) +``` + +## Restart Behavior + +When a node is detected as DOWN: + +1. Health monitor logs a warning +2. If `enable_auto_restart: true`, queues a restart command +3. Node status changes to "RESTARTING" +4. Restart count is incremented +5. 
Face alert is published for critical nodes + +The actual restart mechanism can be: +- Direct ROS2 launch subprocess +- Systemd service restart +- Custom restart script +- Manual restart via external monitor + +## Debugging + +Check health status: +```bash +ros2 topic echo /saltybot/system_health +``` + +Simulate a node heartbeat: +```bash +ros2 topic pub /saltybot/test_node/heartbeat std_msgs/String '{data: "test_node:alive"}' +``` + +View monitor logs: +```bash +ros2 launch saltybot_health_monitor health_monitor.launch.py | grep health +``` diff --git a/jetson/ros2_ws/src/saltybot_health_monitor/config/health_config.yaml b/jetson/ros2_ws/src/saltybot_health_monitor/config/health_config.yaml new file mode 100644 index 0000000..2551e81 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_health_monitor/config/health_config.yaml @@ -0,0 +1,76 @@ +# Health Monitor Configuration +# Lists all critical nodes that should be monitored for heartbeats + +monitored_nodes: + # Core drivers and hardware interfaces + - rover_driver + - camera_driver + - lidar_driver + - imu_driver + - uwb_driver + + # SLAM and localization + - slam_node + - odom_fusion + - visual_odom + + # Navigation + - nav2_bringup + - planner_server + - controller_server + + # Perception + - person_detector + - object_tracker + + # Control and decision making + - follower + - cmd_vel_bridge + - emergency_handler + + # Communication + - rosbridge_websocket + - cellular_link + + # Utilities + - bag_recorder + - remote_monitor + +# Health check parameters +health_check: + # Node is considered DOWN if heartbeat hasn't been received in this many seconds + heartbeat_timeout_s: 5 + + # How often to check node health (Hz) + check_frequency_hz: 1 + + # Whether to attempt automatic restart of dead nodes + enable_auto_restart: true + + # Alert cooldown to avoid spam (seconds) + alert_cooldown_s: 5 + +# Restart behavior +restart: + # Command file to write restart commands to + command_file: /tmp/saltybot_restart_queue.sh + + 
"""Launch health monitor node."""

import os

from ament_index_python.packages import get_package_share_directory
from launch import LaunchDescription
from launch.actions import DeclareLaunchArgument
from launch.substitutions import LaunchConfiguration
from launch_ros.actions import Node
from launch_ros.parameter_descriptions import ParameterValue


def generate_launch_description():
    """Generate launch description for health monitor.

    Declares three launch arguments (``config_file``, ``heartbeat_timeout``,
    ``enable_auto_restart``) and starts ``health_monitor_node`` with them
    plus a fixed 1 Hz check frequency.
    """
    package_dir = get_package_share_directory("saltybot_health_monitor")
    config_dir = os.path.join(package_dir, "config")

    # Launch arguments
    config_file_arg = DeclareLaunchArgument(
        "config_file",
        default_value=os.path.join(config_dir, "health_config.yaml"),
        description="Path to health monitor configuration YAML file",
    )

    heartbeat_timeout_arg = DeclareLaunchArgument(
        "heartbeat_timeout",
        default_value="5.0",
        description="Heartbeat timeout in seconds (node marked DOWN if silent longer)",
    )

    enable_auto_restart_arg = DeclareLaunchArgument(
        "enable_auto_restart",
        default_value="true",
        description="Enable automatic restart of dead nodes",
    )

    # BUG FIX: LaunchConfiguration substitutions resolve to *strings*.  The
    # node declares heartbeat_timeout as a double and enable_auto_restart as a
    # bool, so passing the raw substitution raises
    # InvalidParameterTypeException at startup.  Wrap them in ParameterValue
    # with an explicit value_type so launch coerces them correctly.
    health_monitor_node = Node(
        package="saltybot_health_monitor",
        executable="health_monitor_node",
        name="health_monitor",
        output="screen",
        parameters=[
            {
                "config_file": LaunchConfiguration("config_file"),
                "heartbeat_timeout": ParameterValue(
                    LaunchConfiguration("heartbeat_timeout"), value_type=float
                ),
                "enable_auto_restart": ParameterValue(
                    LaunchConfiguration("enable_auto_restart"), value_type=bool
                ),
                "check_frequency": 1.0,  # Hz
            }
        ],
    )

    return LaunchDescription([
        config_file_arg,
        heartbeat_timeout_arg,
        enable_auto_restart_arg,
        health_monitor_node,
    ])
#!/usr/bin/env python3
"""System health monitor for SaltyBot.

Central node that monitors heartbeats from all critical nodes.  Tracks
expected nodes from a YAML config, marks nodes DOWN if silent longer than
the heartbeat timeout, queues auto-restart commands (capped at
max_restart_attempts), publishes /saltybot/system_health JSON, and triggers
face alerts on critical failures.

Published topics:
    /saltybot/system_health (std_msgs/String) - JSON system health status
    /saltybot/face/alert (std_msgs/String) - critical failure alerts

Subscribed topics:
    /saltybot/<node_name>/heartbeat (std_msgs/String) - node heartbeat signals
"""

import json
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict

import yaml
import rclpy
from rclpy.node import Node
from rclpy.timer import Timer
from std_msgs.msg import String


@dataclass
class NodeHealth:
    """Health status of a single monitored node."""

    name: str
    status: str  # "UNKNOWN", "UP", "DOWN" or "RESTARTING"
    last_heartbeat: float  # time.time() of the last received heartbeat
    heartbeat_count: int = 0  # total heartbeats received
    restart_count: int = 0  # restart attempts made for this node
    expected: bool = True  # True if the node was listed in the config
    last_restart: float = 0.0  # time.time() of the last restart attempt


class HealthMonitorNode(Node):
    """ROS2 node for system health monitoring."""

    def __init__(self):
        super().__init__("health_monitor")

        # Load configuration
        self.declare_parameter("config_file", "health_config.yaml")
        config_path = self.get_parameter("config_file").value

        self.node_health: Dict[str, NodeHealth] = {}
        self.startup_time = time.time()
        self.last_critical_alert = 0.0
        # Defaults; overridden by the YAML config in _load_config() if present.
        self.alert_cooldown = 5.0  # seconds between critical face alerts
        self.max_restart_attempts = 3  # give up restarting after this many tries

        # Load node configuration
        self._load_config(config_path)

        # Parameters
        self.declare_parameter("heartbeat_timeout", 5.0)  # Seconds
        self.declare_parameter("check_frequency", 1.0)  # Hz
        self.declare_parameter("enable_auto_restart", True)
        self.declare_parameter("restart_command_file", "/tmp/restart_node.sh")

        self.heartbeat_timeout = self.get_parameter("heartbeat_timeout").value
        check_frequency = self.get_parameter("check_frequency").value
        self.enable_auto_restart = self.get_parameter("enable_auto_restart").value
        self.restart_cmd_file = self.get_parameter("restart_command_file").value

        # Subscribe to heartbeats from all expected nodes
        self._setup_subscriptions()

        # Publishers for system health and face alerts
        self.pub_health = self.create_publisher(String, "/saltybot/system_health", 1)
        self.pub_face_alert = self.create_publisher(String, "/saltybot/face/alert", 1)

        # Health check timer
        period = 1.0 / check_frequency
        self.timer: Timer = self.create_timer(period, self._check_health)

        self.get_logger().info(
            f"Health monitor initialized with {len(self.node_health)} tracked nodes. "
            f"Timeout: {self.heartbeat_timeout}s, Auto-restart: {self.enable_auto_restart}"
        )

    def _load_config(self, config_file: str) -> None:
        """Load the monitored-node list and tuning values from a YAML file.

        Falls back to heartbeat-driven discovery when the file is missing.
        """
        try:
            # Try to find config in share directory
            if not Path(config_file).exists():
                # Look in package share directory
                share_dir = Path(__file__).parent.parent / "config"
                config_file = str(share_dir / config_file)

            with open(config_file, "r") as f:
                config = yaml.safe_load(f) or {}

            monitored_nodes = config.get("monitored_nodes", [])
            for node_name in monitored_nodes:
                self.node_health[node_name] = NodeHealth(
                    name=node_name, status="UNKNOWN", last_heartbeat=time.time()
                )

            # Optional tuning values from the config; keep defaults when absent.
            health_cfg = config.get("health_check") or {}
            self.alert_cooldown = float(
                health_cfg.get("alert_cooldown_s", self.alert_cooldown)
            )
            restart_cfg = config.get("restart") or {}
            self.max_restart_attempts = int(
                restart_cfg.get("max_restart_attempts", self.max_restart_attempts)
            )

            self.get_logger().info(f"Loaded {len(monitored_nodes)} nodes from config")
        except FileNotFoundError:
            self.get_logger().warn(
                f"Config file not found: {config_file}. "
                "Will monitor nodes as they send heartbeats."
            )

    def _setup_subscriptions(self) -> None:
        """Create heartbeat subscriptions for all expected nodes."""
        for node_name in self.node_health.keys():
            topic = f"/saltybot/{node_name}/heartbeat"
            self.create_subscription(String, topic, self._on_heartbeat, 10)

    def _on_heartbeat(self, msg: String) -> None:
        """Record an incoming heartbeat.  Expected payload: "node_name[:data]"."""
        node_name = msg.data.split(":", 1)[0].strip()
        if not node_name:
            self.get_logger().error(f"Malformed heartbeat ignored: {msg.data!r}")
            return

        # Create node entry if not yet tracked (unlisted nodes are monitored
        # but never counted as critical).
        if node_name not in self.node_health:
            self.node_health[node_name] = NodeHealth(
                name=node_name, status="UP", last_heartbeat=time.time(), expected=False
            )

        # Update heartbeat bookkeeping
        node = self.node_health[node_name]
        node.last_heartbeat = time.time()
        node.heartbeat_count += 1
        if node.status != "UP":
            node.status = "UP"
            self.get_logger().info(f"Node {node_name} is UP")

    def _check_health(self) -> None:
        """Periodically check health of all nodes and publish status.

        Restart policy (BUG FIX vs. earlier revision, which re-warned and
        re-queued a restart on *every* tick): a node is restarted at most once
        per DOWN transition, a RESTARTING node gets one full timeout period to
        come back before being declared DOWN again, and restarts stop after
        max_restart_attempts.
        """
        now = time.time()
        critical_down = []

        for node_name, node in self.node_health.items():
            time_since_heartbeat = now - node.last_heartbeat

            if time_since_heartbeat > self.heartbeat_timeout:
                # Track critical (expected) nodes every tick while stale so the
                # published status reflects the ongoing outage.
                if node.expected:
                    critical_down.append(node_name)

                # A restarted node that stayed silent for another full timeout
                # period is considered DOWN again (eligible for a retry).
                if (
                    node.status == "RESTARTING"
                    and now - node.last_restart > self.heartbeat_timeout
                ):
                    node.status = "DOWN"

                if node.status not in ("DOWN", "RESTARTING"):
                    self.get_logger().warn(
                        f"Node {node_name} DOWN (silent for {time_since_heartbeat:.1f}s)"
                    )
                    node.status = "DOWN"

                # Attempt auto-restart, bounded by max_restart_attempts.
                if (
                    self.enable_auto_restart
                    and node.status == "DOWN"
                    and node.restart_count < self.max_restart_attempts
                ):
                    self._trigger_restart(node_name)
            else:
                # Node is healthy
                if node.status != "UP":
                    node.status = "UP"

        # Publish system health
        self._publish_health(critical_down)

        # Alert face if critical nodes are down
        if critical_down:
            self._alert_critical(critical_down, now)

    def _trigger_restart(self, node_name: str) -> None:
        """Queue a restart command for a dead node."""
        node = self.node_health[node_name]
        node.restart_count += 1

        self.get_logger().warn(
            f"Attempting auto-restart for {node_name} (attempt #{node.restart_count})"
        )

        # Update status and remember when we tried, so _check_health can give
        # the node a grace period before retrying.
        node.status = "RESTARTING"
        node.last_restart = time.time()

        # The restart itself is delegated: commands are appended to a script
        # consumed by an external runner (systemd unit, watchdog, or manual).
        try:
            cmd_path = Path(self.restart_cmd_file)
            lines = []
            if not cmd_path.exists():
                # Write the shebang only when creating the file (BUG FIX: the
                # earlier revision appended a fresh shebang on every restart).
                lines.append("#!/bin/bash\n")
            lines.append(f"# Auto-restart triggered at {datetime.now().isoformat()}\n")
            lines.append(f"ros2 launch saltybot_bringup {node_name}.launch.py &\n")
            with open(self.restart_cmd_file, "a") as f:
                f.writelines(lines)

            self.get_logger().info(f"Restart command queued for {node_name}")
        except OSError as e:
            self.get_logger().error(f"Failed to queue restart for {node_name}: {e}")

    def _publish_health(self, critical_down: list) -> None:
        """Publish system health status as JSON on /saltybot/system_health."""
        now = time.time()  # single snapshot so all per-node ages are consistent
        health_data = {
            "timestamp": datetime.now().isoformat(),
            "uptime_s": now - self.startup_time,
            "nodes": {
                node.name: {
                    "status": node.status,
                    "time_since_heartbeat_s": now - node.last_heartbeat,
                    "heartbeat_count": node.heartbeat_count,
                    "restart_count": node.restart_count,
                    "expected": node.expected,
                }
                for node in self.node_health.values()
            },
            "critical_down": critical_down,
            "system_healthy": len(critical_down) == 0,
        }

        msg = String(data=json.dumps(health_data))
        self.pub_health.publish(msg)

    def _alert_critical(self, critical_nodes: list, now: float) -> None:
        """Alert the face display of critical node failures (rate-limited)."""
        # Rate-limit alerts to avoid spam
        if now - self.last_critical_alert < self.alert_cooldown:
            return

        self.last_critical_alert = now

        alert_msg = {
            "type": "system_alert",
            "severity": "critical",
            "message": f"System critical: {', '.join(critical_nodes)} down",
            "nodes": critical_nodes,
            "timestamp": datetime.now().isoformat(),
        }

        msg = String(data=json.dumps(alert_msg))
        self.pub_face_alert.publish(msg)

        self.get_logger().warn(
            f"CRITICAL ALERT: {len(critical_nodes)} expected node(s) down: {critical_nodes}"
        )


def main(args=None):
    """Entry point: spin the health monitor until interrupted."""
    rclpy.init(args=args)
    node = HealthMonitorNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        node.destroy_node()
        rclpy.shutdown()


if __name__ == "__main__":
    main()
from setuptools import setup

PACKAGE_NAME = "saltybot_health_monitor"

# Files installed into the ament share tree alongside the Python package:
# the marker resource, the manifest, the launch file and the default config.
_DATA_FILES = [
    ("share/ament_index/resource_index/packages", [f"resource/{PACKAGE_NAME}"]),
    (f"share/{PACKAGE_NAME}", ["package.xml"]),
    (f"share/{PACKAGE_NAME}/launch", ["launch/health_monitor.launch.py"]),
    (f"share/{PACKAGE_NAME}/config", ["config/health_config.yaml"]),
]

setup(
    name=PACKAGE_NAME,
    version="0.1.0",
    packages=[PACKAGE_NAME],
    data_files=_DATA_FILES,
    install_requires=["setuptools", "pyyaml"],
    zip_safe=True,
    maintainer="sl-controls",
    maintainer_email="sl-controls@saltylab.local",
    description=(
        "System health monitor: tracks node heartbeats, detects down nodes, "
        "triggers auto-restart, publishes system health status"
    ),
    license="MIT",
    tests_require=["pytest"],
    entry_points={
        "console_scripts": [
            "health_monitor_node = saltybot_health_monitor.health_monitor_node:main",
        ],
    },
)
"""Unit tests for health monitor."""

import json
import time
import unittest


class TestHealthMonitor(unittest.TestCase):
    """Test cases for health monitor node.

    Note: the previous revision imported std_msgs.msg.String without using
    it, which made these pure-logic tests unimportable without a ROS
    installation; the unused import has been removed.
    """

    def test_heartbeat_parsing(self):
        """Heartbeat payloads of the form 'node_name:data' yield the node name."""
        test_cases = [
            ("rover_driver:alive", "rover_driver"),
            ("slam_node:map_ready", "slam_node"),
            ("nav2_bringup:planning", "nav2_bringup"),
            ("bare_name", "bare_name"),  # payload without a ':' separator
            ("  padded :alive", "padded"),  # surrounding whitespace is stripped
        ]

        for heartbeat, expected_node in test_cases:
            node_name = heartbeat.split(":", 1)[0].strip()
            self.assertEqual(node_name, expected_node)

    def test_timeout_detection(self):
        """Heartbeats older than the timeout are stale; recent ones are not.

        BUG FIX: the previous "fresh" case computed
        ``current_time - (current_time - 1.0)``, a tautology that could never
        fail.  Use explicit last-heartbeat timestamps instead.
        """
        heartbeat_timeout = 5.0
        now = time.time()

        # Fresh heartbeat (received 1 second ago)
        fresh_heartbeat = now - 1.0
        self.assertLess(now - fresh_heartbeat, heartbeat_timeout)

        # Stale heartbeat (received 10 seconds ago)
        stale_heartbeat = now - 10.0
        self.assertGreater(now - stale_heartbeat, heartbeat_timeout)

        # Boundary: exactly at the timeout is NOT yet stale (monitor uses '>').
        # Fixed values avoid float-rounding surprises near the boundary.
        self.assertFalse(1005.0 - 1000.0 > heartbeat_timeout)

    def test_health_status_generation(self):
        """The health status dict round-trips through JSON intact."""
        health_data = {
            "timestamp": "2025-03-05T10:00:00",
            "uptime_s": 3600,
            "nodes": {
                "rover_driver": {
                    "status": "UP",
                    "time_since_heartbeat_s": 0.5,
                    "heartbeat_count": 100,
                    "restart_count": 0,
                    "expected": True,
                },
                "slam_node": {
                    "status": "DOWN",
                    "time_since_heartbeat_s": 6.0,
                    "heartbeat_count": 50,
                    "restart_count": 1,
                    "expected": True,
                },
            },
            "critical_down": ["slam_node"],
            "system_healthy": False,
        }

        # Should be serializable to JSON
        json_str = json.dumps(health_data)
        parsed = json.loads(json_str)

        self.assertEqual(parsed["system_healthy"], False)
        self.assertIn("slam_node", parsed["critical_down"])
        self.assertEqual(parsed["nodes"]["rover_driver"]["status"], "UP")


if __name__ == "__main__":
    unittest.main()