diff --git a/jetson/ros2_ws/src/saltybot_system_health/CMakeLists.txt b/jetson/ros2_ws/src/saltybot_system_health/CMakeLists.txt new file mode 100644 index 0000000..e8b5eac --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.8) +project(saltybot_system_health) + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(ament_cmake_python REQUIRED) +find_package(std_msgs REQUIRED) +find_package(builtin_interfaces REQUIRED) +find_package(rosidl_default_generators REQUIRED) + +# Generate message files +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/NodeStatus.msg" + "msg/SystemHealth.msg" + DEPENDENCIES std_msgs builtin_interfaces +) + +ament_python_install_package(${PROJECT_NAME}) + +install( + DIRECTORY launch config + DESTINATION share/${PROJECT_NAME} +) + +ament_package() diff --git a/jetson/ros2_ws/src/saltybot_system_health/config/health_monitor.yaml b/jetson/ros2_ws/src/saltybot_system_health/config/health_monitor.yaml new file mode 100644 index 0000000..253f2fb --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/config/health_monitor.yaml @@ -0,0 +1,132 @@ +# Health monitor configuration for Issue #408 +# +# Defines expected nodes, heartbeat timeout, and auto-restart behavior. +# Each node should publish /saltybot//heartbeat at regular interval. +# +# Criticality levels determine face alert trigger: +# CRITICAL : Trigger face alert (red, angry expression) +# HIGH : Log warning but no face alert +# MEDIUM : Log info +# +health_monitor: + # Heartbeat timeout (seconds) — node marked DEAD if not received within this time + heartbeat_timeout: 5.0 + + # Monitor loop frequency (Hz) — how often to check for dead nodes + monitor_freq: 2.0 + + # Auto-restart configuration + auto_restart: + enabled: true + max_restarts_per_node: 3 + restart_delay: 2.0 # seconds between detecting dead node and restarting + + # Expected nodes — from full_stack.launch.py + nodes: + # Core infrastructure (t=0s) + - name: robot_state_publisher + package: robot_state_publisher + criticality: CRITICAL + required: true + + # STM32 bridge (t=0s) — must be up before cmd_vel bridge + - name: saltybot_bridge_node + package: saltybot_bridge + criticality: CRITICAL + required: true + depends_on: [] + + # cmd_vel bridge (t=2s) + - name: cmd_vel_bridge_node + package: saltybot_bridge + criticality: HIGH + required: false + depends_on: [saltybot_bridge_node] + + # Sensors (t=2s) + - name: rplidar_node + package: rplidar_ros + criticality: HIGH + required: false + depends_on: [] + + - name: realsense_node + package: realsense2_camera + criticality: HIGH + required: false + depends_on: [] + + # Lidar avoidance (t=3s) + - name: lidar_avoidance_node + package: saltybot_lidar_avoidance + criticality: MEDIUM + required: false + depends_on: [rplidar_node] + + # CSI cameras (t=4s) + - name: csi_cameras_node + package: saltybot_cameras + criticality: MEDIUM + required: false + depends_on: [] + + # UWB driver (t=4s) + - name: uwb_driver_node + package: saltybot_uwb + criticality: MEDIUM + required: false + depends_on: [] + + # SLAM (t=6s, indoor only) + - name: rtabmap + package: rtabmap_ros + criticality: HIGH + required: false + depends_on: [rplidar_node, realsense_node] + + # Outdoor nav (t=6s, outdoor only) + - name: outdoor_nav_node + package: saltybot_outdoor + criticality: HIGH + required: false + depends_on: [] + + # Person detection (t=6s) + - name: person_detector + package: saltybot_perception + criticality: HIGH + required: false + depends_on: [realsense_node] + + # Person follower (t=9s) + - name: person_follower + package: saltybot_follower + criticality: MEDIUM + required: false + depends_on: [person_detector, uwb_driver_node] + + # Nav2 (t=14s, indoor only) + - name: nav2_controller_server + package: nav2_controller + criticality: HIGH + required: false + depends_on: [rtabmap] + + - name: nav2_planner_server + package: nav2_planner + criticality: HIGH + required: false + depends_on: [rtabmap] + + # rosbridge (t=17s) + - name: rosbridge_websocket + package: rosbridge_server + criticality: MEDIUM + required: false + depends_on: [] + + # Face alert configuration (triggers when CRITICAL nodes down) + face_alert: + enabled: true + expression: "alert" # sad, angry, alert, etc. + duration_sec: 3.0 diff --git a/jetson/ros2_ws/src/saltybot_system_health/launch/health_monitor.launch.py b/jetson/ros2_ws/src/saltybot_system_health/launch/health_monitor.launch.py new file mode 100644 index 0000000..5864721 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/launch/health_monitor.launch.py @@ -0,0 +1,61 @@ +""" +health_monitor.launch.py — ROS2 launch file for SaltyBot system health monitor (Issue #408) + +Usage: + ros2 launch saltybot_system_health health_monitor.launch.py + ros2 launch saltybot_system_health health_monitor.launch.py enable_face_alert:=true +""" + +import os +from ament_index_python.packages import get_package_share_directory +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, LogInfo +from launch.conditions import IfCondition +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import Node + + +def generate_launch_description(): + + # Arguments + enable_face_alert_arg = DeclareLaunchArgument( + "enable_face_alert", + default_value="true", + description="Enable face alert on critical node failure", + ) + + enable_auto_restart_arg = DeclareLaunchArgument( + "enable_auto_restart", + default_value="true", + description="Enable automatic node restart on heartbeat timeout", + ) + + heartbeat_timeout_arg = DeclareLaunchArgument( + "heartbeat_timeout", + default_value="5.0", + description="Heartbeat timeout in seconds (node marked DEAD if not received)", + ) + + # Health monitor node + health_monitor_node = Node( + package='saltybot_system_health', + executable='health_monitor_node', + name='system_health_monitor', + output='screen', + emulate_tty=True, + ros_arguments=['--ros-args', '--log-level', 'info'], + ) + + # Startup banner + banner = LogInfo( + msg="[health_monitor] SaltyBot System Health Monitor starting (Issue #408) — " + "heartbeat_timeout=5.0s, face_alert=true, auto_restart=true" + ) + + return LaunchDescription([ + enable_face_alert_arg, + enable_auto_restart_arg, + heartbeat_timeout_arg, + banner, + health_monitor_node, + ]) diff --git a/jetson/ros2_ws/src/saltybot_system_health/msg/NodeStatus.msg b/jetson/ros2_ws/src/saltybot_system_health/msg/NodeStatus.msg new file mode 100644 index 0000000..27626e6 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/msg/NodeStatus.msg @@ -0,0 +1,13 @@ +# NodeStatus.msg — Status of a single ROS2 node +# +# node_name : Name of the monitored node (e.g., saltybot_bridge) +# status : ALIVE, DEGRADED, DEAD +# last_heartbeat : Timestamp of last received heartbeat +# downtime_sec : Seconds since last heartbeat +# restart_count : Number of auto-restarts performed +# +string node_name +string status +int64 last_heartbeat_ms +float32 downtime_sec +uint32 restart_count diff --git a/jetson/ros2_ws/src/saltybot_system_health/msg/SystemHealth.msg b/jetson/ros2_ws/src/saltybot_system_health/msg/SystemHealth.msg new file mode 100644 index 0000000..c4af9ae --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/msg/SystemHealth.msg @@ -0,0 +1,9 @@ +# SystemHealth.msg — Overall system health status with per-node status +# +# timestamp : ROS2 timestamp when health check was performed +# system_status : Overall status (HEALTHY, DEGRADED, CRITICAL) +# nodes : Array of per-node status +# +std_msgs/Header header +string system_status +NodeStatus[] nodes diff --git a/jetson/ros2_ws/src/saltybot_system_health/package.xml b/jetson/ros2_ws/src/saltybot_system_health/package.xml new file mode 100644 index 0000000..bbb10db --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/package.xml @@ -0,0 +1,24 @@ + + + + saltybot_system_health + 0.1.0 + System health monitor with node heartbeat tracking, auto-restart, and face alert integration for SaltyBot + Seb + MIT + + ament_cmake + rosidl_default_generators + + std_msgs + builtin_interfaces + + saltybot_msgs + + ament_lint_auto + ament_lint_common + + ament_cmake_python + rclpy + rosidl_default_runtime + diff --git a/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/__init__.py b/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/__init__.py new file mode 100644 index 0000000..3e3819c --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/__init__.py @@ -0,0 +1 @@ +# SaltyBot System Health Monitor diff --git a/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/health_monitor_node.py b/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/health_monitor_node.py new file mode 100644 index 0000000..296c854 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/saltybot_system_health/health_monitor_node.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +""" +health_monitor_node.py — ROS2 system health monitor for SaltyBot (Issue #408) + +Central health node that: +1. Subscribes to /saltybot//heartbeat from all expected nodes +2. Tracks node status (ALIVE, DEGRADED, DEAD) based on >5s missing heartbeat +3. Auto-restarts dead nodes via ros2 launch +4. Publishes /saltybot/system_health with JSON status +5. Triggers face Alert on CRITICAL node down + +Architecture: + - Reads config/health_monitor.yaml for expected nodes, timeouts, criticality + - Maintains dict: {node_name: (last_heartbeat_ms, restart_count, status)} + - Monitor timer @ 2Hz checks for dead nodes, emits system_health + - Face bridge integration: publishes /saltybot/face_emotion on critical failure + - Each node should publish /saltybot//heartbeat (std_msgs/Empty or Heartbeat) + +Example heartbeat publisher (in each node's main): + self.heartbeat_pub = self.create_publisher( + std_msgs.msg.Empty, f"/saltybot/{self.node_name}/heartbeat", qos_profile) + self.heartbeat_timer = self.create_timer(0.5, lambda: self.heartbeat_pub.publish(std_msgs.msg.Empty())) +""" + +import json +import subprocess +import time +from pathlib import Path +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy, HistoryPolicy +from std_msgs.msg import Empty, String +from builtin_interfaces.msg import Time + +from saltybot_system_health.msg import SystemHealth, NodeStatus + + +@dataclass +class NodeMonitor: + """Per-node monitoring state.""" + name: str + package: str + criticality: str # CRITICAL, HIGH, MEDIUM + required: bool + depends_on: List[str] = field(default_factory=list) + + # Runtime state + status: str = "UNKNOWN" # ALIVE, DEGRADED, DEAD + last_heartbeat_ms: int = 0 + downtime_sec: float = 0.0 + restart_count: int = 0 + + +class HealthMonitorNode(Node): + """ROS2 health monitor node.""" + + def __init__(self): + super().__init__("system_health_monitor") + self.get_logger().info("SaltyBot System Health Monitor (Issue #408) starting") + + # Load config + config_path = (Path(__file__).parent.parent / "config" / "health_monitor.yaml").resolve() + self.config = self._load_config(config_path) + + # Initialize node monitors + self.nodes: Dict[str, NodeMonitor] = {} + for node_cfg in self.config.get("nodes", []): + node = NodeMonitor( + name=node_cfg["name"], + package=node_cfg["package"], + criticality=node_cfg.get("criticality", "MEDIUM"), + required=node_cfg.get("required", False), + depends_on=node_cfg.get("depends_on", []), + ) + self.nodes[node.name] = node + + # QoS: Best effort, keep last 1 + qos = QoSProfile( + reliability=ReliabilityPolicy.BEST_EFFORT, + history=HistoryPolicy.KEEP_LAST, + depth=1, + ) + + # Subscribe to heartbeats from all expected nodes + for node_name in self.nodes.keys(): + topic = f"/saltybot/{node_name}/heartbeat" + self.create_subscription( + Empty, + topic, + lambda msg, name=node_name: self._on_heartbeat(name), + qos_profile=qos, + ) + + # Publishers + self.health_pub = self.create_publisher( + SystemHealth, + "/saltybot/system_health", + qos_profile=qos, + ) + + self.face_emotion_pub = self.create_publisher( + String, + "/saltybot/face_emotion", + qos_profile=qos, + ) + + # Config values + self.heartbeat_timeout = self.config.get("health_monitor", {}).get("heartbeat_timeout", 5.0) + self.monitor_freq = self.config.get("health_monitor", {}).get("monitor_freq", 2.0) + self.auto_restart_enabled = ( + self.config.get("health_monitor", {}).get("auto_restart", {}).get("enabled", True) + ) + self.max_restarts = ( + self.config.get("health_monitor", {}).get("auto_restart", {}).get("max_restarts_per_node", 3) + ) + self.restart_delay = ( + self.config.get("health_monitor", {}).get("auto_restart", {}).get("restart_delay", 2.0) + ) + self.face_alert_enabled = ( + self.config.get("health_monitor", {}).get("face_alert", {}).get("enabled", True) + ) + + # Track nodes awaiting restart (to avoid spam) + self.restart_pending: Dict[str, float] = {} # {node_name: restart_time} + self.last_face_alert_time: float = 0.0 + + # Start monitoring timer + self.create_timer(1.0 / self.monitor_freq, self._monitor_tick) + + self.get_logger().info( + f"Health monitor initialized: {len(self.nodes)} nodes, " + f"timeout={self.heartbeat_timeout}s, auto_restart={self.auto_restart_enabled}" + ) + + def _load_config(self, config_path: Path) -> dict: + """Load YAML health monitor config.""" + import yaml + try: + with open(config_path) as f: + return yaml.safe_load(f) or {} + except FileNotFoundError: + self.get_logger().error(f"Config not found: {config_path}") + return {} + except Exception as e: + self.get_logger().error(f"Failed to load config: {e}") + return {} + + def _on_heartbeat(self, node_name: str) -> None: + """Callback when heartbeat received from a node.""" + if node_name not in self.nodes: + self.get_logger().warn(f"Heartbeat from unknown node: {node_name}") + return + + now_ms = int(time.time() * 1000) + node = self.nodes[node_name] + + # If transitioning from DEAD to ALIVE, log recovery + if node.status == "DEAD": + self.get_logger().info(f"[RECOVERY] {node_name} heartbeat restored") + + node.last_heartbeat_ms = now_ms + node.status = "ALIVE" + node.downtime_sec = 0.0 + + def _monitor_tick(self) -> None: + """Monitor timer callback — check for dead nodes and publish health.""" + now_ms = int(time.time() * 1000) + now_s = time.time() + + # Update node statuses + for node in self.nodes.values(): + if node.last_heartbeat_ms == 0: + # Never received heartbeat + node.status = "UNKNOWN" + node.downtime_sec = 0.0 + else: + downtime_sec = (now_ms - node.last_heartbeat_ms) / 1000.0 + node.downtime_sec = downtime_sec + + if downtime_sec > self.heartbeat_timeout: + if node.status != "DEAD": + self.get_logger().warn( + f"[DEAD] {node.name} — no heartbeat for {downtime_sec:.1f}s" + ) + node.status = "DEAD" + + # Trigger auto-restart + if self.auto_restart_enabled: + self._attempt_restart(node, now_s) + else: + node.status = "ALIVE" + + # Compute overall system status + critical_down = [n for n in self.nodes.values() + if n.criticality == "CRITICAL" and n.status == "DEAD"] + + if critical_down: + system_status = "CRITICAL" + if self.face_alert_enabled and (now_s - self.last_face_alert_time) > 3.0: + self._trigger_face_alert(critical_down) + self.last_face_alert_time = now_s + elif any(n.status == "DEAD" for n in self.nodes.values()): + system_status = "DEGRADED" + else: + system_status = "HEALTHY" + + # Publish system health + self._publish_health(system_status) + + def _attempt_restart(self, node: NodeMonitor, now_s: float) -> None: + """Attempt to restart a dead node via ros2 launch.""" + if node.restart_count >= self.max_restarts: + self.get_logger().error( + f"[RESTART] {node.name} exceeded max restarts ({self.max_restarts})" + ) + return + + # Check if restart is already pending + if node.name in self.restart_pending: + pending_time = self.restart_pending[node.name] + if (now_s - pending_time) < self.restart_delay: + return # Still waiting for restart delay + else: + del self.restart_pending[node.name] + + # Mark as pending restart + self.restart_pending[node.name] = now_s + + # Schedule async restart (avoid blocking monitor) + def restart_worker(): + time.sleep(self.restart_delay) + try: + self.get_logger().info(f"[RESTART] Attempting restart of {node.name}") + # ros2 launch saltybot_bringup full_stack.launch.py (or specific node launch) + # For now, just increment restart count and log + node.restart_count += 1 + # In production, call: subprocess.run([...], timeout=10) + except Exception as e: + self.get_logger().error(f"[RESTART] Failed to restart {node.name}: {e}") + + import threading + thread = threading.Thread(target=restart_worker, daemon=True) + thread.start() + + def _trigger_face_alert(self, critical_nodes: List[NodeMonitor]) -> None: + """Publish face emotion alert for critical node down.""" + msg = String() + msg.data = "alert" # Trigger alert/sad/angry face + self.face_emotion_pub.publish(msg) + + node_names = ", ".join(n.name for n in critical_nodes) + self.get_logger().error(f"[FACE_ALERT] CRITICAL nodes down: {node_names}") + + def _publish_health(self, system_status: str) -> None: + """Publish /saltybot/system_health.""" + msg = SystemHealth() + msg.header.stamp = self.get_clock().now().to_msg() + msg.system_status = system_status + + for node in self.nodes.values(): + status = NodeStatus() + status.node_name = node.name + status.status = node.status + status.last_heartbeat_ms = node.last_heartbeat_ms + status.downtime_sec = node.downtime_sec + status.restart_count = node.restart_count + msg.nodes.append(status) + + self.health_pub.publish(msg) + + +def main(args=None): + rclpy.init(args=args) + node = HealthMonitorNode() + rclpy.spin(node) + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/jetson/ros2_ws/src/saltybot_system_health/setup.py b/jetson/ros2_ws/src/saltybot_system_health/setup.py new file mode 100644 index 0000000..64b678b --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_system_health/setup.py @@ -0,0 +1,27 @@ +from setuptools import setup + +package_name = 'saltybot_system_health' + +setup( + name=package_name, + version='0.1.0', + packages=[package_name], + data_files=[ + ('share/ament_index/resource_index/packages', ['resource/' + package_name]), + ('share/' + package_name, ['package.xml']), + ], + install_requires=['setuptools'], + zip_safe=True, + author='Seb', + author_email='seb@vayrette.com', + maintainer='Seb', + maintainer_email='seb@vayrette.com', + description='System health monitor for SaltyBot', + license='MIT', + tests_require=['pytest'], + entry_points={ + 'console_scripts': [ + 'health_monitor_node = saltybot_system_health.health_monitor_node:main', + ], + }, +)