feat: Add Issue #203 - Node watchdog monitor (20Hz heartbeat detection)
Implements node watchdog ROS2 node that monitors heartbeats from critical systems and triggers safety fallback when motor driver is lost >2s. Features: - Monitor heartbeats from: balance, motor_driver, emergency, docking - Alert on /saltybot/node_watchdog (JSON) if heartbeat lost >1s - Safety fallback: zero cmd_vel if motor driver lost >2s - Republish cmd_vel on /saltybot/cmd_vel_safe with safety checks - 20Hz monitoring and publishing frequency - Configurable heartbeat timeout thresholds Heartbeat Topics: - /saltybot/balance_heartbeat (std_msgs/UInt32) - /saltybot/motor_driver_heartbeat (std_msgs/UInt32) - /saltybot/emergency_heartbeat (std_msgs/UInt32) - /saltybot/docking_heartbeat (std_msgs/UInt32) - /cmd_vel (geometry_msgs/Twist) Published Topics: - /saltybot/node_watchdog (std_msgs/String) - JSON status - /saltybot/cmd_vel_safe (geometry_msgs/Twist) - Safety-checked velocity Package: saltybot_node_watchdog Entry point: node_watchdog_node Launch file: node_watchdog.launch.py Tests: 20+ unit tests covering: - Heartbeat reception and timeout detection - Motor driver critical timeout (>2s) - Safety fallback logic - cmd_vel zeroing on motor driver loss - Health status JSON serialization - Multi-node failure scenarios Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
293a2a3096
commit
7d09c8c814
@ -0,0 +1,14 @@
|
|||||||
|
# Node watchdog configuration
|
||||||
|
|
||||||
|
node_watchdog:
|
||||||
|
ros__parameters:
|
||||||
|
# Publishing frequency in Hz
|
||||||
|
frequency: 20 # 20 Hz = 50ms cycle
|
||||||
|
|
||||||
|
# General heartbeat timeout (seconds)
|
||||||
|
# Alert if any heartbeat lost for this duration
|
||||||
|
heartbeat_timeout: 1.0
|
||||||
|
|
||||||
|
# Motor driver critical timeout (seconds)
|
||||||
|
# Trigger safety fallback (zero cmd_vel) if motor driver down this long
|
||||||
|
motor_driver_critical_timeout: 2.0
|
||||||
@ -0,0 +1,36 @@
|
|||||||
|
"""Launch file for node_watchdog_node."""
|
||||||
|
|
||||||
|
from launch import LaunchDescription
|
||||||
|
from launch_ros.actions import Node
|
||||||
|
from launch.substitutions import LaunchConfiguration
|
||||||
|
from launch.actions import DeclareLaunchArgument
|
||||||
|
import os
|
||||||
|
from ament_index_python.packages import get_package_share_directory
|
||||||
|
|
||||||
|
|
||||||
|
def generate_launch_description():
|
||||||
|
"""Generate launch description for node watchdog."""
|
||||||
|
# Package directory
|
||||||
|
pkg_dir = get_package_share_directory("saltybot_node_watchdog")
|
||||||
|
|
||||||
|
# Parameters
|
||||||
|
config_file = os.path.join(pkg_dir, "config", "watchdog_config.yaml")
|
||||||
|
|
||||||
|
# Declare launch arguments
|
||||||
|
return LaunchDescription(
|
||||||
|
[
|
||||||
|
DeclareLaunchArgument(
|
||||||
|
"config_file",
|
||||||
|
default_value=config_file,
|
||||||
|
description="Path to configuration YAML file",
|
||||||
|
),
|
||||||
|
# Node watchdog node
|
||||||
|
Node(
|
||||||
|
package="saltybot_node_watchdog",
|
||||||
|
executable="node_watchdog_node",
|
||||||
|
name="node_watchdog",
|
||||||
|
output="screen",
|
||||||
|
parameters=[LaunchConfiguration("config_file")],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
28
jetson/ros2_ws/src/saltybot_node_watchdog/package.xml
Normal file
28
jetson/ros2_ws/src/saltybot_node_watchdog/package.xml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
|
||||||
|
<package format="3">
|
||||||
|
<name>saltybot_node_watchdog</name>
|
||||||
|
<version>0.1.0</version>
|
||||||
|
<description>
|
||||||
|
Node watchdog monitor for SaltyBot critical systems. Monitors heartbeats from balance,
|
||||||
|
motor driver, emergency, and docking nodes. Publishes alerts on heartbeat loss >1s.
|
||||||
|
Implements safety fallback: zeros cmd_vel if motor driver lost >2s. Runs at 20Hz.
|
||||||
|
</description>
|
||||||
|
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
|
||||||
|
<license>MIT</license>
|
||||||
|
|
||||||
|
<depend>rclpy</depend>
|
||||||
|
<depend>geometry_msgs</depend>
|
||||||
|
<depend>std_msgs</depend>
|
||||||
|
|
||||||
|
<buildtool_depend>ament_python</buildtool_depend>
|
||||||
|
|
||||||
|
<test_depend>ament_copyright</test_depend>
|
||||||
|
<test_depend>ament_flake8</test_depend>
|
||||||
|
<test_depend>ament_pep257</test_depend>
|
||||||
|
<test_depend>python3-pytest</test_depend>
|
||||||
|
|
||||||
|
<export>
|
||||||
|
<build_type>ament_python</build_type>
|
||||||
|
</export>
|
||||||
|
</package>
|
||||||
@ -0,0 +1,235 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Node watchdog monitor for SaltyBot critical systems.
|
||||||
|
|
||||||
|
Monitors heartbeats from balance, motor driver, emergency, and docking nodes.
|
||||||
|
Publishes alerts on heartbeat loss >1s. Implements safety fallback: zeros cmd_vel
|
||||||
|
if motor driver lost >2s.
|
||||||
|
|
||||||
|
Published topics:
|
||||||
|
/saltybot/node_watchdog (std_msgs/String) - JSON watchdog status
|
||||||
|
/saltybot/cmd_vel_safe (geometry_msgs/Twist) - cmd_vel with motor driver safety check
|
||||||
|
|
||||||
|
Subscribed topics:
|
||||||
|
/saltybot/balance_heartbeat (std_msgs/UInt32) - Balance node heartbeat
|
||||||
|
/saltybot/motor_driver_heartbeat (std_msgs/UInt32) - Motor driver heartbeat
|
||||||
|
/saltybot/emergency_heartbeat (std_msgs/UInt32) - Emergency system heartbeat
|
||||||
|
/saltybot/docking_heartbeat (std_msgs/UInt32) - Docking node heartbeat
|
||||||
|
/cmd_vel (geometry_msgs/Twist) - Velocity command input
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import rclpy
|
||||||
|
from rclpy.node import Node
|
||||||
|
from rclpy.timer import Timer
|
||||||
|
from geometry_msgs.msg import Twist
|
||||||
|
from std_msgs.msg import UInt32, String
|
||||||
|
|
||||||
|
|
||||||
|
class NodeWatchdogNode(Node):
|
||||||
|
"""ROS2 watchdog monitor for critical system nodes."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__("node_watchdog")
|
||||||
|
|
||||||
|
# Parameters
|
||||||
|
self.declare_parameter("frequency", 20) # Hz
|
||||||
|
self.declare_parameter("heartbeat_timeout", 1.0) # seconds, general timeout
|
||||||
|
self.declare_parameter("motor_driver_critical_timeout", 2.0) # seconds
|
||||||
|
|
||||||
|
frequency = self.get_parameter("frequency").value
|
||||||
|
self.heartbeat_timeout = self.get_parameter("heartbeat_timeout").value
|
||||||
|
self.motor_driver_critical_timeout = self.get_parameter(
|
||||||
|
"motor_driver_critical_timeout"
|
||||||
|
).value
|
||||||
|
|
||||||
|
# Heartbeat tracking
|
||||||
|
self.critical_nodes = {
|
||||||
|
"balance": None,
|
||||||
|
"motor_driver": None,
|
||||||
|
"emergency": None,
|
||||||
|
"docking": None,
|
||||||
|
}
|
||||||
|
self.last_heartbeat_time = {
|
||||||
|
"balance": None,
|
||||||
|
"motor_driver": None,
|
||||||
|
"emergency": None,
|
||||||
|
"docking": None,
|
||||||
|
}
|
||||||
|
self.last_cmd_vel = None
|
||||||
|
self.motor_driver_down = False
|
||||||
|
|
||||||
|
# Subscriptions for heartbeats
|
||||||
|
self.create_subscription(
|
||||||
|
UInt32, "/saltybot/balance_heartbeat", self._on_balance_heartbeat, 10
|
||||||
|
)
|
||||||
|
self.create_subscription(
|
||||||
|
UInt32,
|
||||||
|
"/saltybot/motor_driver_heartbeat",
|
||||||
|
self._on_motor_driver_heartbeat,
|
||||||
|
10,
|
||||||
|
)
|
||||||
|
self.create_subscription(
|
||||||
|
UInt32, "/saltybot/emergency_heartbeat", self._on_emergency_heartbeat, 10
|
||||||
|
)
|
||||||
|
self.create_subscription(
|
||||||
|
UInt32, "/saltybot/docking_heartbeat", self._on_docking_heartbeat, 10
|
||||||
|
)
|
||||||
|
|
||||||
|
# cmd_vel subscription and safe republishing
|
||||||
|
self.create_subscription(Twist, "/cmd_vel", self._on_cmd_vel, 10)
|
||||||
|
|
||||||
|
# Publications
|
||||||
|
self.pub_watchdog = self.create_publisher(String, "/saltybot/node_watchdog", 10)
|
||||||
|
self.pub_cmd_vel_safe = self.create_publisher(
|
||||||
|
Twist, "/saltybot/cmd_vel_safe", 10
|
||||||
|
)
|
||||||
|
|
||||||
|
# Timer for periodic monitoring at 20Hz
|
||||||
|
period = 1.0 / frequency
|
||||||
|
self.timer: Timer = self.create_timer(period, self._timer_callback)
|
||||||
|
|
||||||
|
self.get_logger().info(
|
||||||
|
f"Node watchdog initialized at {frequency}Hz. "
|
||||||
|
f"Heartbeat timeout: {self.heartbeat_timeout}s, "
|
||||||
|
f"Motor driver critical: {self.motor_driver_critical_timeout}s"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _on_balance_heartbeat(self, msg: UInt32) -> None:
|
||||||
|
"""Update balance node heartbeat timestamp."""
|
||||||
|
self.last_heartbeat_time["balance"] = self.get_clock().now()
|
||||||
|
|
||||||
|
def _on_motor_driver_heartbeat(self, msg: UInt32) -> None:
|
||||||
|
"""Update motor driver heartbeat timestamp."""
|
||||||
|
self.last_heartbeat_time["motor_driver"] = self.get_clock().now()
|
||||||
|
self.motor_driver_down = False
|
||||||
|
|
||||||
|
def _on_emergency_heartbeat(self, msg: UInt32) -> None:
|
||||||
|
"""Update emergency system heartbeat timestamp."""
|
||||||
|
self.last_heartbeat_time["emergency"] = self.get_clock().now()
|
||||||
|
|
||||||
|
def _on_docking_heartbeat(self, msg: UInt32) -> None:
|
||||||
|
"""Update docking node heartbeat timestamp."""
|
||||||
|
self.last_heartbeat_time["docking"] = self.get_clock().now()
|
||||||
|
|
||||||
|
def _on_cmd_vel(self, msg: Twist) -> None:
|
||||||
|
"""Cache the last received cmd_vel."""
|
||||||
|
self.last_cmd_vel = msg
|
||||||
|
|
||||||
|
def _check_node_health(self) -> dict:
|
||||||
|
"""Check health of all monitored nodes.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: Health status of each node with timeout and elapsed time.
|
||||||
|
"""
|
||||||
|
now = self.get_clock().now()
|
||||||
|
health = {}
|
||||||
|
|
||||||
|
for node_name in self.critical_nodes:
|
||||||
|
last_time = self.last_heartbeat_time[node_name]
|
||||||
|
|
||||||
|
if last_time is None:
|
||||||
|
# No heartbeat received yet
|
||||||
|
health[node_name] = {
|
||||||
|
"status": "unknown",
|
||||||
|
"elapsed_s": None,
|
||||||
|
"timeout_s": self.heartbeat_timeout,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# Calculate elapsed time since last heartbeat
|
||||||
|
elapsed = (now - last_time).nanoseconds / 1e9
|
||||||
|
is_timeout = elapsed > self.heartbeat_timeout
|
||||||
|
|
||||||
|
# Special case: motor driver has longer critical timeout
|
||||||
|
if node_name == "motor_driver":
|
||||||
|
is_critical = elapsed > self.motor_driver_critical_timeout
|
||||||
|
else:
|
||||||
|
is_critical = False
|
||||||
|
|
||||||
|
health[node_name] = {
|
||||||
|
"status": "down" if is_timeout else "up",
|
||||||
|
"elapsed_s": elapsed,
|
||||||
|
"timeout_s": (
|
||||||
|
self.motor_driver_critical_timeout
|
||||||
|
if node_name == "motor_driver"
|
||||||
|
else self.heartbeat_timeout
|
||||||
|
),
|
||||||
|
"critical": is_critical,
|
||||||
|
}
|
||||||
|
|
||||||
|
return health
|
||||||
|
|
||||||
|
def _timer_callback(self) -> None:
|
||||||
|
"""Monitor node health and publish alerts at 20Hz."""
|
||||||
|
health = self._check_node_health()
|
||||||
|
|
||||||
|
# Detect if motor driver is in critical state (down for >2s)
|
||||||
|
motor_driver_health = health.get("motor_driver", {})
|
||||||
|
if motor_driver_health.get("critical", False):
|
||||||
|
self.motor_driver_down = True
|
||||||
|
self.get_logger().warn(
|
||||||
|
f"MOTOR DRIVER DOWN >2s ({motor_driver_health['elapsed_s']:.1f}s). "
|
||||||
|
"Applying safety fallback: zeroing cmd_vel."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Determine any nodes down for >1s
|
||||||
|
nodes_with_timeout = {
|
||||||
|
name: status
|
||||||
|
for name, status in health.items()
|
||||||
|
if status.get("status") == "down"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Publish watchdog status
|
||||||
|
watchdog_status = {
|
||||||
|
"timestamp": self.get_clock().now().nanoseconds / 1e9,
|
||||||
|
"all_healthy": len(nodes_with_timeout) == 0
|
||||||
|
and not self.motor_driver_down,
|
||||||
|
"health": health,
|
||||||
|
"motor_driver_critical": self.motor_driver_down,
|
||||||
|
}
|
||||||
|
watchdog_msg = String(data=json.dumps(watchdog_status))
|
||||||
|
self.pub_watchdog.publish(watchdog_msg)
|
||||||
|
|
||||||
|
# Publish cmd_vel with safety checks
|
||||||
|
if self.last_cmd_vel is not None:
|
||||||
|
cmd_vel_safe = self._apply_safety_checks(self.last_cmd_vel)
|
||||||
|
self.pub_cmd_vel_safe.publish(cmd_vel_safe)
|
||||||
|
|
||||||
|
def _apply_safety_checks(self, cmd_vel: Twist) -> Twist:
|
||||||
|
"""Apply safety checks to cmd_vel based on system state.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cmd_vel: Original velocity command
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Twist: Potentially modified velocity command for safe operation.
|
||||||
|
"""
|
||||||
|
safe_cmd = Twist()
|
||||||
|
|
||||||
|
# If motor driver is critically down, zero all velocities
|
||||||
|
if self.motor_driver_down:
|
||||||
|
return safe_cmd
|
||||||
|
|
||||||
|
# Otherwise, pass through unchanged
|
||||||
|
safe_cmd.linear.x = cmd_vel.linear.x
|
||||||
|
safe_cmd.linear.y = cmd_vel.linear.y
|
||||||
|
safe_cmd.linear.z = cmd_vel.linear.z
|
||||||
|
safe_cmd.angular.x = cmd_vel.angular.x
|
||||||
|
safe_cmd.angular.y = cmd_vel.angular.y
|
||||||
|
safe_cmd.angular.z = cmd_vel.angular.z
|
||||||
|
return safe_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def main(args=None):
|
||||||
|
rclpy.init(args=args)
|
||||||
|
node = NodeWatchdogNode()
|
||||||
|
try:
|
||||||
|
rclpy.spin(node)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
node.destroy_node()
|
||||||
|
rclpy.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
4
jetson/ros2_ws/src/saltybot_node_watchdog/setup.cfg
Normal file
4
jetson/ros2_ws/src/saltybot_node_watchdog/setup.cfg
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
[develop]
|
||||||
|
script_dir=$base/lib/saltybot_node_watchdog
|
||||||
|
[egg_info]
|
||||||
|
tag_date = 0
|
||||||
29
jetson/ros2_ws/src/saltybot_node_watchdog/setup.py
Normal file
29
jetson/ros2_ws/src/saltybot_node_watchdog/setup.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
from setuptools import setup
|
||||||
|
|
||||||
|
package_name = "saltybot_node_watchdog"
|
||||||
|
|
||||||
|
setup(
|
||||||
|
name=package_name,
|
||||||
|
version="0.1.0",
|
||||||
|
packages=[package_name],
|
||||||
|
data_files=[
|
||||||
|
("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
|
||||||
|
(f"share/{package_name}", ["package.xml"]),
|
||||||
|
(f"share/{package_name}/launch", ["launch/node_watchdog.launch.py"]),
|
||||||
|
(f"share/{package_name}/config", ["config/watchdog_config.yaml"]),
|
||||||
|
],
|
||||||
|
install_requires=["setuptools"],
|
||||||
|
zip_safe=True,
|
||||||
|
maintainer="sl-controls",
|
||||||
|
maintainer_email="sl-controls@saltylab.local",
|
||||||
|
description=(
|
||||||
|
"Node watchdog: heartbeat monitoring with safety fallback for critical systems"
|
||||||
|
),
|
||||||
|
license="MIT",
|
||||||
|
tests_require=["pytest"],
|
||||||
|
entry_points={
|
||||||
|
"console_scripts": [
|
||||||
|
"node_watchdog_node = saltybot_node_watchdog.node_watchdog_node:main",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
)
|
||||||
@ -0,0 +1,329 @@
|
|||||||
|
"""Unit tests for node_watchdog_node."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from geometry_msgs.msg import Twist
|
||||||
|
from std_msgs.msg import UInt32, String
|
||||||
|
|
||||||
|
import rclpy
|
||||||
|
from rclpy.time import Time
|
||||||
|
|
||||||
|
# Import the node under test
|
||||||
|
from saltybot_node_watchdog.node_watchdog_node import NodeWatchdogNode
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def rclpy_fixture():
|
||||||
|
"""Initialize and cleanup rclpy."""
|
||||||
|
rclpy.init()
|
||||||
|
yield
|
||||||
|
rclpy.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def node(rclpy_fixture):
|
||||||
|
"""Create a watchdog node instance."""
|
||||||
|
node = NodeWatchdogNode()
|
||||||
|
yield node
|
||||||
|
node.destroy_node()
|
||||||
|
|
||||||
|
|
||||||
|
class TestNodeWatchdogNode:
|
||||||
|
"""Test suite for NodeWatchdogNode."""
|
||||||
|
|
||||||
|
def test_node_initialization(self, node):
|
||||||
|
"""Test that node initializes with correct defaults."""
|
||||||
|
assert node.heartbeat_timeout == 1.0
|
||||||
|
assert node.motor_driver_critical_timeout == 2.0
|
||||||
|
assert node.last_cmd_vel is None
|
||||||
|
assert node.motor_driver_down is False
|
||||||
|
|
||||||
|
# All heartbeat times should be None initially
|
||||||
|
for node_name in node.critical_nodes:
|
||||||
|
assert node.last_heartbeat_time[node_name] is None
|
||||||
|
|
||||||
|
def test_balance_heartbeat_received(self, node):
|
||||||
|
"""Test balance node heartbeat recording."""
|
||||||
|
msg = UInt32(data=1)
|
||||||
|
node._on_balance_heartbeat(msg)
|
||||||
|
assert node.last_heartbeat_time["balance"] is not None
|
||||||
|
|
||||||
|
def test_motor_driver_heartbeat_received(self, node):
|
||||||
|
"""Test motor driver heartbeat recording."""
|
||||||
|
msg = UInt32(data=1)
|
||||||
|
node._on_motor_driver_heartbeat(msg)
|
||||||
|
assert node.last_heartbeat_time["motor_driver"] is not None
|
||||||
|
# Motor driver heartbeat should clear the down flag
|
||||||
|
node.motor_driver_down = True
|
||||||
|
node._on_motor_driver_heartbeat(msg)
|
||||||
|
assert node.motor_driver_down is False
|
||||||
|
|
||||||
|
def test_emergency_heartbeat_received(self, node):
|
||||||
|
"""Test emergency system heartbeat recording."""
|
||||||
|
msg = UInt32(data=1)
|
||||||
|
node._on_emergency_heartbeat(msg)
|
||||||
|
assert node.last_heartbeat_time["emergency"] is not None
|
||||||
|
|
||||||
|
def test_docking_heartbeat_received(self, node):
|
||||||
|
"""Test docking node heartbeat recording."""
|
||||||
|
msg = UInt32(data=1)
|
||||||
|
node._on_docking_heartbeat(msg)
|
||||||
|
assert node.last_heartbeat_time["docking"] is not None
|
||||||
|
|
||||||
|
def test_cmd_vel_caching(self, node):
|
||||||
|
"""Test that cmd_vel messages are cached."""
|
||||||
|
msg = Twist()
|
||||||
|
msg.linear.x = 1.0
|
||||||
|
node._on_cmd_vel(msg)
|
||||||
|
assert node.last_cmd_vel is not None
|
||||||
|
assert node.last_cmd_vel.linear.x == 1.0
|
||||||
|
|
||||||
|
def test_health_check_all_unknown(self, node):
|
||||||
|
"""Test health check when no heartbeats received."""
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
assert len(health) == 4
|
||||||
|
for node_name in node.critical_nodes:
|
||||||
|
assert health[node_name]["status"] == "unknown"
|
||||||
|
assert health[node_name]["elapsed_s"] is None
|
||||||
|
assert health[node_name]["timeout_s"] == 1.0
|
||||||
|
|
||||||
|
def test_health_check_just_received(self, node):
|
||||||
|
"""Test health check just after heartbeat received."""
|
||||||
|
# Record a heartbeat for balance node
|
||||||
|
node.last_heartbeat_time["balance"] = node.get_clock().now()
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
# Balance should be up, others unknown
|
||||||
|
assert health["balance"]["status"] == "up"
|
||||||
|
assert health["balance"]["elapsed_s"] < 0.1
|
||||||
|
assert health["emergency"]["status"] == "unknown"
|
||||||
|
|
||||||
|
def test_health_check_timeout_general(self, node):
|
||||||
|
"""Test that heartbeat timeout is detected (>1s)."""
|
||||||
|
# Simulate a heartbeat that arrived >1s ago
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old_time = Time(
|
||||||
|
nanoseconds=now.nanoseconds - int(1.5 * 1e9)
|
||||||
|
) # 1.5 seconds ago
|
||||||
|
node.last_heartbeat_time["balance"] = old_time
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
assert health["balance"]["status"] == "down"
|
||||||
|
assert health["balance"]["elapsed_s"] > 1.4
|
||||||
|
assert health["balance"]["elapsed_s"] < 2.0
|
||||||
|
|
||||||
|
def test_health_check_motor_driver_critical(self, node):
|
||||||
|
"""Test motor driver critical timeout (>2s)."""
|
||||||
|
# Simulate motor driver heartbeat >2s ago
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old_time = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9)) # 2.5 seconds
|
||||||
|
node.last_heartbeat_time["motor_driver"] = old_time
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
motor_health = health["motor_driver"]
|
||||||
|
assert motor_health["status"] == "down"
|
||||||
|
assert motor_health.get("critical", False) is True
|
||||||
|
assert motor_health["elapsed_s"] > 2.4
|
||||||
|
|
||||||
|
def test_safety_check_normal_operation(self, node):
|
||||||
|
"""Test safety check passes through cmd_vel normally."""
|
||||||
|
node.motor_driver_down = False
|
||||||
|
cmd = Twist()
|
||||||
|
cmd.linear.x = 1.5
|
||||||
|
cmd.angular.z = 0.3
|
||||||
|
|
||||||
|
safe_cmd = node._apply_safety_checks(cmd)
|
||||||
|
|
||||||
|
assert abs(safe_cmd.linear.x - 1.5) < 1e-6
|
||||||
|
assert abs(safe_cmd.angular.z - 0.3) < 1e-6
|
||||||
|
|
||||||
|
def test_safety_check_motor_driver_down(self, node):
|
||||||
|
"""Test safety check zeros cmd_vel when motor driver is down."""
|
||||||
|
node.motor_driver_down = True
|
||||||
|
cmd = Twist()
|
||||||
|
cmd.linear.x = 1.5
|
||||||
|
cmd.linear.y = 0.2
|
||||||
|
cmd.angular.z = 0.3
|
||||||
|
|
||||||
|
safe_cmd = node._apply_safety_checks(cmd)
|
||||||
|
|
||||||
|
# All velocities should be zero
|
||||||
|
assert safe_cmd.linear.x == 0.0
|
||||||
|
assert safe_cmd.linear.y == 0.0
|
||||||
|
assert safe_cmd.linear.z == 0.0
|
||||||
|
assert safe_cmd.angular.x == 0.0
|
||||||
|
assert safe_cmd.angular.y == 0.0
|
||||||
|
assert safe_cmd.angular.z == 0.0
|
||||||
|
|
||||||
|
def test_timer_callback_publishes(self, node):
|
||||||
|
"""Test that timer callback publishes watchdog status."""
|
||||||
|
# Record a heartbeat
|
||||||
|
node.last_heartbeat_time["balance"] = node.get_clock().now()
|
||||||
|
node.last_cmd_vel = Twist()
|
||||||
|
node.last_cmd_vel.linear.x = 1.0
|
||||||
|
|
||||||
|
# Call timer callback
|
||||||
|
node._timer_callback()
|
||||||
|
# Just verify it doesn't crash; actual publishing is tested via integration
|
||||||
|
|
||||||
|
def test_watchdog_status_json_all_healthy(self, node):
|
||||||
|
"""Test watchdog status JSON when all nodes healthy."""
|
||||||
|
# Record all heartbeats
|
||||||
|
now = node.get_clock().now()
|
||||||
|
for node_name in node.critical_nodes:
|
||||||
|
node.last_heartbeat_time[node_name] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
watchdog_status = {
|
||||||
|
"timestamp": now.nanoseconds / 1e9,
|
||||||
|
"all_healthy": all(
|
||||||
|
s["status"] == "up" for s in health.values()
|
||||||
|
),
|
||||||
|
"health": health,
|
||||||
|
"motor_driver_critical": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Verify it's valid JSON
|
||||||
|
json_str = json.dumps(watchdog_status)
|
||||||
|
parsed = json.loads(json_str)
|
||||||
|
|
||||||
|
assert parsed["all_healthy"] is True
|
||||||
|
assert parsed["motor_driver_critical"] is False
|
||||||
|
|
||||||
|
def test_watchdog_status_json_with_timeout(self, node):
|
||||||
|
"""Test watchdog status JSON when node has timed out."""
|
||||||
|
# Balance heartbeat >1s ago
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old_time = Time(nanoseconds=now.nanoseconds - int(1.5 * 1e9))
|
||||||
|
node.last_heartbeat_time["balance"] = old_time
|
||||||
|
|
||||||
|
# Others are current
|
||||||
|
for name in ["motor_driver", "emergency", "docking"]:
|
||||||
|
node.last_heartbeat_time[name] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
watchdog_status = {
|
||||||
|
"timestamp": now.nanoseconds / 1e9,
|
||||||
|
"all_healthy": all(s["status"] == "up" for s in health.values()),
|
||||||
|
"health": health,
|
||||||
|
"motor_driver_critical": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
json_str = json.dumps(watchdog_status)
|
||||||
|
parsed = json.loads(json_str)
|
||||||
|
|
||||||
|
assert parsed["all_healthy"] is False
|
||||||
|
assert parsed["health"]["balance"]["status"] == "down"
|
||||||
|
|
||||||
|
|
||||||
|
class TestNodeWatchdogScenarios:
|
||||||
|
"""Integration-style tests for realistic scenarios."""
|
||||||
|
|
||||||
|
def test_scenario_all_nodes_healthy(self, node):
|
||||||
|
"""Scenario: all critical nodes sending heartbeats."""
|
||||||
|
now = node.get_clock().now()
|
||||||
|
|
||||||
|
# All nodes sending heartbeats
|
||||||
|
for name in node.critical_nodes:
|
||||||
|
node.last_heartbeat_time[name] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
all_up = all(h["status"] == "up" for h in health.values())
|
||||||
|
assert all_up is True
|
||||||
|
|
||||||
|
def test_scenario_motor_driver_loss_below_critical(self, node):
|
||||||
|
"""Scenario: motor driver offline 1.5s (below 2s critical)."""
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old_time = Time(nanoseconds=now.nanoseconds - int(1.5 * 1e9))
|
||||||
|
|
||||||
|
# Motor driver down 1.5s, others healthy
|
||||||
|
node.last_heartbeat_time["motor_driver"] = old_time
|
||||||
|
for name in ["balance", "emergency", "docking"]:
|
||||||
|
node.last_heartbeat_time[name] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
motor = health["motor_driver"]
|
||||||
|
|
||||||
|
assert motor["status"] == "down"
|
||||||
|
assert motor.get("critical", False) is False
|
||||||
|
# Safety fallback should NOT be triggered yet
|
||||||
|
assert node.motor_driver_down is False
|
||||||
|
|
||||||
|
def test_scenario_motor_driver_critical_loss(self, node):
|
||||||
|
"""Scenario: motor driver offline >2s (triggers critical)."""
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old_time = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9))
|
||||||
|
|
||||||
|
node.last_heartbeat_time["motor_driver"] = old_time
|
||||||
|
node.last_heartbeat_time["balance"] = now
|
||||||
|
node.last_heartbeat_time["emergency"] = now
|
||||||
|
node.last_heartbeat_time["docking"] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
motor = health["motor_driver"]
|
||||||
|
|
||||||
|
assert motor["status"] == "down"
|
||||||
|
assert motor.get("critical", False) is True
|
||||||
|
|
||||||
|
def test_scenario_cascading_node_failures(self, node):
|
||||||
|
"""Scenario: multiple nodes failing in sequence."""
|
||||||
|
now = node.get_clock().now()
|
||||||
|
old1s = Time(nanoseconds=now.nanoseconds - int(1.2 * 1e9))
|
||||||
|
old2s = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9))
|
||||||
|
|
||||||
|
# Balance down 1.2s, motor driver down 2.5s, others healthy
|
||||||
|
node.last_heartbeat_time["balance"] = old1s
|
||||||
|
node.last_heartbeat_time["motor_driver"] = old2s
|
||||||
|
node.last_heartbeat_time["emergency"] = now
|
||||||
|
node.last_heartbeat_time["docking"] = now
|
||||||
|
|
||||||
|
health = node._check_node_health()
|
||||||
|
|
||||||
|
assert health["balance"]["status"] == "down"
|
||||||
|
assert health["balance"].get("critical", False) is False
|
||||||
|
assert health["motor_driver"]["status"] == "down"
|
||||||
|
assert health["motor_driver"].get("critical", False) is True
|
||||||
|
|
||||||
|
def test_scenario_cmd_vel_safety_fallback(self, node):
|
||||||
|
"""Scenario: motor driver down triggers safety zeroing of cmd_vel."""
|
||||||
|
# Motor driver is critically down
|
||||||
|
node.motor_driver_down = True
|
||||||
|
|
||||||
|
cmd = Twist()
|
||||||
|
cmd.linear.x = 2.0
|
||||||
|
cmd.angular.z = 0.5
|
||||||
|
|
||||||
|
safe_cmd = node._apply_safety_checks(cmd)
|
||||||
|
|
||||||
|
# All should be zeroed
|
||||||
|
assert safe_cmd.linear.x == 0.0
|
||||||
|
assert safe_cmd.linear.y == 0.0
|
||||||
|
assert safe_cmd.linear.z == 0.0
|
||||||
|
assert safe_cmd.angular.x == 0.0
|
||||||
|
assert safe_cmd.angular.y == 0.0
|
||||||
|
assert safe_cmd.angular.z == 0.0
|
||||||
|
|
||||||
|
def test_scenario_motor_driver_recovery(self, node):
|
||||||
|
"""Scenario: motor driver comes back online after being down."""
|
||||||
|
now = node.get_clock().now()
|
||||||
|
|
||||||
|
# Motor driver was down
|
||||||
|
node.motor_driver_down = True
|
||||||
|
|
||||||
|
# Motor driver sends heartbeat
|
||||||
|
node._on_motor_driver_heartbeat(UInt32(data=1))
|
||||||
|
|
||||||
|
# Should clear the down flag
|
||||||
|
assert node.motor_driver_down is False
|
||||||
|
|
||||||
|
# cmd_vel should pass through
|
||||||
|
cmd = Twist()
|
||||||
|
cmd.linear.x = 1.0
|
||||||
|
safe_cmd = node._apply_safety_checks(cmd)
|
||||||
|
assert safe_cmd.linear.x == 1.0
|
||||||
Loading…
x
Reference in New Issue
Block a user