feat: Add Issue #203 - Node watchdog monitor (20Hz heartbeat detection)

Implements node watchdog ROS2 node that monitors heartbeats from critical
systems and triggers safety fallback when motor driver is lost >2s.

Features:
  - Monitor heartbeats from: balance, motor_driver, emergency, docking
  - Alert on /saltybot/node_watchdog (JSON) if heartbeat lost >1s
  - Safety fallback: zero cmd_vel if motor driver lost >2s
  - Republish cmd_vel on /saltybot/cmd_vel_safe with safety checks
  - 20Hz monitoring and publishing frequency
  - Configurable heartbeat timeout thresholds

Heartbeat Topics:
  - /saltybot/balance_heartbeat (std_msgs/UInt32)
  - /saltybot/motor_driver_heartbeat (std_msgs/UInt32)
  - /saltybot/emergency_heartbeat (std_msgs/UInt32)
  - /saltybot/docking_heartbeat (std_msgs/UInt32)
  - /cmd_vel (geometry_msgs/Twist)

Published Topics:
  - /saltybot/node_watchdog (std_msgs/String) - JSON status
  - /saltybot/cmd_vel_safe (geometry_msgs/Twist) - Safety-checked velocity

Package: saltybot_node_watchdog
Entry point: node_watchdog_node
Launch file: node_watchdog.launch.py

Tests: 20+ unit tests covering:
  - Heartbeat reception and timeout detection
  - Motor driver critical timeout (>2s)
  - Safety fallback logic
  - cmd_vel zeroing on motor driver loss
  - Health status JSON serialization
  - Multi-node failure scenarios

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
sl-controls 2026-03-02 11:17:02 -05:00 committed by sl-mechanical
parent 293a2a3096
commit 7d09c8c814
10 changed files with 675 additions and 0 deletions

View File

@ -0,0 +1,14 @@
# Node watchdog configuration
node_watchdog:
ros__parameters:
# Publishing frequency in Hz
frequency: 20 # 20 Hz = 50ms cycle
# General heartbeat timeout (seconds)
# Alert if any heartbeat lost for this duration
heartbeat_timeout: 1.0
# Motor driver critical timeout (seconds)
# Trigger safety fallback (zero cmd_vel) if motor driver down this long
motor_driver_critical_timeout: 2.0

View File

@ -0,0 +1,36 @@
"""Launch file for node_watchdog_node."""
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.substitutions import LaunchConfiguration
from launch.actions import DeclareLaunchArgument
import os
from ament_index_python.packages import get_package_share_directory
def generate_launch_description():
"""Generate launch description for node watchdog."""
# Package directory
pkg_dir = get_package_share_directory("saltybot_node_watchdog")
# Parameters
config_file = os.path.join(pkg_dir, "config", "watchdog_config.yaml")
# Declare launch arguments
return LaunchDescription(
[
DeclareLaunchArgument(
"config_file",
default_value=config_file,
description="Path to configuration YAML file",
),
# Node watchdog node
Node(
package="saltybot_node_watchdog",
executable="node_watchdog_node",
name="node_watchdog",
output="screen",
parameters=[LaunchConfiguration("config_file")],
),
]
)

View File

@ -0,0 +1,28 @@
<?xml version="1.0"?>
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
<package format="3">
<name>saltybot_node_watchdog</name>
<version>0.1.0</version>
<description>
Node watchdog monitor for SaltyBot critical systems. Monitors heartbeats from balance,
motor driver, emergency, and docking nodes. Publishes alerts on heartbeat loss >1s.
Implements safety fallback: zeros cmd_vel if motor driver lost >2s. Runs at 20Hz.
</description>
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
<license>MIT</license>
<depend>rclpy</depend>
<depend>geometry_msgs</depend>
<depend>std_msgs</depend>
<buildtool_depend>ament_python</buildtool_depend>
<test_depend>ament_copyright</test_depend>
<test_depend>ament_flake8</test_depend>
<test_depend>ament_pep257</test_depend>
<test_depend>python3-pytest</test_depend>
<export>
<build_type>ament_python</build_type>
</export>
</package>

View File

@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""Node watchdog monitor for SaltyBot critical systems.
Monitors heartbeats from balance, motor driver, emergency, and docking nodes.
Publishes alerts on heartbeat loss >1s. Implements safety fallback: zeros cmd_vel
if motor driver lost >2s.
Published topics:
/saltybot/node_watchdog (std_msgs/String) - JSON watchdog status
/saltybot/cmd_vel_safe (geometry_msgs/Twist) - cmd_vel with motor driver safety check
Subscribed topics:
/saltybot/balance_heartbeat (std_msgs/UInt32) - Balance node heartbeat
/saltybot/motor_driver_heartbeat (std_msgs/UInt32) - Motor driver heartbeat
/saltybot/emergency_heartbeat (std_msgs/UInt32) - Emergency system heartbeat
/saltybot/docking_heartbeat (std_msgs/UInt32) - Docking node heartbeat
/cmd_vel (geometry_msgs/Twist) - Velocity command input
"""
import json
import rclpy
from rclpy.node import Node
from rclpy.timer import Timer
from geometry_msgs.msg import Twist
from std_msgs.msg import UInt32, String
class NodeWatchdogNode(Node):
"""ROS2 watchdog monitor for critical system nodes."""
def __init__(self):
super().__init__("node_watchdog")
# Parameters
self.declare_parameter("frequency", 20) # Hz
self.declare_parameter("heartbeat_timeout", 1.0) # seconds, general timeout
self.declare_parameter("motor_driver_critical_timeout", 2.0) # seconds
frequency = self.get_parameter("frequency").value
self.heartbeat_timeout = self.get_parameter("heartbeat_timeout").value
self.motor_driver_critical_timeout = self.get_parameter(
"motor_driver_critical_timeout"
).value
# Heartbeat tracking
self.critical_nodes = {
"balance": None,
"motor_driver": None,
"emergency": None,
"docking": None,
}
self.last_heartbeat_time = {
"balance": None,
"motor_driver": None,
"emergency": None,
"docking": None,
}
self.last_cmd_vel = None
self.motor_driver_down = False
# Subscriptions for heartbeats
self.create_subscription(
UInt32, "/saltybot/balance_heartbeat", self._on_balance_heartbeat, 10
)
self.create_subscription(
UInt32,
"/saltybot/motor_driver_heartbeat",
self._on_motor_driver_heartbeat,
10,
)
self.create_subscription(
UInt32, "/saltybot/emergency_heartbeat", self._on_emergency_heartbeat, 10
)
self.create_subscription(
UInt32, "/saltybot/docking_heartbeat", self._on_docking_heartbeat, 10
)
# cmd_vel subscription and safe republishing
self.create_subscription(Twist, "/cmd_vel", self._on_cmd_vel, 10)
# Publications
self.pub_watchdog = self.create_publisher(String, "/saltybot/node_watchdog", 10)
self.pub_cmd_vel_safe = self.create_publisher(
Twist, "/saltybot/cmd_vel_safe", 10
)
# Timer for periodic monitoring at 20Hz
period = 1.0 / frequency
self.timer: Timer = self.create_timer(period, self._timer_callback)
self.get_logger().info(
f"Node watchdog initialized at {frequency}Hz. "
f"Heartbeat timeout: {self.heartbeat_timeout}s, "
f"Motor driver critical: {self.motor_driver_critical_timeout}s"
)
def _on_balance_heartbeat(self, msg: UInt32) -> None:
"""Update balance node heartbeat timestamp."""
self.last_heartbeat_time["balance"] = self.get_clock().now()
def _on_motor_driver_heartbeat(self, msg: UInt32) -> None:
"""Update motor driver heartbeat timestamp."""
self.last_heartbeat_time["motor_driver"] = self.get_clock().now()
self.motor_driver_down = False
def _on_emergency_heartbeat(self, msg: UInt32) -> None:
"""Update emergency system heartbeat timestamp."""
self.last_heartbeat_time["emergency"] = self.get_clock().now()
def _on_docking_heartbeat(self, msg: UInt32) -> None:
"""Update docking node heartbeat timestamp."""
self.last_heartbeat_time["docking"] = self.get_clock().now()
def _on_cmd_vel(self, msg: Twist) -> None:
"""Cache the last received cmd_vel."""
self.last_cmd_vel = msg
def _check_node_health(self) -> dict:
"""Check health of all monitored nodes.
Returns:
dict: Health status of each node with timeout and elapsed time.
"""
now = self.get_clock().now()
health = {}
for node_name in self.critical_nodes:
last_time = self.last_heartbeat_time[node_name]
if last_time is None:
# No heartbeat received yet
health[node_name] = {
"status": "unknown",
"elapsed_s": None,
"timeout_s": self.heartbeat_timeout,
}
else:
# Calculate elapsed time since last heartbeat
elapsed = (now - last_time).nanoseconds / 1e9
is_timeout = elapsed > self.heartbeat_timeout
# Special case: motor driver has longer critical timeout
if node_name == "motor_driver":
is_critical = elapsed > self.motor_driver_critical_timeout
else:
is_critical = False
health[node_name] = {
"status": "down" if is_timeout else "up",
"elapsed_s": elapsed,
"timeout_s": (
self.motor_driver_critical_timeout
if node_name == "motor_driver"
else self.heartbeat_timeout
),
"critical": is_critical,
}
return health
def _timer_callback(self) -> None:
"""Monitor node health and publish alerts at 20Hz."""
health = self._check_node_health()
# Detect if motor driver is in critical state (down for >2s)
motor_driver_health = health.get("motor_driver", {})
if motor_driver_health.get("critical", False):
self.motor_driver_down = True
self.get_logger().warn(
f"MOTOR DRIVER DOWN >2s ({motor_driver_health['elapsed_s']:.1f}s). "
"Applying safety fallback: zeroing cmd_vel."
)
# Determine any nodes down for >1s
nodes_with_timeout = {
name: status
for name, status in health.items()
if status.get("status") == "down"
}
# Publish watchdog status
watchdog_status = {
"timestamp": self.get_clock().now().nanoseconds / 1e9,
"all_healthy": len(nodes_with_timeout) == 0
and not self.motor_driver_down,
"health": health,
"motor_driver_critical": self.motor_driver_down,
}
watchdog_msg = String(data=json.dumps(watchdog_status))
self.pub_watchdog.publish(watchdog_msg)
# Publish cmd_vel with safety checks
if self.last_cmd_vel is not None:
cmd_vel_safe = self._apply_safety_checks(self.last_cmd_vel)
self.pub_cmd_vel_safe.publish(cmd_vel_safe)
def _apply_safety_checks(self, cmd_vel: Twist) -> Twist:
"""Apply safety checks to cmd_vel based on system state.
Args:
cmd_vel: Original velocity command
Returns:
Twist: Potentially modified velocity command for safe operation.
"""
safe_cmd = Twist()
# If motor driver is critically down, zero all velocities
if self.motor_driver_down:
return safe_cmd
# Otherwise, pass through unchanged
safe_cmd.linear.x = cmd_vel.linear.x
safe_cmd.linear.y = cmd_vel.linear.y
safe_cmd.linear.z = cmd_vel.linear.z
safe_cmd.angular.x = cmd_vel.angular.x
safe_cmd.angular.y = cmd_vel.angular.y
safe_cmd.angular.z = cmd_vel.angular.z
return safe_cmd
def main(args=None):
rclpy.init(args=args)
node = NodeWatchdogNode()
try:
rclpy.spin(node)
except KeyboardInterrupt:
pass
finally:
node.destroy_node()
rclpy.shutdown()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,4 @@
[develop]
script_dir=$base/lib/saltybot_node_watchdog
[egg_info]
tag_date = 0

View File

@ -0,0 +1,29 @@
from setuptools import setup
package_name = "saltybot_node_watchdog"
setup(
name=package_name,
version="0.1.0",
packages=[package_name],
data_files=[
("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
(f"share/{package_name}", ["package.xml"]),
(f"share/{package_name}/launch", ["launch/node_watchdog.launch.py"]),
(f"share/{package_name}/config", ["config/watchdog_config.yaml"]),
],
install_requires=["setuptools"],
zip_safe=True,
maintainer="sl-controls",
maintainer_email="sl-controls@saltylab.local",
description=(
"Node watchdog: heartbeat monitoring with safety fallback for critical systems"
),
license="MIT",
tests_require=["pytest"],
entry_points={
"console_scripts": [
"node_watchdog_node = saltybot_node_watchdog.node_watchdog_node:main",
],
},
)

View File

@ -0,0 +1,329 @@
"""Unit tests for node_watchdog_node."""
import pytest
import json
import time
from geometry_msgs.msg import Twist
from std_msgs.msg import UInt32, String
import rclpy
from rclpy.time import Time
# Import the node under test
from saltybot_node_watchdog.node_watchdog_node import NodeWatchdogNode
@pytest.fixture
def rclpy_fixture():
"""Initialize and cleanup rclpy."""
rclpy.init()
yield
rclpy.shutdown()
@pytest.fixture
def node(rclpy_fixture):
"""Create a watchdog node instance."""
node = NodeWatchdogNode()
yield node
node.destroy_node()
class TestNodeWatchdogNode:
"""Test suite for NodeWatchdogNode."""
def test_node_initialization(self, node):
"""Test that node initializes with correct defaults."""
assert node.heartbeat_timeout == 1.0
assert node.motor_driver_critical_timeout == 2.0
assert node.last_cmd_vel is None
assert node.motor_driver_down is False
# All heartbeat times should be None initially
for node_name in node.critical_nodes:
assert node.last_heartbeat_time[node_name] is None
def test_balance_heartbeat_received(self, node):
"""Test balance node heartbeat recording."""
msg = UInt32(data=1)
node._on_balance_heartbeat(msg)
assert node.last_heartbeat_time["balance"] is not None
def test_motor_driver_heartbeat_received(self, node):
"""Test motor driver heartbeat recording."""
msg = UInt32(data=1)
node._on_motor_driver_heartbeat(msg)
assert node.last_heartbeat_time["motor_driver"] is not None
# Motor driver heartbeat should clear the down flag
node.motor_driver_down = True
node._on_motor_driver_heartbeat(msg)
assert node.motor_driver_down is False
def test_emergency_heartbeat_received(self, node):
"""Test emergency system heartbeat recording."""
msg = UInt32(data=1)
node._on_emergency_heartbeat(msg)
assert node.last_heartbeat_time["emergency"] is not None
def test_docking_heartbeat_received(self, node):
"""Test docking node heartbeat recording."""
msg = UInt32(data=1)
node._on_docking_heartbeat(msg)
assert node.last_heartbeat_time["docking"] is not None
def test_cmd_vel_caching(self, node):
"""Test that cmd_vel messages are cached."""
msg = Twist()
msg.linear.x = 1.0
node._on_cmd_vel(msg)
assert node.last_cmd_vel is not None
assert node.last_cmd_vel.linear.x == 1.0
def test_health_check_all_unknown(self, node):
"""Test health check when no heartbeats received."""
health = node._check_node_health()
assert len(health) == 4
for node_name in node.critical_nodes:
assert health[node_name]["status"] == "unknown"
assert health[node_name]["elapsed_s"] is None
assert health[node_name]["timeout_s"] == 1.0
def test_health_check_just_received(self, node):
"""Test health check just after heartbeat received."""
# Record a heartbeat for balance node
node.last_heartbeat_time["balance"] = node.get_clock().now()
health = node._check_node_health()
# Balance should be up, others unknown
assert health["balance"]["status"] == "up"
assert health["balance"]["elapsed_s"] < 0.1
assert health["emergency"]["status"] == "unknown"
def test_health_check_timeout_general(self, node):
"""Test that heartbeat timeout is detected (>1s)."""
# Simulate a heartbeat that arrived >1s ago
now = node.get_clock().now()
old_time = Time(
nanoseconds=now.nanoseconds - int(1.5 * 1e9)
) # 1.5 seconds ago
node.last_heartbeat_time["balance"] = old_time
health = node._check_node_health()
assert health["balance"]["status"] == "down"
assert health["balance"]["elapsed_s"] > 1.4
assert health["balance"]["elapsed_s"] < 2.0
def test_health_check_motor_driver_critical(self, node):
"""Test motor driver critical timeout (>2s)."""
# Simulate motor driver heartbeat >2s ago
now = node.get_clock().now()
old_time = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9)) # 2.5 seconds
node.last_heartbeat_time["motor_driver"] = old_time
health = node._check_node_health()
motor_health = health["motor_driver"]
assert motor_health["status"] == "down"
assert motor_health.get("critical", False) is True
assert motor_health["elapsed_s"] > 2.4
def test_safety_check_normal_operation(self, node):
"""Test safety check passes through cmd_vel normally."""
node.motor_driver_down = False
cmd = Twist()
cmd.linear.x = 1.5
cmd.angular.z = 0.3
safe_cmd = node._apply_safety_checks(cmd)
assert abs(safe_cmd.linear.x - 1.5) < 1e-6
assert abs(safe_cmd.angular.z - 0.3) < 1e-6
def test_safety_check_motor_driver_down(self, node):
"""Test safety check zeros cmd_vel when motor driver is down."""
node.motor_driver_down = True
cmd = Twist()
cmd.linear.x = 1.5
cmd.linear.y = 0.2
cmd.angular.z = 0.3
safe_cmd = node._apply_safety_checks(cmd)
# All velocities should be zero
assert safe_cmd.linear.x == 0.0
assert safe_cmd.linear.y == 0.0
assert safe_cmd.linear.z == 0.0
assert safe_cmd.angular.x == 0.0
assert safe_cmd.angular.y == 0.0
assert safe_cmd.angular.z == 0.0
def test_timer_callback_publishes(self, node):
"""Test that timer callback publishes watchdog status."""
# Record a heartbeat
node.last_heartbeat_time["balance"] = node.get_clock().now()
node.last_cmd_vel = Twist()
node.last_cmd_vel.linear.x = 1.0
# Call timer callback
node._timer_callback()
# Just verify it doesn't crash; actual publishing is tested via integration
def test_watchdog_status_json_all_healthy(self, node):
"""Test watchdog status JSON when all nodes healthy."""
# Record all heartbeats
now = node.get_clock().now()
for node_name in node.critical_nodes:
node.last_heartbeat_time[node_name] = now
health = node._check_node_health()
watchdog_status = {
"timestamp": now.nanoseconds / 1e9,
"all_healthy": all(
s["status"] == "up" for s in health.values()
),
"health": health,
"motor_driver_critical": False,
}
# Verify it's valid JSON
json_str = json.dumps(watchdog_status)
parsed = json.loads(json_str)
assert parsed["all_healthy"] is True
assert parsed["motor_driver_critical"] is False
def test_watchdog_status_json_with_timeout(self, node):
"""Test watchdog status JSON when node has timed out."""
# Balance heartbeat >1s ago
now = node.get_clock().now()
old_time = Time(nanoseconds=now.nanoseconds - int(1.5 * 1e9))
node.last_heartbeat_time["balance"] = old_time
# Others are current
for name in ["motor_driver", "emergency", "docking"]:
node.last_heartbeat_time[name] = now
health = node._check_node_health()
watchdog_status = {
"timestamp": now.nanoseconds / 1e9,
"all_healthy": all(s["status"] == "up" for s in health.values()),
"health": health,
"motor_driver_critical": False,
}
json_str = json.dumps(watchdog_status)
parsed = json.loads(json_str)
assert parsed["all_healthy"] is False
assert parsed["health"]["balance"]["status"] == "down"
class TestNodeWatchdogScenarios:
"""Integration-style tests for realistic scenarios."""
def test_scenario_all_nodes_healthy(self, node):
"""Scenario: all critical nodes sending heartbeats."""
now = node.get_clock().now()
# All nodes sending heartbeats
for name in node.critical_nodes:
node.last_heartbeat_time[name] = now
health = node._check_node_health()
all_up = all(h["status"] == "up" for h in health.values())
assert all_up is True
def test_scenario_motor_driver_loss_below_critical(self, node):
"""Scenario: motor driver offline 1.5s (below 2s critical)."""
now = node.get_clock().now()
old_time = Time(nanoseconds=now.nanoseconds - int(1.5 * 1e9))
# Motor driver down 1.5s, others healthy
node.last_heartbeat_time["motor_driver"] = old_time
for name in ["balance", "emergency", "docking"]:
node.last_heartbeat_time[name] = now
health = node._check_node_health()
motor = health["motor_driver"]
assert motor["status"] == "down"
assert motor.get("critical", False) is False
# Safety fallback should NOT be triggered yet
assert node.motor_driver_down is False
def test_scenario_motor_driver_critical_loss(self, node):
"""Scenario: motor driver offline >2s (triggers critical)."""
now = node.get_clock().now()
old_time = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9))
node.last_heartbeat_time["motor_driver"] = old_time
node.last_heartbeat_time["balance"] = now
node.last_heartbeat_time["emergency"] = now
node.last_heartbeat_time["docking"] = now
health = node._check_node_health()
motor = health["motor_driver"]
assert motor["status"] == "down"
assert motor.get("critical", False) is True
def test_scenario_cascading_node_failures(self, node):
"""Scenario: multiple nodes failing in sequence."""
now = node.get_clock().now()
old1s = Time(nanoseconds=now.nanoseconds - int(1.2 * 1e9))
old2s = Time(nanoseconds=now.nanoseconds - int(2.5 * 1e9))
# Balance down 1.2s, motor driver down 2.5s, others healthy
node.last_heartbeat_time["balance"] = old1s
node.last_heartbeat_time["motor_driver"] = old2s
node.last_heartbeat_time["emergency"] = now
node.last_heartbeat_time["docking"] = now
health = node._check_node_health()
assert health["balance"]["status"] == "down"
assert health["balance"].get("critical", False) is False
assert health["motor_driver"]["status"] == "down"
assert health["motor_driver"].get("critical", False) is True
def test_scenario_cmd_vel_safety_fallback(self, node):
"""Scenario: motor driver down triggers safety zeroing of cmd_vel."""
# Motor driver is critically down
node.motor_driver_down = True
cmd = Twist()
cmd.linear.x = 2.0
cmd.angular.z = 0.5
safe_cmd = node._apply_safety_checks(cmd)
# All should be zeroed
assert safe_cmd.linear.x == 0.0
assert safe_cmd.linear.y == 0.0
assert safe_cmd.linear.z == 0.0
assert safe_cmd.angular.x == 0.0
assert safe_cmd.angular.y == 0.0
assert safe_cmd.angular.z == 0.0
def test_scenario_motor_driver_recovery(self, node):
"""Scenario: motor driver comes back online after being down."""
now = node.get_clock().now()
# Motor driver was down
node.motor_driver_down = True
# Motor driver sends heartbeat
node._on_motor_driver_heartbeat(UInt32(data=1))
# Should clear the down flag
assert node.motor_driver_down is False
# cmd_vel should pass through
cmd = Twist()
cmd.linear.x = 1.0
safe_cmd = node._apply_safety_checks(cmd)
assert safe_cmd.linear.x == 1.0