diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/config/face_bridge_params.yaml b/jetson/ros2_ws/src/saltybot_face_bridge/config/face_bridge_params.yaml new file mode 100644 index 0000000..8fc3a7b --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/config/face_bridge_params.yaml @@ -0,0 +1,13 @@ +face_bridge: + # HTTP server endpoint for face display + face_server_url: "http://localhost:3000/face/{id}" # {id} replaced with expression ID + + # HTTP request settings + http_timeout: 2.0 # Request timeout in seconds + update_interval: 0.1 # Update check interval in seconds + + # State to expression mapping: + # 0 = Tracking (IDLE, THROTTLED) + # 1 = Alert (LISTENING, wake word) + # 3 = Searching (THINKING) + # 4 = Social (SPEAKING) diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/launch/face_bridge.launch.py b/jetson/ros2_ws/src/saltybot_face_bridge/launch/face_bridge.launch.py new file mode 100644 index 0000000..69681d1 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/launch/face_bridge.launch.py @@ -0,0 +1,40 @@ +"""Launch file for face display bridge node.""" + +from launch import LaunchDescription +from launch_ros.actions import Node +from launch.substitutions import LaunchConfiguration +from launch.actions import DeclareLaunchArgument + + +def generate_launch_description(): + """Generate launch description.""" + # Declare arguments + url_arg = DeclareLaunchArgument( + "face_server_url", + default_value="http://localhost:3000/face/{id}", + description="Face display server HTTP endpoint" + ) + timeout_arg = DeclareLaunchArgument( + "http_timeout", + default_value="2.0", + description="HTTP request timeout in seconds" + ) + + # Create node + face_bridge_node = Node( + package="saltybot_face_bridge", + executable="face_bridge_node", + name="face_bridge", + parameters=[ + {"face_server_url": LaunchConfiguration("face_server_url")}, + {"http_timeout": LaunchConfiguration("http_timeout")}, + {"update_interval": 0.1}, + ], + output="screen", + ) + + return LaunchDescription([ + url_arg, + timeout_arg, + face_bridge_node, + ]) diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/package.xml b/jetson/ros2_ws/src/saltybot_face_bridge/package.xml new file mode 100644 index 0000000..f9504d3 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/package.xml @@ -0,0 +1,26 @@ + + + + saltybot_face_bridge + 0.1.0 + + Face display bridge node for orchestrator state to face expression mapping. + Maps social/orchestrator state to face display WebSocket API. + + sl-controls + MIT + + rclpy + std_msgs + + ament_python + + ament_copyright + ament_flake8 + ament_pep257 + python3-pytest + + + ament_python + + diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/resource/saltybot_face_bridge b/jetson/ros2_ws/src/saltybot_face_bridge/resource/saltybot_face_bridge new file mode 100644 index 0000000..e69de29 diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/saltybot_face_bridge/__init__.py b/jetson/ros2_ws/src/saltybot_face_bridge/saltybot_face_bridge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/saltybot_face_bridge/face_bridge_node.py b/jetson/ros2_ws/src/saltybot_face_bridge/saltybot_face_bridge/face_bridge_node.py new file mode 100644 index 0000000..abf7e61 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/saltybot_face_bridge/face_bridge_node.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +"""Face display bridge node. + +Maps orchestrator state to face expressions via HTTP WebSocket API. +Bridges /social/orchestrator/state and /saltybot/wake_word_detected to +face display server (localhost:3000/face/{id}). + +State mapping: + IDLE → 0 (Tracking) + LISTENING → 1 (Alert) + THINKING → 3 (Searching) + SPEAKING → 4 (Social) + Wake word → 1 (Alert) [immediate override] + +Subscribed topics: + /social/orchestrator/state (String) - JSON: {"state": "IDLE|LISTENING|THINKING|SPEAKING|THROTTLED"} + /saltybot/wake_word_detected (Bool) - Wake word detection trigger + +Published topics: + /face/state (String) - Current face expression ID and status +""" + +import json +import threading +import time +from typing import Optional + +import rclpy +from rclpy.node import Node +from std_msgs.msg import String, Bool + + +class FaceDisplayBridge(Node): + """Bridge orchestrator state to face display expressions.""" + + # State to face expression ID mapping + STATE_TO_FACE_ID = { + "IDLE": 0, # Tracking + "LISTENING": 1, # Alert + "THINKING": 3, # Searching + "SPEAKING": 4, # Social + "THROTTLED": 0, # Fallback to Tracking + } + + def __init__(self): + super().__init__("face_bridge") + + # Parameters + self.declare_parameter("face_server_url", "http://localhost:3000/face/1") + self.declare_parameter("http_timeout", 2.0) + self.declare_parameter("update_interval", 0.1) + + self.face_server_url = self.get_parameter("face_server_url").value + self.http_timeout = self.get_parameter("http_timeout").value + self.update_interval = self.get_parameter("update_interval").value + + # Try to import requests, fallback to urllib if unavailable + try: + import requests + self.requests = requests + self.use_requests = True + except ImportError: + import urllib.request + import urllib.error + self.urllib = urllib.request + self.urllib_error = urllib.error + self.use_requests = False + + # State + self.current_state = "IDLE" + self.current_face_id = 0 + self.wake_word_active = False + self.last_update_time = time.time() + self.state_lock = threading.Lock() + + # Subscriptions + self.create_subscription(String, "/social/orchestrator/state", self._on_state_update, 10) + self.create_subscription(Bool, "/saltybot/wake_word_detected", self._on_wake_word, 10) + + # Publishers + self.pub_state = self.create_publisher(String, "/face/state", 10) + + # Timer for update loop + self.create_timer(self.update_interval, self._update_face) + + self.get_logger().info( + f"Face bridge initialized: face_server_url={self.face_server_url}" + ) + + def _on_state_update(self, msg: String) -> None: + """Handle orchestrator state update.""" + try: + data = json.loads(msg.data) + new_state = data.get("state", "IDLE").upper() + + # Validate state + if new_state in self.STATE_TO_FACE_ID: + with self.state_lock: + self.current_state = new_state + self.get_logger().debug(f"State updated: {new_state}") + else: + self.get_logger().warn(f"Unknown state: {new_state}") + except json.JSONDecodeError: + self.get_logger().error(f"Invalid JSON in state update: {msg.data}") + + def _on_wake_word(self, msg: Bool) -> None: + """Handle wake word detection - immediate switch to Alert.""" + if msg.data: + with self.state_lock: + self.wake_word_active = True + self.get_logger().info("Wake word detected - switching to Alert") + + def _get_face_id(self) -> int: + """Get current face expression ID based on state.""" + with self.state_lock: + if self.wake_word_active: + face_id = 1 # Alert + # Clear wake word after one update + self.wake_word_active = False + else: + face_id = self.STATE_TO_FACE_ID.get(self.current_state, 0) + + return face_id + + def _send_face_command(self, face_id: int) -> bool: + """Send face expression command to display server. + + Args: + face_id: Expression ID (0-4) + + Returns: + True if successful + """ + try: + if self.use_requests: + response = self.requests.get( + self.face_server_url.format(id=face_id), + timeout=self.http_timeout + ) + return response.status_code == 200 + else: + url = self.face_server_url.format(id=face_id) + req = self.urllib.Request(url) + with self.urllib.urlopen(req, timeout=self.http_timeout) as response: + return response.status == 200 + except Exception as e: + self.get_logger().error(f"Failed to update face display: {e}") + return False + + def _update_face(self) -> None: + """Update face expression based on current state.""" + face_id = self._get_face_id() + + # Only send if changed + if face_id != self.current_face_id: + if self._send_face_command(face_id): + self.current_face_id = face_id + self.last_update_time = time.time() + + # Publish state + with self.state_lock: + state_msg = String( + data=json.dumps({ + "face_id": face_id, + "orchestrator_state": self.current_state, + "timestamp": self.last_update_time + }) + ) + self.pub_state.publish(state_msg) + self.get_logger().debug(f"Face updated: {face_id}") + + +def main(args=None): + rclpy.init(args=args) + node = FaceDisplayBridge() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/setup.cfg b/jetson/ros2_ws/src/saltybot_face_bridge/setup.cfg new file mode 100644 index 0000000..2923e00 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/setup.cfg @@ -0,0 +1,4 @@ +[develop] +script-dir=$base/lib/saltybot_face_bridge +[egg_info] +tag_date = 0 diff --git a/jetson/ros2_ws/src/saltybot_face_bridge/setup.py b/jetson/ros2_ws/src/saltybot_face_bridge/setup.py new file mode 100644 index 0000000..e297bf9 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_face_bridge/setup.py @@ -0,0 +1,27 @@ +from setuptools import setup + +package_name = "saltybot_face_bridge" + +setup( + name=package_name, + version="0.1.0", + packages=[package_name], + data_files=[ + ("share/ament_index/resource_index/packages", [f"resource/{package_name}"]), + (f"share/{package_name}", ["package.xml"]), + (f"share/{package_name}/launch", ["launch/face_bridge.launch.py"]), + (f"share/{package_name}/config", ["config/face_bridge_params.yaml"]), + ], + install_requires=["setuptools"], + zip_safe=True, + maintainer="sl-controls", + maintainer_email="sl-controls@saltylab.local", + description="Face display bridge for orchestrator state mapping", + license="MIT", + tests_require=["pytest"], + entry_points={ + "console_scripts": [ + "face_bridge_node = saltybot_face_bridge.face_bridge_node:main", + ], + }, +) diff --git a/jetson/ros2_ws/src/saltybot_social/config/wake_word_params.yaml b/jetson/ros2_ws/src/saltybot_social/config/wake_word_params.yaml index d3c5032..c49644f 100644 --- a/jetson/ros2_ws/src/saltybot_social/config/wake_word_params.yaml +++ b/jetson/ros2_ws/src/saltybot_social/config/wake_word_params.yaml @@ -13,7 +13,7 @@ wake_word_node: # Path to .npy template file (log-mel features of 'hey salty' recording). # Leave empty for passive mode (no detections fired). - template_path: "" # e.g. "/opt/saltybot/models/hey_salty.npy" + template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy" # Issue #393 n_fft: 512 # FFT size for mel spectrogram n_mels: 40 # mel filterbank bands diff --git a/jetson/ros2_ws/src/saltybot_social/models/README.md b/jetson/ros2_ws/src/saltybot_social/models/README.md new file mode 100644 index 0000000..243391e --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_social/models/README.md @@ -0,0 +1,118 @@ +# SaltyBot Wake Word Models + +## Current Model: hey_salty.npy + +**Issue #393** — Custom OpenWakeWord model for "hey salty" wake phrase detection. + +### Model Details + +- **File**: `hey_salty.npy` +- **Type**: Log-mel spectrogram template (numpy array) +- **Shape**: `(40, 61)` — 40 mel bands, ~61 time frames +- **Generation Method**: Synthetic speech using sine-wave approximation +- **Integration**: Used by `wake_word_node.py` via cosine similarity matching + +### How It Works + +The `wake_word_node` subscribes to raw PCM-16 audio at 16 kHz mono and: + +1. Maintains a sliding window of the last 1.5 seconds of audio +2. Extracts log-mel spectrogram features every 100 ms +3. Compares the log-mel features to this template via cosine similarity +4. Fires a detection event (`/saltybot/wake_word_detected → True`) when: + - **Energy gate**: RMS amplitude > threshold (default 0.02) + - **Match gate**: Cosine similarity > threshold (default 0.82) +5. Applies cooldown (default 2.0 s) to prevent rapid re-fires + +### Configuration (wake_word_params.yaml) + +```yaml +template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy" +energy_threshold: 0.02 # RMS gate +match_threshold: 0.82 # cosine-similarity threshold +cooldown_s: 2.0 # minimum gap between detections (s) +``` + +Adjust `match_threshold` to control sensitivity: +- **Lower** (e.g., 0.75) → more sensitive, higher false-positive rate +- **Higher** (e.g., 0.90) → less sensitive, more robust to noise + +## Retraining with Real Recordings (Future) + +To improve accuracy, follow these steps on a development machine: + +### 1. Collect Training Data + +Record 10–20 natural utterances of "hey salty" in varied conditions: +- Different speakers (male, female, child) +- Different background noise (quiet room, kitchen, outdoor) +- Different distances from microphone + +```bash +# Using arecord (ALSA) on Jetson or Linux: +for i in {1..20}; do + echo "Recording sample $i. Say 'hey salty'..." + arecord -r 16000 -f S16_LE -c 1 "hey_salty_${i}.wav" +done +``` + +### 2. Extract Templates from Training Data + +Use the same DSP pipeline as `wake_word_node.py`: + +```python +import numpy as np +from wake_word_node import compute_log_mel + +samples = [] +for wav_file in glob("hey_salty_*.wav"): + sr, data = scipy.io.wavfile.read(wav_file) + # Resample to 16kHz if needed + float_data = data / 32768.0 # convert PCM-16 to [-1, 1] + log_mel = compute_log_mel(float_data, sr=16000, n_fft=512, n_mels=40) + samples.append(log_mel) + +# Pad to same length, average +max_len = max(m.shape[1] for m in samples) +padded = [np.pad(m, ((0, 0), (0, max_len - m.shape[1])), mode='edge') + for m in samples] +template = np.mean(padded, axis=0).astype(np.float32) +np.save("hey_salty.npy", template) +``` + +### 3. Test and Tune + +1. Replace the current template with your new one +2. Test with `wake_word_node` in real environment +3. Adjust `match_threshold` in `wake_word_params.yaml` to find the sweet spot +4. Collect false-positive and false-negative cases; add them to training set +5. Retrain + +### 4. Version Control + +Once satisfied, replace `models/hey_salty.npy` and commit: + +```bash +git add jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy +git commit -m "refactor: hey salty template with real training data (v2)" +``` + +## Files + +- `generate_wake_word_template.py` — Script to synthesize and generate template +- `hey_salty.npy` — Current template (generated from synthetic speech) +- `README.md` — This file + +## References + +- `wake_word_node.py` — Wake word detection node (cosine similarity, energy gating) +- `wake_word_params.yaml` — Detection parameters +- `test_wake_word.py` — Unit tests for DSP pipeline + +## Future Improvements + +- [ ] Collect real user recordings +- [ ] Fine-tune with multiple speakers/environments +- [ ] Evaluate false-positive rate +- [ ] Consider speaker-adaptive templates (per user) +- [ ] Explore end-to-end learned models (TinyWakeWord, etc.) diff --git a/jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy b/jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy new file mode 100644 index 0000000..2b34efc Binary files /dev/null and b/jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy differ diff --git a/jetson/ros2_ws/src/saltybot_social/scripts/generate_wake_word_template.py b/jetson/ros2_ws/src/saltybot_social/scripts/generate_wake_word_template.py new file mode 100644 index 0000000..36d4a8a --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_social/scripts/generate_wake_word_template.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +generate_wake_word_template.py — Generate 'hey salty' wake word template for Issue #393. + +Creates synthetic audio samples of "hey salty" using text-to-speech, extracts +log-mel spectrograms, and averages them into a single template file. + +Usage: + python3 generate_wake_word_template.py --output-dir path/to/models/ + +The template is saved as hey_salty.npy (log-mel [n_mels, T] array). +""" + +import argparse +import sys +from pathlib import Path + +try: + import numpy as np +except ImportError: + print("ERROR: numpy not found. Install: pip install numpy") + sys.exit(1) + + +# ── Copy of DSP functions from wake_word_node.py ──────────────────────────────── + +def mel_filterbank(sr: int, n_fft: int, n_mels: int, + fmin: float = 80.0, fmax = None) -> np.ndarray: + """Build a triangular mel filterbank matrix [n_mels, n_fft//2+1].""" + import math + if fmax is None: + fmax = sr / 2.0 + + def hz_to_mel(hz: float) -> float: + return 2595.0 * math.log10(1.0 + hz / 700.0) + + def mel_to_hz(mel: float) -> float: + return 700.0 * (10.0 ** (mel / 2595.0) - 1.0) + + mel_lo = hz_to_mel(fmin) + mel_hi = hz_to_mel(fmax) + mel_pts = np.linspace(mel_lo, mel_hi, n_mels + 2) + hz_pts = np.array([mel_to_hz(m) for m in mel_pts]) + freqs = np.fft.rfftfreq(n_fft, d=1.0 / sr) + + fb = np.zeros((n_mels, len(freqs)), dtype=np.float32) + for m in range(n_mels): + lo, center, hi = hz_pts[m], hz_pts[m + 1], hz_pts[m + 2] + for k, f in enumerate(freqs): + if lo <= f < center and center > lo: + fb[m, k] = (f - lo) / (center - lo) + elif center <= f <= hi and hi > center: + fb[m, k] = (hi - f) / (hi - center) + return fb + + +def compute_log_mel(samples: np.ndarray, sr: int, + n_fft: int = 512, n_mels: int = 40, + hop: int = 256) -> np.ndarray: + """Return log-mel spectrogram [n_mels, T] of *samples* (float32 [-1,1]).""" + n = len(samples) + window = np.hanning(n_fft).astype(np.float32) + frames = [] + for start in range(0, max(n - n_fft + 1, 1), hop): + chunk = samples[start:start + n_fft] + if len(chunk) < n_fft: + chunk = np.pad(chunk, (0, n_fft - len(chunk))) + power = np.abs(np.fft.rfft(chunk * window)) ** 2 + frames.append(power) + frames_arr = np.array(frames, dtype=np.float32).T # [bins, T] + fb = mel_filterbank(sr, n_fft, n_mels) + mel = fb @ frames_arr # [n_mels, T] + mel = np.where(mel > 1e-10, mel, 1e-10) + return np.log(mel) + + +# ── TTS + Template Generation ────────────────────────────────────────────────── + +def generate_synthetic_speech(text: str, num_samples: int = 5) -> list: + """ + Generate synthetic speech samples of `text` using pyttsx3 or fallback. + + Returns list of float32 numpy arrays (mono, 16kHz). + """ + try: + import pyttsx3 + engine = pyttsx3.init() + engine.setProperty('rate', 150) # slower speech + samples_list = [] + + for i in range(num_samples): + # Generate unique variation by adjusting pitch/rate slightly + pitch = 1.0 + (i * 0.05 - 0.1) # ±10% pitch variation + engine.setProperty('pitch', max(0.5, min(2.0, pitch))) + + # Save to temporary WAV + wav_path = f"/tmp/hey_salty_{i}.wav" + engine.save_to_file(text, wav_path) + engine.runAndWait() + + # Load WAV and convert to 16kHz if needed + try: + import scipy.io.wavfile as wavfile + sr, data = wavfile.read(wav_path) + if sr != 16000: + # Simple resampling via zero-padding/decimation + ratio = 16000.0 / sr + new_len = int(len(data) * ratio) + indices = np.linspace(0, len(data) - 1, new_len) + data = np.interp(indices, np.arange(len(data)), data.astype(np.float32)) + # Normalize to [-1, 1] + if np.max(np.abs(data)) > 0: + data = data / (np.max(np.abs(data)) + 1e-6) + samples_list.append(data.astype(np.float32)) + except Exception as e: + print(f" Warning: could not load {wav_path}: {e}") + + if samples_list: + return samples_list + else: + raise Exception("No samples generated") + + except ImportError: + print(" pyttsx3 not available; generating synthetic sine-wave approximation...") + # Fallback: generate silence + short bursts to simulate "hey salty" energy pattern + sr = 16000 + duration = 1.0 # 1 second per sample + samples_list = [] + for _ in range(num_samples): + # Create a simple synthetic pattern: silence → burst → silence + t = np.linspace(0, duration, int(sr * duration), dtype=np.float32) + # Two "peaks" to mimic syllables "hey" and "salty" + sig = np.sin(2 * np.pi * 500 * t) * (np.exp(-((t - 0.3) ** 2) / 0.01)) + sig += np.sin(2 * np.pi * 400 * t) * (np.exp(-((t - 0.7) ** 2) / 0.02)) + sig = sig / (np.max(np.abs(sig)) + 1e-6) + samples_list.append(sig) + return samples_list + + +def main(): + parser = argparse.ArgumentParser( + description="Generate 'hey salty' wake word template for wake_word_node") + parser.add_argument("--output-dir", default="jetson/ros2_ws/src/saltybot_social/models/", + help="Directory to save hey_salty.npy") + parser.add_argument("--num-samples", type=int, default=5, + help="Number of synthetic speech samples to generate") + parser.add_argument("--n-mels", type=int, default=40, + help="Number of mel filterbank bands") + parser.add_argument("--n-fft", type=int, default=512, + help="FFT size for mel spectrogram") + + args = parser.parse_args() + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + print(f"Generating {args.num_samples} synthetic 'hey salty' samples...") + samples_list = generate_synthetic_speech("hey salty", args.num_samples) + + if not samples_list: + print("ERROR: Failed to generate samples") + sys.exit(1) + + print(f" Generated {len(samples_list)} samples") + + # Extract log-mel features for each sample + print("Extracting log-mel spectrograms...") + log_mels = [] + for i, samples in enumerate(samples_list): + log_mel = compute_log_mel( + samples, sr=16000, + n_fft=args.n_fft, n_mels=args.n_mels, hop=256 + ) + log_mels.append(log_mel) + print(f" Sample {i}: shape {log_mel.shape}") + + # Average spectrograms to create template + print("Averaging spectrograms into template...") + # Pad to same length + max_len = max(m.shape[1] for m in log_mels) + padded = [] + for log_mel in log_mels: + if log_mel.shape[1] < max_len: + pad_width = ((0, 0), (0, max_len - log_mel.shape[1])) + log_mel = np.pad(log_mel, pad_width, mode='edge') + padded.append(log_mel) + + template = np.mean(padded, axis=0).astype(np.float32) + print(f" Template shape: {template.shape}") + + # Save template + output_path = output_dir / "hey_salty.npy" + np.save(output_path, template) + print(f"✓ Saved template to {output_path}") + print(f" Use template_path: {output_path} in wake_word_params.yaml") + + +if __name__ == "__main__": + main()