feat: face display bridge (Issue #394) #399

Merged
sl-jetson merged 2 commits from sl-controls/issue-394-face-bridge into main 2026-03-04 13:11:02 -05:00
4 changed files with 319 additions and 1 deletions
Showing only changes of commit 39258f465b - Show all commits

View File

@ -13,7 +13,7 @@ wake_word_node:
# Path to .npy template file (log-mel features of 'hey salty' recording).
# Leave empty for passive mode (no detections fired).
template_path: "" # e.g. "/opt/saltybot/models/hey_salty.npy"
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy" # Issue #393
n_fft: 512 # FFT size for mel spectrogram
n_mels: 40 # mel filterbank bands

View File

@ -0,0 +1,118 @@
# SaltyBot Wake Word Models
## Current Model: hey_salty.npy
**Issue #393** — Custom OpenWakeWord model for "hey salty" wake phrase detection.
### Model Details
- **File**: `hey_salty.npy`
- **Type**: Log-mel spectrogram template (numpy array)
- **Shape**: `(40, 61)` — 40 mel bands, ~61 time frames
- **Generation Method**: Synthetic speech using sine-wave approximation
- **Integration**: Used by `wake_word_node.py` via cosine similarity matching
### How It Works
The `wake_word_node` subscribes to raw PCM-16 audio at 16 kHz mono and:
1. Maintains a sliding window of the last 1.5 seconds of audio
2. Extracts log-mel spectrogram features every 100 ms
3. Compares the log-mel features to this template via cosine similarity
4. Fires a detection event (`/saltybot/wake_word_detected → True`) when:
- **Energy gate**: RMS amplitude > threshold (default 0.02)
- **Match gate**: Cosine similarity > threshold (default 0.82)
5. Applies cooldown (default 2.0 s) to prevent rapid re-fires
### Configuration (wake_word_params.yaml)
```yaml
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy"
energy_threshold: 0.02 # RMS gate
match_threshold: 0.82 # cosine-similarity threshold
cooldown_s: 2.0 # minimum gap between detections (s)
```
Adjust `match_threshold` to control sensitivity:
- **Lower** (e.g., 0.75) → more sensitive, higher false-positive rate
- **Higher** (e.g., 0.90) → less sensitive: more robust to noise (fewer false positives), but more missed detections
## Retraining with Real Recordings (Future)
To improve accuracy, follow these steps on a development machine:
### 1. Collect Training Data
Record 10–20 natural utterances of "hey salty" in varied conditions:
- Different speakers (male, female, child)
- Different background noise (quiet room, kitchen, outdoor)
- Different distances from microphone
```bash
# Using arecord (ALSA) on Jetson or Linux.
# -d 2 stops each take after 2 seconds; without it arecord keeps recording
# until it is interrupted by hand.
for i in {1..20}; do
    echo "Recording sample $i. Say 'hey salty'..."
    arecord -d 2 -r 16000 -f S16_LE -c 1 "hey_salty_${i}.wav"
done
```
### 2. Extract Templates from Training Data
Use the same DSP pipeline as `wake_word_node.py`:
```python
from glob import glob

import numpy as np
import scipy.io.wavfile
from wake_word_node import compute_log_mel

TARGET_SR = 16000  # sample rate the recordings above are captured at

samples = []
for wav_file in sorted(glob("hey_salty_*.wav")):
    sr, data = scipy.io.wavfile.read(wav_file)
    # The features below are computed for 16 kHz audio; refuse anything else
    # instead of silently producing a mismatched template.
    if sr != TARGET_SR:
        raise ValueError(
            f"{wav_file}: expected {TARGET_SR} Hz, got {sr} Hz; resample it first")
    float_data = data / 32768.0  # convert PCM-16 to [-1, 1]
    log_mel = compute_log_mel(float_data, sr=TARGET_SR, n_fft=512, n_mels=40)
    samples.append(log_mel)

if not samples:
    raise SystemExit("No hey_salty_*.wav recordings found in the current directory")

# Pad to same length (repeating the last frame), then average
max_len = max(m.shape[1] for m in samples)
padded = [np.pad(m, ((0, 0), (0, max_len - m.shape[1])), mode='edge')
          for m in samples]
template = np.mean(padded, axis=0).astype(np.float32)
np.save("hey_salty.npy", template)
```
### 3. Test and Tune
1. Replace the current template with your new one
2. Test with `wake_word_node` in real environment
3. Adjust `match_threshold` in `wake_word_params.yaml` to find the sweet spot
4. Collect false-positive and false-negative cases; add them to training set
5. Retrain
### 4. Version Control
Once satisfied, replace `models/hey_salty.npy` and commit:
```bash
git add jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy
git commit -m "refactor: hey salty template with real training data (v2)"
```
## Files
- `generate_wake_word_template.py` — Script to synthesize and generate template
- `hey_salty.npy` — Current template (generated from synthetic speech)
- `README.md` — This file
## References
- `wake_word_node.py` — Wake word detection node (cosine similarity, energy gating)
- `wake_word_params.yaml` — Detection parameters
- `test_wake_word.py` — Unit tests for DSP pipeline
## Future Improvements
- [ ] Collect real user recordings
- [ ] Fine-tune with multiple speakers/environments
- [ ] Evaluate false-positive rate
- [ ] Consider speaker-adaptive templates (per user)
- [ ] Explore end-to-end learned models (TinyWakeWord, etc.)

View File

@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
generate_wake_word_template.py — Generate 'hey salty' wake word template for Issue #393.
Creates synthetic audio samples of "hey salty" using text-to-speech (or a sine-wave
approximation when pyttsx3 is unavailable), extracts log-mel spectrograms, and averages them into a single template file.
Usage:
python3 generate_wake_word_template.py --output-dir path/to/models/
The template is saved as hey_salty.npy (log-mel [n_mels, T] array).
"""
import argparse
import sys
from pathlib import Path
try:
import numpy as np
except ImportError:
print("ERROR: numpy not found. Install: pip install numpy")
sys.exit(1)
# ── Copy of DSP functions from wake_word_node.py ────────────────────────────────
def mel_filterbank(sr: int, n_fft: int, n_mels: int,
                   fmin: float = 80.0, fmax = None) -> np.ndarray:
    """Construct a bank of triangular, mel-spaced filters.

    The band edges are ``n_mels + 2`` points spaced evenly on the mel scale
    between *fmin* and *fmax*; *fmax* defaults to ``sr / 2`` when it is None.
    Each filter ramps linearly from 0 at its lower edge up to its centre
    frequency and back down to 0 at its upper edge.

    Returns:
        float32 matrix of shape ``[n_mels, n_fft // 2 + 1]``, one row per
        mel band and one column per rFFT bin.
    """
    import math

    top_hz = sr / 2.0 if fmax is None else fmax

    def to_mel(hz: float) -> float:
        return 2595.0 * math.log10(1.0 + hz / 700.0)

    def to_hz(mel: float) -> float:
        return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)

    edges_mel = np.linspace(to_mel(fmin), to_mel(top_hz), n_mels + 2)
    edges_hz = np.array([to_hz(m) for m in edges_mel])
    bin_hz = np.fft.rfftfreq(n_fft, d=1.0 / sr)

    bank = np.zeros((n_mels, len(bin_hz)), dtype=np.float32)
    triples = zip(edges_hz[:-2], edges_hz[1:-1], edges_hz[2:])
    for band, (lo, center, hi) in enumerate(triples):
        # Bins on the rising slope [lo, center) and the falling slope [center, hi].
        rising = (bin_hz >= lo) & (bin_hz < center)
        falling = (bin_hz >= center) & (bin_hz <= hi)
        # A zero-width slope is skipped so it never divides by zero.
        if center > lo:
            bank[band, rising] = (bin_hz[rising] - lo) / (center - lo)
        if hi > center:
            bank[band, falling] = (hi - bin_hz[falling]) / (hi - center)
    return bank
def compute_log_mel(samples: np.ndarray, sr: int,
                    n_fft: int = 512, n_mels: int = 40,
                    hop: int = 256) -> np.ndarray:
    """Compute the log-mel spectrogram of *samples*, shaped [n_mels, T].

    The signal is cut into Hann-windowed frames of *n_fft* samples taken
    every *hop* samples. Only frames that start early enough to be full are
    used, except that an input shorter than *n_fft* still yields a single
    zero-padded frame. Each frame's power spectrum is projected onto the mel
    filterbank, floored at 1e-10, and passed through the natural log.
    """
    taper = np.hanning(n_fft).astype(np.float32)
    # At least one frame is always produced, even for very short input.
    stop = max(len(samples) - n_fft + 1, 1)

    def frame_power(offset: int) -> np.ndarray:
        segment = samples[offset:offset + n_fft]
        shortfall = n_fft - len(segment)
        if shortfall > 0:
            segment = np.pad(segment, (0, shortfall))
        return np.abs(np.fft.rfft(segment * taper)) ** 2

    spectra = [frame_power(offset) for offset in range(0, stop, hop)]
    power_spec = np.array(spectra, dtype=np.float32).T  # [bins, T]
    mel_energy = mel_filterbank(sr, n_fft, n_mels) @ power_spec  # [n_mels, T]
    # Floor before the log so silent bands do not produce -inf.
    floored = np.where(mel_energy > 1e-10, mel_energy, 1e-10)
    return np.log(floored)
# ── TTS + Template Generation ──────────────────────────────────────────────────
def _sine_wave_samples(num_samples: int) -> list:
    """Return *num_samples* copies of a 1 s, 16 kHz two-burst sine signal (float32)."""
    sr = 16000
    duration = 1.0  # 1 second per sample
    t = np.linspace(0, duration, int(sr * duration), dtype=np.float32)
    # Two Gaussian-enveloped tones to mimic the syllables "hey" and "salty".
    sig = np.sin(2 * np.pi * 500 * t) * np.exp(-((t - 0.3) ** 2) / 0.01)
    sig += np.sin(2 * np.pi * 400 * t) * np.exp(-((t - 0.7) ** 2) / 0.02)
    sig = (sig / (np.max(np.abs(sig)) + 1e-6)).astype(np.float32)
    # The signal is deterministic, so build it once and hand out copies.
    return [sig.copy() for _ in range(num_samples)]


def _load_wav_16k_mono(wav_path) -> np.ndarray:
    """Read a WAV file and return it as peak-normalized float32 mono at 16 kHz."""
    import scipy.io.wavfile as wavfile

    sr, data = wavfile.read(wav_path)
    # Convert to float first: np.abs on int16 PCM overflows at -32768.
    data = np.asarray(data, dtype=np.float64)
    if data.ndim > 1:
        # Multi-channel files come back as (frames, channels); downmix to mono.
        data = data.mean(axis=1)
    if sr != 16000:
        # Simple resampling by linear interpolation onto the 16 kHz grid.
        ratio = 16000.0 / sr
        new_len = int(len(data) * ratio)
        positions = np.linspace(0, len(data) - 1, new_len)
        data = np.interp(positions, np.arange(len(data)), data)
    # Normalize to [-1, 1]; an all-zero clip is left untouched.
    peak = np.max(np.abs(data))
    if peak > 0:
        data = data / (peak + 1e-6)
    return data.astype(np.float32)


def _tts_samples(text: str, num_samples: int) -> list:
    """Render *text* with pyttsx3 *num_samples* times and load each rendering."""
    import os
    import tempfile

    import pyttsx3

    engine = pyttsx3.init()
    engine.setProperty('rate', 150)  # slower speech
    samples_list = []
    # A private temporary directory avoids fixed, predictable /tmp paths and
    # removes the intermediate WAV files when we are done.
    with tempfile.TemporaryDirectory(prefix="hey_salty_") as tmp_dir:
        for i in range(num_samples):
            # Vary pitch per sample: 0.90, 0.95, ... (+0.05 each), clamped to [0.5, 2.0].
            # NOTE(review): not every pyttsx3 driver may honour 'pitch'; confirm.
            pitch = 1.0 + (i * 0.05 - 0.1)
            engine.setProperty('pitch', max(0.5, min(2.0, pitch)))
            wav_path = os.path.join(tmp_dir, f"hey_salty_{i}.wav")
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
            try:
                samples_list.append(_load_wav_16k_mono(wav_path))
            except Exception as e:
                # Best effort: one unreadable rendering should not abort the rest.
                print(f" Warning: could not load {wav_path}: {e}")
    if not samples_list:
        raise RuntimeError("No samples generated")
    return samples_list


def generate_synthetic_speech(text: str, num_samples: int = 5) -> list:
    """
    Generate synthetic speech samples of `text` using pyttsx3 or fallback.

    When pyttsx3 can be imported, `text` is rendered `num_samples` times with
    a slightly different pitch setting each time, and every rendering is
    loaded as peak-normalized mono audio at 16 kHz. When the import fails, a
    fixed sine-wave approximation (two enveloped tone bursts over one second)
    is returned instead; `text` is not used in that case.

    Returns list of float32 numpy arrays (mono, 16kHz).

    Raises:
        RuntimeError: pyttsx3 is available but no rendering could be loaded.
    """
    try:
        return _tts_samples(text, num_samples)
    except ImportError:
        print(" pyttsx3 not available; generating synthetic sine-wave approximation...")
        return _sine_wave_samples(num_samples)
def main():
    """Command-line entry point: build and save the 'hey salty' template.

    Parses the options, generates the synthetic samples, converts each one
    to a log-mel spectrogram, edge-pads the spectrograms to a common number
    of frames, averages them, and writes the result as ``hey_salty.npy``
    inside the output directory. Exits with status 1 when no samples were
    generated.
    """
    cli = argparse.ArgumentParser(
        description="Generate 'hey salty' wake word template for wake_word_node")
    cli.add_argument("--output-dir",
                     default="jetson/ros2_ws/src/saltybot_social/models/",
                     help="Directory to save hey_salty.npy")
    cli.add_argument("--num-samples", type=int, default=5,
                     help="Number of synthetic speech samples to generate")
    cli.add_argument("--n-mels", type=int, default=40,
                     help="Number of mel filterbank bands")
    cli.add_argument("--n-fft", type=int, default=512,
                     help="FFT size for mel spectrogram")
    args = cli.parse_args()

    # Make sure the destination exists before doing any work.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Generating {args.num_samples} synthetic 'hey salty' samples...")
    samples_list = generate_synthetic_speech("hey salty", args.num_samples)
    if not samples_list:
        print("ERROR: Failed to generate samples")
        sys.exit(1)
    print(f" Generated {len(samples_list)} samples")

    # One log-mel spectrogram per sample.
    print("Extracting log-mel spectrograms...")
    log_mels = []
    for idx, waveform in enumerate(samples_list):
        spec = compute_log_mel(waveform, sr=16000,
                               n_fft=args.n_fft, n_mels=args.n_mels, hop=256)
        log_mels.append(spec)
        print(f" Sample {idx}: shape {spec.shape}")

    # Bring every spectrogram to the longest frame count by repeating its
    # last frame, then average across samples.
    print("Averaging spectrograms into template...")
    widest = max(spec.shape[1] for spec in log_mels)
    padded = [
        spec if spec.shape[1] >= widest
        else np.pad(spec, ((0, 0), (0, widest - spec.shape[1])), mode='edge')
        for spec in log_mels
    ]
    template = np.mean(padded, axis=0).astype(np.float32)
    print(f" Template shape: {template.shape}")

    output_path = output_dir / "hey_salty.npy"
    np.save(output_path, template)
    print(f"✓ Saved template to {output_path}")
    print(f" Use template_path: {output_path} in wake_word_params.yaml")
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()