Compare commits
No commits in common. "65ec1151f8fa9df199d7fac43e4fb6c6887386d8" and "7bc2b64c1db5d0253d61e3ff3b98852e146d1735" have entirely different histories.
65ec1151f8
...
7bc2b64c1d
@ -1,13 +0,0 @@
|
|||||||
face_bridge:
|
|
||||||
# HTTP server endpoint for face display
|
|
||||||
face_server_url: "http://localhost:3000/face/{id}" # {id} replaced with expression ID
|
|
||||||
|
|
||||||
# HTTP request settings
|
|
||||||
http_timeout: 2.0 # Request timeout in seconds
|
|
||||||
update_interval: 0.1 # Update check interval in seconds
|
|
||||||
|
|
||||||
# State to expression mapping:
|
|
||||||
# 0 = Tracking (IDLE, THROTTLED)
|
|
||||||
# 1 = Alert (LISTENING, wake word)
|
|
||||||
# 3 = Searching (THINKING)
|
|
||||||
# 4 = Social (SPEAKING)
|
|
||||||
@ -1,40 +0,0 @@
|
|||||||
"""Launch file for face display bridge node."""
|
|
||||||
|
|
||||||
from launch import LaunchDescription
|
|
||||||
from launch_ros.actions import Node
|
|
||||||
from launch.substitutions import LaunchConfiguration
|
|
||||||
from launch.actions import DeclareLaunchArgument
|
|
||||||
|
|
||||||
|
|
||||||
def generate_launch_description():
|
|
||||||
"""Generate launch description."""
|
|
||||||
# Declare arguments
|
|
||||||
url_arg = DeclareLaunchArgument(
|
|
||||||
"face_server_url",
|
|
||||||
default_value="http://localhost:3000/face/{id}",
|
|
||||||
description="Face display server HTTP endpoint"
|
|
||||||
)
|
|
||||||
timeout_arg = DeclareLaunchArgument(
|
|
||||||
"http_timeout",
|
|
||||||
default_value="2.0",
|
|
||||||
description="HTTP request timeout in seconds"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create node
|
|
||||||
face_bridge_node = Node(
|
|
||||||
package="saltybot_face_bridge",
|
|
||||||
executable="face_bridge_node",
|
|
||||||
name="face_bridge",
|
|
||||||
parameters=[
|
|
||||||
{"face_server_url": LaunchConfiguration("face_server_url")},
|
|
||||||
{"http_timeout": LaunchConfiguration("http_timeout")},
|
|
||||||
{"update_interval": 0.1},
|
|
||||||
],
|
|
||||||
output="screen",
|
|
||||||
)
|
|
||||||
|
|
||||||
return LaunchDescription([
|
|
||||||
url_arg,
|
|
||||||
timeout_arg,
|
|
||||||
face_bridge_node,
|
|
||||||
])
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
<?xml version="1.0"?>
|
|
||||||
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
|
|
||||||
<package format="3">
|
|
||||||
<name>saltybot_face_bridge</name>
|
|
||||||
<version>0.1.0</version>
|
|
||||||
<description>
|
|
||||||
Face display bridge node for orchestrator state to face expression mapping.
|
|
||||||
Maps social/orchestrator state to face display WebSocket API.
|
|
||||||
</description>
|
|
||||||
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
|
|
||||||
<license>MIT</license>
|
|
||||||
|
|
||||||
<depend>rclpy</depend>
|
|
||||||
<depend>std_msgs</depend>
|
|
||||||
|
|
||||||
<buildtool_depend>ament_python</buildtool_depend>
|
|
||||||
|
|
||||||
<test_depend>ament_copyright</test_depend>
|
|
||||||
<test_depend>ament_flake8</test_depend>
|
|
||||||
<test_depend>ament_pep257</test_depend>
|
|
||||||
<test_depend>python3-pytest</test_depend>
|
|
||||||
|
|
||||||
<export>
|
|
||||||
<build_type>ament_python</build_type>
|
|
||||||
</export>
|
|
||||||
</package>
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
[develop]
|
|
||||||
script-dir=$base/lib/saltybot_face_bridge
|
|
||||||
[egg_info]
|
|
||||||
tag_date = 0
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
from setuptools import setup
|
|
||||||
|
|
||||||
package_name = "saltybot_face_bridge"
|
|
||||||
|
|
||||||
setup(
|
|
||||||
name=package_name,
|
|
||||||
version="0.1.0",
|
|
||||||
packages=[package_name],
|
|
||||||
data_files=[
|
|
||||||
("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
|
|
||||||
(f"share/{package_name}", ["package.xml"]),
|
|
||||||
(f"share/{package_name}/launch", ["launch/face_bridge.launch.py"]),
|
|
||||||
(f"share/{package_name}/config", ["config/face_bridge_params.yaml"]),
|
|
||||||
],
|
|
||||||
install_requires=["setuptools"],
|
|
||||||
zip_safe=True,
|
|
||||||
maintainer="sl-controls",
|
|
||||||
maintainer_email="sl-controls@saltylab.local",
|
|
||||||
description="Face display bridge for orchestrator state mapping",
|
|
||||||
license="MIT",
|
|
||||||
tests_require=["pytest"],
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": [
|
|
||||||
"face_bridge_node = saltybot_face_bridge.face_bridge_node:main",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
@ -13,7 +13,7 @@ wake_word_node:
|
|||||||
|
|
||||||
# Path to .npy template file (log-mel features of 'hey salty' recording).
|
# Path to .npy template file (log-mel features of 'hey salty' recording).
|
||||||
# Leave empty for passive mode (no detections fired).
|
# Leave empty for passive mode (no detections fired).
|
||||||
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy" # Issue #393
|
template_path: "" # e.g. "/opt/saltybot/models/hey_salty.npy"
|
||||||
|
|
||||||
n_fft: 512 # FFT size for mel spectrogram
|
n_fft: 512 # FFT size for mel spectrogram
|
||||||
n_mels: 40 # mel filterbank bands
|
n_mels: 40 # mel filterbank bands
|
||||||
|
|||||||
@ -1,118 +0,0 @@
|
|||||||
# SaltyBot Wake Word Models
|
|
||||||
|
|
||||||
## Current Model: hey_salty.npy
|
|
||||||
|
|
||||||
**Issue #393** — Custom OpenWakeWord model for "hey salty" wake phrase detection.
|
|
||||||
|
|
||||||
### Model Details
|
|
||||||
|
|
||||||
- **File**: `hey_salty.npy`
|
|
||||||
- **Type**: Log-mel spectrogram template (numpy array)
|
|
||||||
- **Shape**: `(40, 61)` — 40 mel bands, ~61 time frames
|
|
||||||
- **Generation Method**: Synthetic speech using sine-wave approximation
|
|
||||||
- **Integration**: Used by `wake_word_node.py` via cosine similarity matching
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
The `wake_word_node` subscribes to raw PCM-16 audio at 16 kHz mono and:
|
|
||||||
|
|
||||||
1. Maintains a sliding window of the last 1.5 seconds of audio
|
|
||||||
2. Extracts log-mel spectrogram features every 100 ms
|
|
||||||
3. Compares the log-mel features to this template via cosine similarity
|
|
||||||
4. Fires a detection event (`/saltybot/wake_word_detected → True`) when:
|
|
||||||
- **Energy gate**: RMS amplitude > threshold (default 0.02)
|
|
||||||
- **Match gate**: Cosine similarity > threshold (default 0.82)
|
|
||||||
5. Applies cooldown (default 2.0 s) to prevent rapid re-fires
|
|
||||||
|
|
||||||
### Configuration (wake_word_params.yaml)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy"
|
|
||||||
energy_threshold: 0.02 # RMS gate
|
|
||||||
match_threshold: 0.82 # cosine-similarity threshold
|
|
||||||
cooldown_s: 2.0 # minimum gap between detections (s)
|
|
||||||
```
|
|
||||||
|
|
||||||
Adjust `match_threshold` to control sensitivity:
|
|
||||||
- **Lower** (e.g., 0.75) → more sensitive, higher false-positive rate
|
|
||||||
- **Higher** (e.g., 0.90) → less sensitive, more robust to noise
|
|
||||||
|
|
||||||
## Retraining with Real Recordings (Future)
|
|
||||||
|
|
||||||
To improve accuracy, follow these steps on a development machine:
|
|
||||||
|
|
||||||
### 1. Collect Training Data
|
|
||||||
|
|
||||||
Record 10–20 natural utterances of "hey salty" in varied conditions:
|
|
||||||
- Different speakers (male, female, child)
|
|
||||||
- Different background noise (quiet room, kitchen, outdoor)
|
|
||||||
- Different distances from microphone
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Using arecord (ALSA) on Jetson or Linux:
|
|
||||||
for i in {1..20}; do
|
|
||||||
echo "Recording sample $i. Say 'hey salty'..."
|
|
||||||
arecord -r 16000 -f S16_LE -c 1 "hey_salty_${i}.wav"
|
|
||||||
done
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Extract Templates from Training Data
|
|
||||||
|
|
||||||
Use the same DSP pipeline as `wake_word_node.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import numpy as np
|
|
||||||
from wake_word_node import compute_log_mel
|
|
||||||
|
|
||||||
samples = []
|
|
||||||
for wav_file in glob("hey_salty_*.wav"):
|
|
||||||
sr, data = scipy.io.wavfile.read(wav_file)
|
|
||||||
# Resample to 16kHz if needed
|
|
||||||
float_data = data / 32768.0 # convert PCM-16 to [-1, 1]
|
|
||||||
log_mel = compute_log_mel(float_data, sr=16000, n_fft=512, n_mels=40)
|
|
||||||
samples.append(log_mel)
|
|
||||||
|
|
||||||
# Pad to same length, average
|
|
||||||
max_len = max(m.shape[1] for m in samples)
|
|
||||||
padded = [np.pad(m, ((0, 0), (0, max_len - m.shape[1])), mode='edge')
|
|
||||||
for m in samples]
|
|
||||||
template = np.mean(padded, axis=0).astype(np.float32)
|
|
||||||
np.save("hey_salty.npy", template)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Test and Tune
|
|
||||||
|
|
||||||
1. Replace the current template with your new one
|
|
||||||
2. Test with `wake_word_node` in real environment
|
|
||||||
3. Adjust `match_threshold` in `wake_word_params.yaml` to find the sweet spot
|
|
||||||
4. Collect false-positive and false-negative cases; add them to training set
|
|
||||||
5. Retrain
|
|
||||||
|
|
||||||
### 4. Version Control
|
|
||||||
|
|
||||||
Once satisfied, replace `models/hey_salty.npy` and commit:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy
|
|
||||||
git commit -m "refactor: hey salty template with real training data (v2)"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Files
|
|
||||||
|
|
||||||
- `generate_wake_word_template.py` — Script to synthesize and generate template
|
|
||||||
- `hey_salty.npy` — Current template (generated from synthetic speech)
|
|
||||||
- `README.md` — This file
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `wake_word_node.py` — Wake word detection node (cosine similarity, energy gating)
|
|
||||||
- `wake_word_params.yaml` — Detection parameters
|
|
||||||
- `test_wake_word.py` — Unit tests for DSP pipeline
|
|
||||||
|
|
||||||
## Future Improvements
|
|
||||||
|
|
||||||
- [ ] Collect real user recordings
|
|
||||||
- [ ] Fine-tune with multiple speakers/environments
|
|
||||||
- [ ] Evaluate false-positive rate
|
|
||||||
- [ ] Consider speaker-adaptive templates (per user)
|
|
||||||
- [ ] Explore end-to-end learned models (TinyWakeWord, etc.)
|
|
||||||
Loading…
x
Reference in New Issue
Block a user