Compare commits
2 Commits
858ae1e7b9
...
39258f465b
| Author | SHA1 | Date | |
|---|---|---|---|
| 39258f465b | |||
| 82cb2bde79 |
@ -0,0 +1,13 @@
|
|||||||
|
face_bridge:
|
||||||
|
# HTTP server endpoint for face display
|
||||||
|
face_server_url: "http://localhost:3000/face/{id}" # {id} replaced with expression ID
|
||||||
|
|
||||||
|
# HTTP request settings
|
||||||
|
http_timeout: 2.0 # Request timeout in seconds
|
||||||
|
update_interval: 0.1 # Update check interval in seconds
|
||||||
|
|
||||||
|
# State to expression mapping:
|
||||||
|
# 0 = Tracking (IDLE, THROTTLED)
|
||||||
|
# 1 = Alert (LISTENING, wake word)
|
||||||
|
# 3 = Searching (THINKING)
|
||||||
|
# 4 = Social (SPEAKING)
|
||||||
@ -0,0 +1,40 @@
|
|||||||
|
"""Launch file for face display bridge node."""
|
||||||
|
|
||||||
|
from launch import LaunchDescription
|
||||||
|
from launch_ros.actions import Node
|
||||||
|
from launch.substitutions import LaunchConfiguration
|
||||||
|
from launch.actions import DeclareLaunchArgument
|
||||||
|
|
||||||
|
|
||||||
|
def generate_launch_description():
    """Generate the launch description for the face display bridge node.

    Declares launch arguments for the face server URL, HTTP timeout, and
    update interval (all overridable from the command line), then starts
    ``face_bridge_node`` with those values as ROS parameters.
    """
    # Declare arguments so they can be overridden at launch time.
    url_arg = DeclareLaunchArgument(
        "face_server_url",
        default_value="http://localhost:3000/face/{id}",
        description="Face display server HTTP endpoint",
    )
    timeout_arg = DeclareLaunchArgument(
        "http_timeout",
        default_value="2.0",
        description="HTTP request timeout in seconds",
    )
    # Previously hard-coded to 0.1 here; exposed as an argument so it stays
    # consistent with the `update_interval` entry in the params YAML.
    interval_arg = DeclareLaunchArgument(
        "update_interval",
        default_value="0.1",
        description="Face update check interval in seconds",
    )

    # Create node with the declared arguments wired in as parameters.
    face_bridge_node = Node(
        package="saltybot_face_bridge",
        executable="face_bridge_node",
        name="face_bridge",
        parameters=[
            {"face_server_url": LaunchConfiguration("face_server_url")},
            {"http_timeout": LaunchConfiguration("http_timeout")},
            {"update_interval": LaunchConfiguration("update_interval")},
        ],
        output="screen",
    )

    return LaunchDescription([
        url_arg,
        timeout_arg,
        interval_arg,
        face_bridge_node,
    ])
|
||||||
26
jetson/ros2_ws/src/saltybot_face_bridge/package.xml
Normal file
26
jetson/ros2_ws/src/saltybot_face_bridge/package.xml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
|
||||||
|
<package format="3">
|
||||||
|
<name>saltybot_face_bridge</name>
|
||||||
|
<version>0.1.0</version>
|
||||||
|
<description>
|
||||||
|
Face display bridge node for orchestrator state to face expression mapping.
|
||||||
|
Maps social/orchestrator state to the face display HTTP API.
|
||||||
|
</description>
|
||||||
|
<maintainer email="sl-controls@saltylab.local">sl-controls</maintainer>
|
||||||
|
<license>MIT</license>
|
||||||
|
|
||||||
|
<depend>rclpy</depend>
|
||||||
|
<depend>std_msgs</depend>
|
||||||
|
|
||||||
|
<buildtool_depend>ament_python</buildtool_depend>
|
||||||
|
|
||||||
|
<test_depend>ament_copyright</test_depend>
|
||||||
|
<test_depend>ament_flake8</test_depend>
|
||||||
|
<test_depend>ament_pep257</test_depend>
|
||||||
|
<test_depend>python3-pytest</test_depend>
|
||||||
|
|
||||||
|
<export>
|
||||||
|
<build_type>ament_python</build_type>
|
||||||
|
</export>
|
||||||
|
</package>
|
||||||
@ -0,0 +1,186 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Face display bridge node.
|
||||||
|
|
||||||
|
Maps orchestrator state to face expressions via HTTP WebSocket API.
|
||||||
|
Bridges /social/orchestrator/state and /saltybot/wake_word_detected to
|
||||||
|
face display server (localhost:3000/face/{id}).
|
||||||
|
|
||||||
|
State mapping:
|
||||||
|
IDLE → 0 (Tracking)
|
||||||
|
LISTENING → 1 (Alert)
|
||||||
|
THINKING → 3 (Searching)
|
||||||
|
SPEAKING → 4 (Social)
|
||||||
|
Wake word → 1 (Alert) [immediate override]
|
||||||
|
|
||||||
|
Subscribed topics:
|
||||||
|
/social/orchestrator/state (String) - JSON: {"state": "IDLE|LISTENING|THINKING|SPEAKING|THROTTLED"}
|
||||||
|
/saltybot/wake_word_detected (Bool) - Wake word detection trigger
|
||||||
|
|
||||||
|
Published topics:
|
||||||
|
/face/state (String) - Current face expression ID and status
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import rclpy
|
||||||
|
from rclpy.node import Node
|
||||||
|
from std_msgs.msg import String, Bool
|
||||||
|
|
||||||
|
|
||||||
|
class FaceDisplayBridge(Node):
    """Bridge orchestrator state to face display expressions.

    Maps /social/orchestrator/state (JSON String) and
    /saltybot/wake_word_detected (Bool) to a face expression ID and pushes
    it to the face display server via an HTTP GET request.  The resulting
    face state is republished on /face/state every update tick.
    """

    # State to face expression ID mapping
    STATE_TO_FACE_ID = {
        "IDLE": 0,        # Tracking
        "LISTENING": 1,   # Alert
        "THINKING": 3,    # Searching
        "SPEAKING": 4,    # Social
        "THROTTLED": 0,   # Fallback to Tracking
    }

    def __init__(self):
        super().__init__("face_bridge")

        # Parameters.
        # BUG FIX: the previous default ("http://localhost:3000/face/1") had
        # no "{id}" placeholder, so str.format(id=...) left the URL unchanged
        # and the display was stuck on expression 1 unless the parameter was
        # overridden.  The default now matches the launch and config files.
        self.declare_parameter("face_server_url", "http://localhost:3000/face/{id}")
        self.declare_parameter("http_timeout", 2.0)
        self.declare_parameter("update_interval", 0.1)

        self.face_server_url = self.get_parameter("face_server_url").value
        self.http_timeout = self.get_parameter("http_timeout").value
        self.update_interval = self.get_parameter("update_interval").value

        # Try to import requests, fallback to urllib if unavailable
        try:
            import requests
            self.requests = requests
            self.use_requests = True
        except ImportError:
            import urllib.request
            import urllib.error
            self.urllib = urllib.request
            self.urllib_error = urllib.error
            self.use_requests = False

        # Mutable state shared between callbacks; guarded by state_lock.
        self.current_state = "IDLE"
        self.current_face_id = 0
        self.wake_word_active = False
        self.last_update_time = time.time()
        self.state_lock = threading.Lock()

        # Subscriptions
        self.create_subscription(String, "/social/orchestrator/state", self._on_state_update, 10)
        self.create_subscription(Bool, "/saltybot/wake_word_detected", self._on_wake_word, 10)

        # Publishers
        self.pub_state = self.create_publisher(String, "/face/state", 10)

        # Timer for the periodic update loop
        self.create_timer(self.update_interval, self._update_face)

        self.get_logger().info(
            f"Face bridge initialized: face_server_url={self.face_server_url}"
        )

    def _on_state_update(self, msg: String) -> None:
        """Handle orchestrator state update (JSON: {"state": "..."})."""
        try:
            data = json.loads(msg.data)
            new_state = data.get("state", "IDLE").upper()

            # Only accept states we have a mapping for.
            if new_state in self.STATE_TO_FACE_ID:
                with self.state_lock:
                    self.current_state = new_state
                self.get_logger().debug(f"State updated: {new_state}")
            else:
                # warn() is deprecated in newer rclpy; use warning().
                self.get_logger().warning(f"Unknown state: {new_state}")
        except json.JSONDecodeError:
            self.get_logger().error(f"Invalid JSON in state update: {msg.data}")

    def _on_wake_word(self, msg: Bool) -> None:
        """Handle wake word detection - immediate switch to Alert."""
        if msg.data:
            with self.state_lock:
                self.wake_word_active = True
            self.get_logger().info("Wake word detected - switching to Alert")

    def _get_face_id(self) -> int:
        """Get current face expression ID based on state.

        A pending wake word overrides the mapped state for exactly one
        update cycle, then is cleared.
        """
        with self.state_lock:
            if self.wake_word_active:
                face_id = 1  # Alert
                # Clear wake word after one update
                self.wake_word_active = False
            else:
                face_id = self.STATE_TO_FACE_ID.get(self.current_state, 0)

        return face_id

    def _send_face_command(self, face_id: int) -> bool:
        """Send face expression command to display server.

        Args:
            face_id: Expression ID (0-4)

        Returns:
            True if successful
        """
        try:
            # "{id}" in the URL template is replaced with the expression ID.
            url = self.face_server_url.format(id=face_id)
            if self.use_requests:
                response = self.requests.get(url, timeout=self.http_timeout)
                return response.status_code == 200
            req = self.urllib.Request(url)
            with self.urllib.urlopen(req, timeout=self.http_timeout) as response:
                return response.status == 200
        except Exception as e:
            # Best-effort: log and report failure; caller retries next tick.
            self.get_logger().error(f"Failed to update face display: {e}")
            return False

    def _update_face(self) -> None:
        """Timer callback: push expression changes and publish face state."""
        face_id = self._get_face_id()

        # Only hit the HTTP server when the expression actually changed;
        # on failure current_face_id is left alone so we retry next tick.
        if face_id != self.current_face_id:
            if self._send_face_command(face_id):
                self.current_face_id = face_id
                self.last_update_time = time.time()

        # Publish state every tick so downstream consumers see a heartbeat.
        with self.state_lock:
            state_msg = String(
                data=json.dumps({
                    "face_id": face_id,
                    "orchestrator_state": self.current_state,
                    "timestamp": self.last_update_time
                })
            )
        self.pub_state.publish(state_msg)
        self.get_logger().debug(f"Face updated: {face_id}")
|
||||||
|
|
||||||
|
|
||||||
|
def main(args=None):
    """Entry point: initialize rclpy and spin the bridge until interrupted."""
    rclpy.init(args=args)
    bridge = FaceDisplayBridge()
    try:
        rclpy.spin(bridge)
    except KeyboardInterrupt:
        # Ctrl-C is a normal shutdown path, not an error.
        pass
    finally:
        # Always release the node and the rclpy context.
        bridge.destroy_node()
        rclpy.shutdown()


if __name__ == "__main__":
    main()
|
||||||
4
jetson/ros2_ws/src/saltybot_face_bridge/setup.cfg
Normal file
4
jetson/ros2_ws/src/saltybot_face_bridge/setup.cfg
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
[develop]
|
||||||
|
script-dir=$base/lib/saltybot_face_bridge
|
||||||
|
[egg_info]
|
||||||
|
tag_date = 0
|
||||||
27
jetson/ros2_ws/src/saltybot_face_bridge/setup.py
Normal file
27
jetson/ros2_ws/src/saltybot_face_bridge/setup.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
from setuptools import setup

package_name = "saltybot_face_bridge"

# ament_python package setup for the face display bridge.
setup(
    name=package_name,
    version="0.1.0",
    packages=[package_name],
    # Install the ament resource marker, package manifest, launch file,
    # and parameter YAML into the package share directory.
    data_files=[
        ("share/ament_index/resource_index/packages", [f"resource/{package_name}"]),
        (f"share/{package_name}", ["package.xml"]),
        (f"share/{package_name}/launch", ["launch/face_bridge.launch.py"]),
        (f"share/{package_name}/config", ["config/face_bridge_params.yaml"]),
    ],
    install_requires=["setuptools"],
    zip_safe=True,
    maintainer="sl-controls",
    maintainer_email="sl-controls@saltylab.local",
    description="Face display bridge for orchestrator state mapping",
    license="MIT",
    tests_require=["pytest"],
    # Exposes `ros2 run saltybot_face_bridge face_bridge_node`.
    entry_points={
        "console_scripts": [
            "face_bridge_node = saltybot_face_bridge.face_bridge_node:main",
        ],
    },
)
|
||||||
@ -13,7 +13,7 @@ wake_word_node:
|
|||||||
|
|
||||||
# Path to .npy template file (log-mel features of 'hey salty' recording).
|
# Path to .npy template file (log-mel features of 'hey salty' recording).
|
||||||
# Leave empty for passive mode (no detections fired).
|
# Leave empty for passive mode (no detections fired).
|
||||||
template_path: "" # e.g. "/opt/saltybot/models/hey_salty.npy"
|
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy" # Issue #393
|
||||||
|
|
||||||
n_fft: 512 # FFT size for mel spectrogram
|
n_fft: 512 # FFT size for mel spectrogram
|
||||||
n_mels: 40 # mel filterbank bands
|
n_mels: 40 # mel filterbank bands
|
||||||
|
|||||||
118
jetson/ros2_ws/src/saltybot_social/models/README.md
Normal file
118
jetson/ros2_ws/src/saltybot_social/models/README.md
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
# SaltyBot Wake Word Models
|
||||||
|
|
||||||
|
## Current Model: hey_salty.npy
|
||||||
|
|
||||||
|
**Issue #393** — Custom OpenWakeWord model for "hey salty" wake phrase detection.
|
||||||
|
|
||||||
|
### Model Details
|
||||||
|
|
||||||
|
- **File**: `hey_salty.npy`
|
||||||
|
- **Type**: Log-mel spectrogram template (numpy array)
|
||||||
|
- **Shape**: `(40, 61)` — 40 mel bands, ~61 time frames
|
||||||
|
- **Generation Method**: Synthetic speech using sine-wave approximation
|
||||||
|
- **Integration**: Used by `wake_word_node.py` via cosine similarity matching
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
The `wake_word_node` subscribes to raw PCM-16 audio at 16 kHz mono and:
|
||||||
|
|
||||||
|
1. Maintains a sliding window of the last 1.5 seconds of audio
|
||||||
|
2. Extracts log-mel spectrogram features every 100 ms
|
||||||
|
3. Compares the log-mel features to this template via cosine similarity
|
||||||
|
4. Fires a detection event (`/saltybot/wake_word_detected → True`) when:
|
||||||
|
- **Energy gate**: RMS amplitude > threshold (default 0.02)
|
||||||
|
- **Match gate**: Cosine similarity > threshold (default 0.82)
|
||||||
|
5. Applies cooldown (default 2.0 s) to prevent rapid re-fires
|
||||||
|
|
||||||
|
### Configuration (wake_word_params.yaml)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
template_path: "jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy"
|
||||||
|
energy_threshold: 0.02 # RMS gate
|
||||||
|
match_threshold: 0.82 # cosine-similarity threshold
|
||||||
|
cooldown_s: 2.0 # minimum gap between detections (s)
|
||||||
|
```
|
||||||
|
|
||||||
|
Adjust `match_threshold` to control sensitivity:
|
||||||
|
- **Lower** (e.g., 0.75) → more sensitive, higher false-positive rate
|
||||||
|
- **Higher** (e.g., 0.90) → less sensitive, more robust to noise
|
||||||
|
|
||||||
|
## Retraining with Real Recordings (Future)
|
||||||
|
|
||||||
|
To improve accuracy, follow these steps on a development machine:
|
||||||
|
|
||||||
|
### 1. Collect Training Data
|
||||||
|
|
||||||
|
Record 10–20 natural utterances of "hey salty" in varied conditions:
|
||||||
|
- Different speakers (male, female, child)
|
||||||
|
- Different background noise (quiet room, kitchen, outdoor)
|
||||||
|
- Different distances from microphone
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Using arecord (ALSA) on Jetson or Linux:
|
||||||
|
for i in {1..20}; do
|
||||||
|
echo "Recording sample $i. Say 'hey salty'..."
|
||||||
|
arecord -r 16000 -f S16_LE -c 1 "hey_salty_${i}.wav"
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Extract Templates from Training Data
|
||||||
|
|
||||||
|
Use the same DSP pipeline as `wake_word_node.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
import scipy.io.wavfile
from glob import glob
from wake_word_node import compute_log_mel
|
||||||
|
|
||||||
|
samples = []
|
||||||
|
for wav_file in glob("hey_salty_*.wav"):
|
||||||
|
sr, data = scipy.io.wavfile.read(wav_file)
|
||||||
|
# Resample to 16kHz if needed
|
||||||
|
float_data = data / 32768.0 # convert PCM-16 to [-1, 1]
|
||||||
|
log_mel = compute_log_mel(float_data, sr=16000, n_fft=512, n_mels=40)
|
||||||
|
samples.append(log_mel)
|
||||||
|
|
||||||
|
# Pad to same length, average
|
||||||
|
max_len = max(m.shape[1] for m in samples)
|
||||||
|
padded = [np.pad(m, ((0, 0), (0, max_len - m.shape[1])), mode='edge')
|
||||||
|
for m in samples]
|
||||||
|
template = np.mean(padded, axis=0).astype(np.float32)
|
||||||
|
np.save("hey_salty.npy", template)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Test and Tune
|
||||||
|
|
||||||
|
1. Replace the current template with your new one
|
||||||
|
2. Test with `wake_word_node` in real environment
|
||||||
|
3. Adjust `match_threshold` in `wake_word_params.yaml` to find the sweet spot
|
||||||
|
4. Collect false-positive and false-negative cases; add them to training set
|
||||||
|
5. Retrain
|
||||||
|
|
||||||
|
### 4. Version Control
|
||||||
|
|
||||||
|
Once satisfied, replace `models/hey_salty.npy` and commit:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy
|
||||||
|
git commit -m "refactor: hey salty template with real training data (v2)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- `generate_wake_word_template.py` — Script to synthesize and generate template
|
||||||
|
- `hey_salty.npy` — Current template (generated from synthetic speech)
|
||||||
|
- `README.md` — This file
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- `wake_word_node.py` — Wake word detection node (cosine similarity, energy gating)
|
||||||
|
- `wake_word_params.yaml` — Detection parameters
|
||||||
|
- `test_wake_word.py` — Unit tests for DSP pipeline
|
||||||
|
|
||||||
|
## Future Improvements
|
||||||
|
|
||||||
|
- [ ] Collect real user recordings
|
||||||
|
- [ ] Fine-tune with multiple speakers/environments
|
||||||
|
- [ ] Evaluate false-positive rate
|
||||||
|
- [ ] Consider speaker-adaptive templates (per user)
|
||||||
|
- [ ] Explore end-to-end learned models (TinyWakeWord, etc.)
|
||||||
BIN
jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy
Normal file
BIN
jetson/ros2_ws/src/saltybot_social/models/hey_salty.npy
Normal file
Binary file not shown.
@ -0,0 +1,200 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
generate_wake_word_template.py — Generate 'hey salty' wake word template for Issue #393.
|
||||||
|
|
||||||
|
Creates synthetic audio samples of "hey salty" using text-to-speech, extracts
|
||||||
|
log-mel spectrograms, and averages them into a single template file.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 generate_wake_word_template.py --output-dir path/to/models/
|
||||||
|
|
||||||
|
The template is saved as hey_salty.npy (log-mel [n_mels, T] array).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
import numpy as np
|
||||||
|
except ImportError:
|
||||||
|
print("ERROR: numpy not found. Install: pip install numpy")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Copy of DSP functions from wake_word_node.py ────────────────────────────────
|
||||||
|
|
||||||
|
def mel_filterbank(sr: int, n_fft: int, n_mels: int,
                   fmin: float = 80.0, fmax=None) -> np.ndarray:
    """Build a triangular mel filterbank matrix [n_mels, n_fft//2+1]."""
    import math
    if fmax is None:
        fmax = sr / 2.0

    def hz_to_mel(hz: float) -> float:
        return 2595.0 * math.log10(1.0 + hz / 700.0)

    def mel_to_hz(mel: float) -> float:
        return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)

    # n_mels + 2 points evenly spaced on the mel scale give the lower edge,
    # center, and upper edge of every triangular filter.
    edges_mel = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
    edges_hz = np.array([mel_to_hz(m) for m in edges_mel])
    freqs = np.fft.rfftfreq(n_fft, d=1.0 / sr)

    bank = np.zeros((n_mels, len(freqs)), dtype=np.float32)
    for band in range(n_mels):
        lo, center, hi = edges_hz[band], edges_hz[band + 1], edges_hz[band + 2]
        # Rising slope on [lo, center), falling slope on [center, hi].
        # Degenerate (zero-width) slopes are skipped to avoid div-by-zero.
        if center > lo:
            rising = (freqs >= lo) & (freqs < center)
            bank[band, rising] = (freqs[rising] - lo) / (center - lo)
        if hi > center:
            falling = (freqs >= center) & (freqs <= hi)
            bank[band, falling] = (hi - freqs[falling]) / (hi - center)
    return bank
|
||||||
|
|
||||||
|
|
||||||
|
def compute_log_mel(samples: np.ndarray, sr: int,
                    n_fft: int = 512, n_mels: int = 40,
                    hop: int = 256) -> np.ndarray:
    """Return log-mel spectrogram [n_mels, T] of *samples* (float32 [-1,1])."""
    total = len(samples)
    win = np.hanning(n_fft).astype(np.float32)

    def frame_power(offset: int) -> np.ndarray:
        # Window one frame and return its power spectrum; a short trailing
        # frame is zero-padded out to n_fft samples.
        frame = samples[offset:offset + n_fft]
        if len(frame) < n_fft:
            frame = np.pad(frame, (0, n_fft - len(frame)))
        return np.abs(np.fft.rfft(frame * win)) ** 2

    # max(..., 1) guarantees at least one frame even for very short input.
    offsets = range(0, max(total - n_fft + 1, 1), hop)
    spec = np.array([frame_power(o) for o in offsets], dtype=np.float32).T  # [bins, T]

    mel_energy = mel_filterbank(sr, n_fft, n_mels) @ spec  # [n_mels, T]
    # Floor tiny energies before the log to avoid -inf.
    mel_energy = np.where(mel_energy > 1e-10, mel_energy, 1e-10)
    return np.log(mel_energy)
|
||||||
|
|
||||||
|
|
||||||
|
# ── TTS + Template Generation ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def generate_synthetic_speech(text: str, num_samples: int = 5) -> list:
    """
    Generate synthetic speech samples of `text` using pyttsx3 or fallback.

    Returns list of float32 numpy arrays (mono, 16kHz).
    """
    try:
        import pyttsx3
        engine = pyttsx3.init()
        engine.setProperty('rate', 150)  # slower speech
        samples_list = []

        for i in range(num_samples):
            # Generate unique variation by adjusting pitch/rate slightly
            # NOTE(review): not every pyttsx3 driver supports a 'pitch'
            # property — unsupported drivers may silently ignore it; confirm
            # on the target TTS backend.
            pitch = 1.0 + (i * 0.05 - 0.1)  # ±10% pitch variation
            engine.setProperty('pitch', max(0.5, min(2.0, pitch)))

            # Save to temporary WAV
            wav_path = f"/tmp/hey_salty_{i}.wav"
            engine.save_to_file(text, wav_path)
            engine.runAndWait()

            # Load WAV and convert to 16kHz if needed
            try:
                import scipy.io.wavfile as wavfile
                sr, data = wavfile.read(wav_path)
                # assumes the TTS output is mono — stereo WAVs would pass
                # through as 2-D arrays here; TODO confirm.
                if sr != 16000:
                    # Simple resampling via zero-padding/decimation
                    # (np.interp performs linear interpolation between
                    # original sample points).
                    ratio = 16000.0 / sr
                    new_len = int(len(data) * ratio)
                    indices = np.linspace(0, len(data) - 1, new_len)
                    data = np.interp(indices, np.arange(len(data)), data.astype(np.float32))
                # Normalize to [-1, 1]
                if np.max(np.abs(data)) > 0:
                    data = data / (np.max(np.abs(data)) + 1e-6)
                samples_list.append(data.astype(np.float32))
            except Exception as e:
                # Per-file failures are non-fatal; remaining samples still count.
                print(f" Warning: could not load {wav_path}: {e}")

        if samples_list:
            return samples_list
        else:
            # NOTE: this escapes the enclosing try (only ImportError is
            # caught below), so an all-failures run aborts the caller.
            raise Exception("No samples generated")

    except ImportError:
        print(" pyttsx3 not available; generating synthetic sine-wave approximation...")
        # Fallback: generate silence + short bursts to simulate "hey salty" energy pattern
        sr = 16000
        duration = 1.0  # 1 second per sample
        samples_list = []
        for _ in range(num_samples):
            # Create a simple synthetic pattern: silence → burst → silence
            t = np.linspace(0, duration, int(sr * duration), dtype=np.float32)
            # Two "peaks" to mimic syllables "hey" and "salty":
            # 500 Hz tone with a Gaussian envelope at t=0.3 s, plus a
            # 400 Hz tone with a wider envelope at t=0.7 s.
            sig = np.sin(2 * np.pi * 500 * t) * (np.exp(-((t - 0.3) ** 2) / 0.01))
            sig += np.sin(2 * np.pi * 400 * t) * (np.exp(-((t - 0.7) ** 2) / 0.02))
            # Peak-normalize; +1e-6 guards against division by zero.
            sig = sig / (np.max(np.abs(sig)) + 1e-6)
            samples_list.append(sig)
        return samples_list
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Generate the 'hey salty' wake word template and save it as .npy."""
    parser = argparse.ArgumentParser(
        description="Generate 'hey salty' wake word template for wake_word_node")
    parser.add_argument("--output-dir", default="jetson/ros2_ws/src/saltybot_social/models/",
                        help="Directory to save hey_salty.npy")
    parser.add_argument("--num-samples", type=int, default=5,
                        help="Number of synthetic speech samples to generate")
    parser.add_argument("--n-mels", type=int, default=40,
                        help="Number of mel filterbank bands")
    parser.add_argument("--n-fft", type=int, default=512,
                        help="FFT size for mel spectrogram")
    args = parser.parse_args()

    # Make sure the destination directory exists before any work is done.
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"Generating {args.num_samples} synthetic 'hey salty' samples...")
    audio_samples = generate_synthetic_speech("hey salty", args.num_samples)
    if not audio_samples:
        print("ERROR: Failed to generate samples")
        sys.exit(1)
    print(f" Generated {len(audio_samples)} samples")

    # Extract log-mel features for each audio sample.
    print("Extracting log-mel spectrograms...")
    log_mels = []
    for i, audio in enumerate(audio_samples):
        features = compute_log_mel(
            audio, sr=16000,
            n_fft=args.n_fft, n_mels=args.n_mels, hop=256
        )
        log_mels.append(features)
        print(f" Sample {i}: shape {features.shape}")

    # Pad every spectrogram to the widest length, then average them
    # into a single template.
    print("Averaging spectrograms into template...")
    widest = max(m.shape[1] for m in log_mels)
    padded = [np.pad(m, ((0, 0), (0, widest - m.shape[1])), mode='edge')
              for m in log_mels]
    template = np.mean(padded, axis=0).astype(np.float32)
    print(f" Template shape: {template.shape}")

    # Persist the template next to the other models.
    output_path = out_dir / "hey_salty.npy"
    np.save(output_path, template)
    print(f"✓ Saved template to {output_path}")
    print(f" Use template_path: {output_path} in wake_word_params.yaml")


if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user