diff --git a/jetson/ros2_ws/src/saltybot_audio_pipeline/config/audio_pipeline_params.yaml b/jetson/ros2_ws/src/saltybot_audio_pipeline/config/audio_pipeline_params.yaml index ad72268..76b1b7f 100644 --- a/jetson/ros2_ws/src/saltybot_audio_pipeline/config/audio_pipeline_params.yaml +++ b/jetson/ros2_ws/src/saltybot_audio_pipeline/config/audio_pipeline_params.yaml @@ -7,9 +7,11 @@ audio_pipeline_node: wake_word_model: "hey_salty" wake_word_threshold: 0.5 wake_word_timeout_s: 8.0 + stt_backend: "whisper" whisper_model: "small" whisper_compute_type: "float16" whisper_language: "" + vosk_model_path: "/models/vosk/vosk-model-en-us-0.22" tts_voice_path: "/models/piper/en_US-lessac-medium.onnx" tts_sample_rate: 22050 mqtt_enabled: true diff --git a/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_pipeline_node.py b/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_pipeline_node.py index 0cc9cdc..a4f00a4 100644 --- a/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_pipeline_node.py +++ b/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_pipeline_node.py @@ -12,7 +12,7 @@ from rclpy.node import Node from rclpy.qos import QoSProfile from std_msgs.msg import String -from .audio_utils import EnergyVAD, UtteranceSegmenter, AudioBuffer, pcm16_to_float32, float32_to_pcm16, resample_audio +from .audio_utils import EnergyVAD, UtteranceSegmenter, AudioBuffer, VoskSTT, pcm16_to_float32, float32_to_pcm16, resample_audio try: import paho.mqtt.client as mqtt @@ -150,9 +150,11 @@ class AudioPipelineNode(Node): ("wake_word_model", "hey_salty"), ("wake_word_threshold", 0.5), ("wake_word_timeout_s", 8.0), + ("stt_backend", "whisper"), ("whisper_model", "small"), ("whisper_compute_type", "float16"), ("whisper_language", ""), + ("vosk_model_path", "/models/vosk/vosk-model-en-us-0.22"), ("tts_voice_path", "/models/piper/en_US-lessac-medium.onnx"), ("tts_sample_rate", 22050), ("mqtt_enabled", True), @@ -168,9 +170,11 @@ class AudioPipelineNode(Node): self._chunk_size = self.get_parameter("chunk_size").value self._ww_model = self.get_parameter("wake_word_model").value self._ww_thresh = self.get_parameter("wake_word_threshold").value + self._stt_backend = self.get_parameter("stt_backend").value self._whisper_model = self.get_parameter("whisper_model").value self._compute_type = self.get_parameter("whisper_compute_type").value self._whisper_lang = self.get_parameter("whisper_language").value or None + self._vosk_model_path = self.get_parameter("vosk_model_path").value self._tts_voice_path = self.get_parameter("tts_voice_path").value self._tts_rate = self.get_parameter("tts_sample_rate").value mqtt_enabled = self.get_parameter("mqtt_enabled").value @@ -191,6 +195,7 @@ class AudioPipelineNode(Node): self._jabra = JabraAudioDevice(device_name, device_idx) self._oww = None self._whisper = None + self._vosk = None self._tts_voice = None self._mqtt = None @@ -223,16 +228,36 @@ class AudioPipelineNode(Node): except Exception as e: self.get_logger().warn(f"openwakeword failed: {e}") - try: - from faster_whisper import WhisperModel - self._whisper = WhisperModel(self._whisper_model, device="cuda", - compute_type=self._compute_type, download_root="/models") - self.get_logger().info(f"Whisper '{self._whisper_model}' loaded") - except Exception as e: - self.get_logger().error(f"Whisper failed: {e}") - self._set_state(AudioState.ERROR) - self._metrics.error_msg = f"Whisper init: {e}" - return + if self._stt_backend == "vosk": + self._vosk = 
VoskSTT(self._vosk_model_path, self._sample_rate) + if self._vosk.load(): + self.get_logger().info(f"Vosk STT loaded from '{self._vosk_model_path}'") + else: + self.get_logger().warn("Vosk load failed -- falling back to Whisper") + self._vosk = None + self._stt_backend = "whisper" + + if self._stt_backend != "vosk" or self._vosk is None: + try: + from faster_whisper import WhisperModel + device = "cuda" + try: + import torch + if not torch.cuda.is_available(): + device = "cpu" + self._compute_type = "int8" + except ImportError: + device = "cpu" + self._compute_type = "int8" + self._whisper = WhisperModel(self._whisper_model, device=device, + compute_type=self._compute_type, download_root="/models") + self._stt_backend = "whisper" + self.get_logger().info(f"Whisper '{self._whisper_model}' loaded on {device}") + except Exception as e: + self.get_logger().error(f"Whisper failed: {e}") + self._set_state(AudioState.ERROR) + self._metrics.error_msg = f"Whisper init: {e}" + return try: from piper import PiperVoice @@ -279,7 +304,7 @@ class AudioPipelineNode(Node): args=(utt_samples, duration), daemon=True).start() def _process_utterance(self, audio_samples: list, duration: float) -> None: - if self._whisper is None: + if self._whisper is None and self._vosk is None: self._set_state(AudioState.LISTENING) return self._set_state(AudioState.PROCESSING) @@ -287,8 +312,11 @@ class AudioPipelineNode(Node): try: import numpy as np audio_np = np.array(audio_samples, dtype=np.float32) if isinstance(audio_samples, list) else audio_samples.astype(np.float32) - segments_gen, info = self._whisper.transcribe(audio_np, language=self._whisper_lang, beam_size=3, vad_filter=False) - text = " ".join([seg.text.strip() for seg in segments_gen]).strip() + if self._stt_backend == "vosk" and self._vosk is not None: + text = self._vosk.transcribe(audio_np) + else: + segments_gen, _ = self._whisper.transcribe(audio_np, language=self._whisper_lang, beam_size=3, vad_filter=False) + text = " ".join([seg.text.strip() for seg in segments_gen]).strip() if text: stt_time = (time.time() - t0) * 1000 self._metrics.stt_processing_ms = stt_time @@ -297,7 +325,7 @@ class AudioPipelineNode(Node): msg = String() msg.data = text self._text_pub.publish(msg) - self.get_logger().info(f"STT [{duration:.1f}s, {stt_time:.0f}ms]: '{text}'") + self.get_logger().info(f"STT/{self._stt_backend} [{duration:.1f}s, {stt_time:.0f}ms]: '{text}'") self._process_tts(text) else: self._set_state(AudioState.LISTENING) diff --git a/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_utils.py b/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_utils.py index b00f6ba..1ff9956 100644 --- a/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_utils.py +++ b/jetson/ros2_ws/src/saltybot_audio_pipeline/saltybot_audio_pipeline/audio_utils.py @@ -131,3 +131,43 @@ def resample_audio(samples: np.ndarray, orig_rate: int, target_rate: int) -> np. 
 def calculate_rms_db(samples: np.ndarray) -> float: rms = np.sqrt(np.mean(samples ** 2)) return 20 * np.log10(rms + 1e-10) + + +class VoskSTT: + """Offline STT backend using Vosk (low-latency alternative to Whisper; Issue #503).""" + + def __init__(self, model_path: str = "/models/vosk/vosk-model-en-us-0.22", + sample_rate: int = 16000): + self.model_path = model_path + self.sample_rate = sample_rate + self._recognizer = None + self._model = None + self._json = None + + def load(self) -> bool: + try: + from vosk import Model, KaldiRecognizer + import json as _json + self._json = _json + self._model = Model(self.model_path) + self._recognizer = KaldiRecognizer(self._model, self.sample_rate) + self._recognizer.SetWords(True)  # include per-word detail in recognition results + return True + except Exception as e: + print(f"VoskSTT load failed: {e}") + return False + + def transcribe(self, audio_samples) -> str: + if self._recognizer is None: + return "" + try: + import numpy as np + audio_samples = np.asarray(audio_samples, dtype=np.float32)  # accepts list or ndarray + pcm = float32_to_pcm16(audio_samples) + self._recognizer.AcceptWaveform(pcm) + result = self._json.loads(self._recognizer.FinalResult()) + self._recognizer.Reset()  # FinalResult() ends the stream; reset so the recognizer can be reused per utterance + return result.get("text", "").strip() + except Exception as e: + print(f"VoskSTT transcribe error: {e}") + return "" diff --git a/jetson/ros2_ws/src/saltybot_audio_pipeline/test/test_audio_pipeline.py b/jetson/ros2_ws/src/saltybot_audio_pipeline/test/test_audio_pipeline.py new file mode 100644 index 0000000..7f697e7 --- /dev/null +++ b/jetson/ros2_ws/src/saltybot_audio_pipeline/test/test_audio_pipeline.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +"""Unit tests for saltybot_audio_pipeline (Issue #503).""" + +import unittest +import numpy as np +import threading + + +class TestEnergyVAD(unittest.TestCase): + def setUp(self): + from saltybot_audio_pipeline.audio_utils import EnergyVAD + self.vad = EnergyVAD(threshold_db=-35.0) + + def test_silence_is_not_speech(self): + self.assertFalse(self.vad.is_speech(np.zeros(512, dtype=np.float32))) + + def test_loud_signal_is_speech(self): + self.assertTrue(self.vad.is_speech(np.ones(512, dtype=np.float32) * 0.8)) + + def test_rms_db_silence(self): + self.assertLess(self.vad.rms_db(np.zeros(512, dtype=np.float32)), -100.0) + + def test_rms_db_full_scale(self): + self.assertAlmostEqual(self.vad.rms_db(np.ones(512, dtype=np.float32)), 0.0, delta=1.0) + + def test_custom_threshold(self): + from saltybot_audio_pipeline.audio_utils import EnergyVAD + vad_strict = EnergyVAD(threshold_db=-10.0) + mid = np.ones(512, dtype=np.float32) * 0.1 + self.assertFalse(vad_strict.is_speech(mid)) + self.assertTrue(self.vad.is_speech(mid)) + + +class TestPCMConversion(unittest.TestCase): + def setUp(self): + from saltybot_audio_pipeline.audio_utils import pcm16_to_float32, float32_to_pcm16 + self.to_f32 = pcm16_to_float32 + self.to_pcm = float32_to_pcm16 + + def test_roundtrip(self): + original = np.array([0.0, 0.5, -0.5, 1.0, -1.0], dtype=np.float32) + restored = self.to_f32(self.to_pcm(original)) + np.testing.assert_allclose(original, restored, atol=1.5e-4) + + def test_zero_samples(self): + zeros = np.zeros(100, dtype=np.float32) + np.testing.assert_allclose(self.to_f32(self.to_pcm(zeros)), 0.0, atol=1e-4) + + def test_clipping(self): + overflow = np.array([2.0, -2.0], dtype=np.float32) + restored = self.to_f32(self.to_pcm(overflow)) + self.assertTrue(all(abs(v) <= 1.0 + 1e-4 for v in restored)) + + def test_list_input(self): + self.assertIsInstance(self.to_pcm([0.1, -0.1, 0.5]), bytes) + + def 
test_pcm_length(self): + self.assertEqual(len(self.to_pcm(np.zeros(256, dtype=np.float32))), 256 * 2) + + +class TestAudioBuffer(unittest.TestCase): + def setUp(self): + from saltybot_audio_pipeline.audio_utils import AudioBuffer + self.buf = AudioBuffer(capacity_s=1.0, sample_rate=16000) + + def test_push_and_size(self): + self.buf.push(np.zeros(512, dtype=np.float32)) + self.assertEqual(self.buf.size(), 512) + + def test_extract_all(self): + samples = np.ones(512, dtype=np.float32) * 0.5 + self.buf.push(samples) + np.testing.assert_allclose(self.buf.extract(), 0.5) + + def test_extract_duration(self): + self.buf.push(np.zeros(16000, dtype=np.float32)) + self.assertEqual(len(self.buf.extract(duration_s=0.5)), 8000) + + def test_clear(self): + self.buf.push(np.zeros(512, dtype=np.float32)) + self.buf.clear() + self.assertEqual(self.buf.size(), 0) + + def test_capacity_limit(self): + self.buf.push(np.zeros(32000, dtype=np.float32)) + self.assertLessEqual(self.buf.size(), 16000) + + def test_thread_safety(self): + errors = [] + def push_loop(): + for _ in range(50): + try: + self.buf.push(np.zeros(100, dtype=np.float32)) + except Exception as e: + errors.append(e) + threads = [threading.Thread(target=push_loop) for _ in range(5)] + for t in threads: t.start() + for t in threads: t.join() + self.assertEqual(errors, []) + + +class TestUtteranceSegmenter(unittest.TestCase): + def setUp(self): + from saltybot_audio_pipeline.audio_utils import EnergyVAD, UtteranceSegmenter + vad = EnergyVAD(threshold_db=-35.0) + self.seg = UtteranceSegmenter(vad, silence_duration_s=0.1, min_duration_s=0.05, sample_rate=16000) + + def test_silence_produces_no_utterance(self): + self.assertEqual(self.seg.push(np.zeros(1600, dtype=np.float32)), []) + + def test_speech_then_silence_produces_utterance(self): + self.seg.push(np.ones(3200, dtype=np.float32) * 0.5) + completed = self.seg.push(np.zeros(4000, dtype=np.float32)) + self.assertEqual(len(completed), 1) + _, duration = completed[0] + self.assertGreater(duration, 0.05) + + def test_reset_clears_buffer(self): + self.seg.push(np.ones(3200, dtype=np.float32) * 0.5) + self.seg.reset() + self.assertEqual(self.seg.push(np.zeros(4000, dtype=np.float32)), []) + + +class TestResampleAudio(unittest.TestCase): + def test_same_rate_is_identity(self): + from saltybot_audio_pipeline.audio_utils import resample_audio + s = np.ones(1000, dtype=np.float32) * 0.5 + np.testing.assert_array_equal(resample_audio(s, 16000, 16000), s) + + def test_upsample_increases_length(self): + from saltybot_audio_pipeline.audio_utils import resample_audio + self.assertGreater(len(resample_audio(np.ones(1000), 16000, 22050)), 1000) + + def test_downsample_decreases_length(self): + from saltybot_audio_pipeline.audio_utils import resample_audio + self.assertLess(len(resample_audio(np.ones(22050), 22050, 16000)), 22050) + + +class TestVoskSTT(unittest.TestCase): + def test_init_bad_path_load_returns_false(self): + from saltybot_audio_pipeline.audio_utils import VoskSTT + vosk = VoskSTT(model_path="/nonexistent/path", sample_rate=16000) + self.assertFalse(vosk.load()) + + def test_transcribe_without_load_returns_empty(self): + from saltybot_audio_pipeline.audio_utils import VoskSTT + vosk = VoskSTT(model_path="/nonexistent/path", sample_rate=16000) + self.assertEqual(vosk.transcribe(np.zeros(1600, dtype=np.float32)), "") + + +class TestJabraAudioDevice(unittest.TestCase): + def test_read_without_open_returns_none(self): + from saltybot_audio_pipeline.audio_pipeline_node import JabraAudioDevice + 
self.assertIsNone(JabraAudioDevice().read_chunk(512)) + + def test_write_without_open_returns_false(self): + from saltybot_audio_pipeline.audio_pipeline_node import JabraAudioDevice + self.assertFalse(JabraAudioDevice().write_chunk(b"\x00" * 1024)) + + def test_close_without_open_is_safe(self): + from saltybot_audio_pipeline.audio_pipeline_node import JabraAudioDevice + JabraAudioDevice().close()  # Must not raise + + +class TestAudioMetrics(unittest.TestCase): + def test_default_values(self): + from saltybot_audio_pipeline.audio_pipeline_node import AudioMetrics + m = AudioMetrics() + self.assertEqual(m.wake_to_stt_ms, 0.0) + self.assertEqual(m.transcribed_text, "") + self.assertEqual(m.speaker_id, "unknown") + + def test_asdict(self): + from dataclasses import asdict + from saltybot_audio_pipeline.audio_pipeline_node import AudioMetrics + d = asdict(AudioMetrics(transcribed_text="hello")) + self.assertEqual(d["transcribed_text"], "hello") + self.assertIn("wake_to_stt_ms", d) + + +class TestAudioState(unittest.TestCase): + def test_state_values(self): + from saltybot_audio_pipeline.audio_pipeline_node import AudioState + self.assertEqual(AudioState.IDLE.value, "idle") + self.assertEqual(AudioState.LISTENING.value, "listening") + self.assertEqual(AudioState.WAKE_WORD_DETECTED.value, "wake_detected") + self.assertEqual(AudioState.PROCESSING.value, "processing") + self.assertEqual(AudioState.SPEAKING.value, "speaking") + self.assertEqual(AudioState.ERROR.value, "error") + + def test_all_six_states(self): + from saltybot_audio_pipeline.audio_pipeline_node import AudioState + self.assertEqual(len(list(AudioState)), 6) + + +if __name__ == "__main__": + unittest.main() diff --git a/jetson/ros2_ws/src/saltybot_bringup/launch/full_stack.launch.py b/jetson/ros2_ws/src/saltybot_bringup/launch/full_stack.launch.py index 038c6d5..19a98b5 100644 --- a/jetson/ros2_ws/src/saltybot_bringup/launch/full_stack.launch.py +++ b/jetson/ros2_ws/src/saltybot_bringup/launch/full_stack.launch.py @@ -62,6 +62,7 @@ Launch sequence (wall-clock delays — conservative for cold start) t= 2s sensors (RPLIDAR + RealSense) t= 4s UWB driver (independent serial device) t= 4s CSI cameras (optional, independent) + t= 5s audio_pipeline (Jabra SPEAK 810: wake word + STT + TTS; Issue #503) t= 6s SLAM / outdoor nav (needs sensors; indoor/outdoor mode only) t= 6s person detection (needs RealSense up) t=14s Nav2 (needs SLAM to have partial map; indoor only) @@ -229,6 +230,19 @@ enable_mission_logging_arg = DeclareLaunchArgument( description="Launch autonomous docking behavior (auto-dock at 20% battery, Issue #489)", ) + enable_audio_pipeline_arg = DeclareLaunchArgument( + "enable_audio_pipeline", + default_value="true", + description="Audio pipeline: Jabra SPEAK 810 + wake word + STT + TTS (Issue #503)", + ) + + audio_stt_backend_arg = DeclareLaunchArgument( + "audio_stt_backend", + default_value="whisper", + choices=["whisper", "vosk"], + description="STT backend: whisper (CUDA) or vosk (CPU, low-latency)", + ) + follow_distance_arg = DeclareLaunchArgument( "follow_distance", default_value="1.5", @@ -379,6 +393,25 @@ enable_mission_logging_arg = DeclareLaunchArgument( ) + # ── t=5s audio pipeline (Jabra SPEAK 810: wake word + STT + TTS) ───────── + audio_pipeline = TimerAction( + period=5.0, + actions=[ + GroupAction( + condition=IfCondition(LaunchConfiguration("enable_audio_pipeline")), + actions=[ + LogInfo(msg="[full_stack] Starting audio pipeline (Jabra SPEAK 810)"), + IncludeLaunchDescription( + _launch("saltybot_audio_pipeline", "launch", "audio_pipeline.launch.py"), + # Wire the declared argument through to the included launch file + # (assumes audio_pipeline.launch.py declares an `stt_backend` argument). + launch_arguments={"stt_backend": LaunchConfiguration("audio_stt_backend")}.items(), + ), + ], + ), + ], + ) + # ── t=6s SLAM — RTAB-Map (indoor only; needs sensors up for ~4s) ───────── slam = TimerAction( period=6.0, actions=[ @@ -542,6 +575,8 @@ enable_mission_logging_arg = DeclareLaunchArgument( enable_rosbridge_arg, enable_mission_logging_arg, enable_docking_arg, + enable_audio_pipeline_arg, + audio_stt_backend_arg, follow_distance_arg, max_linear_vel_arg, uwb_port_a_arg, @@ -569,6 +604,9 @@ enable_mission_logging_arg, csi_cameras, uwb_driver, + # t=5s + audio_pipeline, + # t=6s slam, outdoor_nav,