Merge pull request 'feat: Phone voice command interface (Issue #553)' (#554) from sl-android/issue-553-voice-command into main
This commit is contained in:
commit
80e3b23aec
448
phone/voice_commander.py
Normal file
448
phone/voice_commander.py
Normal file
@ -0,0 +1,448 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
voice_commander.py — Phone-based voice command interface for SaltyBot (Issue #553)
|
||||||
|
|
||||||
|
Runs on Android/Termux. Listens for the wake word 'Hey Salty', transcribes speech
|
||||||
|
via OpenAI Whisper (local), parses robot commands, and publishes to ROS2 topic
|
||||||
|
/saltybot/voice/cmd via WebSocket bridge to Jetson Orin. Confirms commands via
|
||||||
|
termux-tts-speak.
|
||||||
|
|
||||||
|
Supported commands:
|
||||||
|
go forward / go back / go left / go right
|
||||||
|
stop / halt
|
||||||
|
follow me
|
||||||
|
go home
|
||||||
|
look at me
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 phone/voice_commander.py [OPTIONS]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--host HOST Jetson IP or hostname (default: 192.168.1.100)
|
||||||
|
--port PORT rosbridge WebSocket port (default: 9090)
|
||||||
|
--model MODEL Whisper model size: tiny/base/small (default: base)
|
||||||
|
--threshold FLOAT Wake word match threshold 0.0-1.0 (default: 0.6)
|
||||||
|
--record-sec FLOAT Seconds to record after wake word (default: 3.0)
|
||||||
|
--no-tts Disable TTS confirmation
|
||||||
|
--debug Verbose logging
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# ── Optional ROS2 ────────────────────────────────────────────────────────────
|
||||||
|
try:
|
||||||
|
import rclpy
|
||||||
|
from rclpy.node import Node
|
||||||
|
from std_msgs.msg import String
|
||||||
|
ROS2_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
ROS2_AVAILABLE = False
|
||||||
|
|
||||||
|
# ── Whisper (local STT) ───────────────────────────────────────────────────────
|
||||||
|
try:
|
||||||
|
import whisper
|
||||||
|
WHISPER_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
WHISPER_AVAILABLE = False
|
||||||
|
|
||||||
|
# ── WebSocket client ──────────────────────────────────────────────────────────
|
||||||
|
try:
|
||||||
|
import websocket # websocket-client
|
||||||
|
WS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
WS_AVAILABLE = False
|
||||||
|
|
||||||
|
# ── Constants ────────────────────────────────────────────────────────────────
|
||||||
|
WAKE_WORD = "hey salty"
|
||||||
|
VOICE_CMD_TOPIC = "/saltybot/voice/cmd"
|
||||||
|
SAMPLE_RATE = 16000 # Hz required by Whisper
|
||||||
|
WAKE_RECORD_SEC = 1.5 # short clip to check for wake word
|
||||||
|
CMD_RECORD_SEC = 3.0 # command clip length after wake word
|
||||||
|
RECONNECT_DELAY = 5.0 # seconds between WS reconnects
|
||||||
|
CHUNK_BYTES = 4096
|
||||||
|
|
||||||
|
|
||||||
|
class Command(Enum):
|
||||||
|
GO_FORWARD = "go_forward"
|
||||||
|
GO_BACK = "go_back"
|
||||||
|
GO_LEFT = "go_left"
|
||||||
|
GO_RIGHT = "go_right"
|
||||||
|
STOP = "stop"
|
||||||
|
FOLLOW_ME = "follow_me"
|
||||||
|
GO_HOME = "go_home"
|
||||||
|
LOOK_AT_ME = "look_at_me"
|
||||||
|
UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Command parsing ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Each entry: (list_of_trigger_phrases, Command)
|
||||||
|
COMMAND_TABLE = [
|
||||||
|
(["go forward", "move forward", "forward", "ahead", "go straight"], Command.GO_FORWARD),
|
||||||
|
(["go back", "go backward", "move back", "reverse", "back up"], Command.GO_BACK),
|
||||||
|
(["go left", "turn left", "move left", "left"], Command.GO_LEFT),
|
||||||
|
(["go right", "turn right", "move right", "right"], Command.GO_RIGHT),
|
||||||
|
(["stop", "halt", "freeze", "stay", "stand by"], Command.STOP),
|
||||||
|
(["follow me", "come here", "come with me", "follow"], Command.FOLLOW_ME),
|
||||||
|
(["go home", "return home", "return to base", "dock"], Command.GO_HOME),
|
||||||
|
(["look at me", "face me", "look here", "turn to me"], Command.LOOK_AT_ME),
|
||||||
|
]
|
||||||
|
|
||||||
|
TTS_CONFIRMATIONS = {
|
||||||
|
Command.GO_FORWARD: "Going forward",
|
||||||
|
Command.GO_BACK: "Going back",
|
||||||
|
Command.GO_LEFT: "Turning left",
|
||||||
|
Command.GO_RIGHT: "Turning right",
|
||||||
|
Command.STOP: "Stopping",
|
||||||
|
Command.FOLLOW_ME: "Following you",
|
||||||
|
Command.GO_HOME: "Heading home",
|
||||||
|
Command.LOOK_AT_ME: "Looking at you",
|
||||||
|
Command.UNKNOWN: "Sorry, I didn't understand that",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_command(text: str) -> Command:
|
||||||
|
"""Match transcribed text against command table. Returns best match or UNKNOWN."""
|
||||||
|
text = text.lower().strip()
|
||||||
|
for phrases, cmd in COMMAND_TABLE:
|
||||||
|
for phrase in phrases:
|
||||||
|
if phrase in text:
|
||||||
|
return cmd
|
||||||
|
return Command.UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def contains_wake_word(text: str, threshold: float = 0.6) -> bool:
|
||||||
|
"""Check if transcribed text contains the wake word (fuzzy match)."""
|
||||||
|
text = text.lower().strip()
|
||||||
|
if WAKE_WORD in text:
|
||||||
|
return True
|
||||||
|
# Simple token overlap fallback
|
||||||
|
wake_tokens = set(WAKE_WORD.split())
|
||||||
|
text_tokens = set(text.split())
|
||||||
|
overlap = len(wake_tokens & text_tokens) / len(wake_tokens)
|
||||||
|
return overlap >= threshold
|
||||||
|
|
||||||
|
|
||||||
|
# ── Audio capture via termux-microphone-record ────────────────────────────────
|
||||||
|
|
||||||
|
def record_audio(duration_sec: float, output_path: str) -> bool:
|
||||||
|
"""
|
||||||
|
Record audio using termux-microphone-record.
|
||||||
|
Saves a 16 kHz mono WAV to output_path.
|
||||||
|
Returns True on success.
|
||||||
|
"""
|
||||||
|
# termux-microphone-record writes to a file; we start, wait, then stop.
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
[
|
||||||
|
"termux-microphone-record",
|
||||||
|
"-l", str(int(duration_sec)), # duration in seconds
|
||||||
|
"-r", str(SAMPLE_RATE), # sample rate
|
||||||
|
"-c", "1", # mono
|
||||||
|
"-e", "aac", # encoding (aac is reliable on Android)
|
||||||
|
"-f", output_path,
|
||||||
|
],
|
||||||
|
check=True,
|
||||||
|
timeout=duration_sec + 5,
|
||||||
|
capture_output=True,
|
||||||
|
)
|
||||||
|
return Path(output_path).exists()
|
||||||
|
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError) as e:
|
||||||
|
logging.debug("termux-microphone-record error: %s", e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def tts_speak(text: str) -> None:
|
||||||
|
"""Speak text via termux-tts-speak (non-blocking)."""
|
||||||
|
try:
|
||||||
|
subprocess.Popen(
|
||||||
|
["termux-tts-speak", text],
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
)
|
||||||
|
except FileNotFoundError:
|
||||||
|
logging.debug("termux-tts-speak not available")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Whisper STT ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class WhisperSTT:
|
||||||
|
"""Thin wrapper around local Whisper model."""
|
||||||
|
|
||||||
|
def __init__(self, model_size: str = "base"):
|
||||||
|
if not WHISPER_AVAILABLE:
|
||||||
|
raise RuntimeError(
|
||||||
|
"openai-whisper not installed. Run: pip install openai-whisper"
|
||||||
|
)
|
||||||
|
logging.info("Loading Whisper model '%s'...", model_size)
|
||||||
|
self.model = whisper.load_model(model_size)
|
||||||
|
logging.info("Whisper model loaded.")
|
||||||
|
|
||||||
|
def transcribe(self, audio_path: str) -> str:
|
||||||
|
"""Transcribe audio file, return lowercase text."""
|
||||||
|
try:
|
||||||
|
result = self.model.transcribe(audio_path, language="en", fp16=False)
|
||||||
|
text = result.get("text", "").strip()
|
||||||
|
logging.debug("Whisper transcription: '%s'", text)
|
||||||
|
return text
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning("Whisper transcription failed: %s", e)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Publisher backends ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class ROS2Publisher:
|
||||||
|
"""Publish voice commands as std_msgs/String on /saltybot/voice/cmd."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
rclpy.init()
|
||||||
|
self._node = Node("voice_commander")
|
||||||
|
self._pub = self._node.create_publisher(String, VOICE_CMD_TOPIC, 10)
|
||||||
|
self._spin_thread = threading.Thread(
|
||||||
|
target=lambda: rclpy.spin(self._node), daemon=True
|
||||||
|
)
|
||||||
|
self._spin_thread.start()
|
||||||
|
|
||||||
|
def publish(self, cmd: Command, raw_text: str) -> None:
|
||||||
|
payload = json.dumps({"command": cmd.value, "raw": raw_text, "ts": time.time()})
|
||||||
|
msg = String()
|
||||||
|
msg.data = payload
|
||||||
|
self._pub.publish(msg)
|
||||||
|
logging.info("ROS2 published: %s", payload)
|
||||||
|
|
||||||
|
def shutdown(self) -> None:
|
||||||
|
self._node.destroy_node()
|
||||||
|
rclpy.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
class WebSocketPublisher:
|
||||||
|
"""Publish voice commands over rosbridge WebSocket protocol."""
|
||||||
|
|
||||||
|
def __init__(self, host: str, port: int):
|
||||||
|
self.uri = f"ws://{host}:{port}"
|
||||||
|
self._ws: Optional[websocket.WebSocket] = None
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
self._connect()
|
||||||
|
|
||||||
|
def _connect(self) -> None:
|
||||||
|
try:
|
||||||
|
ws = websocket.WebSocket()
|
||||||
|
ws.connect(self.uri, timeout=5)
|
||||||
|
with self._lock:
|
||||||
|
self._ws = ws
|
||||||
|
logging.info("WebSocket connected to %s", self.uri)
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning("WebSocket connect failed (%s): %s", self.uri, e)
|
||||||
|
self._ws = None
|
||||||
|
|
||||||
|
def _ensure_connected(self) -> bool:
|
||||||
|
if self._ws is not None:
|
||||||
|
return True
|
||||||
|
self._connect()
|
||||||
|
return self._ws is not None
|
||||||
|
|
||||||
|
def publish(self, cmd: Command, raw_text: str) -> None:
|
||||||
|
payload = json.dumps({"command": cmd.value, "raw": raw_text, "ts": time.time()})
|
||||||
|
# rosbridge advertise + publish message
|
||||||
|
advertise_msg = json.dumps({
|
||||||
|
"op": "advertise",
|
||||||
|
"topic": VOICE_CMD_TOPIC,
|
||||||
|
"type": "std_msgs/String",
|
||||||
|
})
|
||||||
|
publish_msg = json.dumps({
|
||||||
|
"op": "publish",
|
||||||
|
"topic": VOICE_CMD_TOPIC,
|
||||||
|
"msg": {"data": payload},
|
||||||
|
})
|
||||||
|
with self._lock:
|
||||||
|
if not self._ensure_connected():
|
||||||
|
logging.error("Cannot publish — WebSocket not connected.")
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
self._ws.send(advertise_msg)
|
||||||
|
self._ws.send(publish_msg)
|
||||||
|
logging.info("WS published: %s", payload)
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning("WebSocket send failed: %s", e)
|
||||||
|
self._ws = None
|
||||||
|
|
||||||
|
def shutdown(self) -> None:
|
||||||
|
with self._lock:
|
||||||
|
if self._ws:
|
||||||
|
try:
|
||||||
|
self._ws.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self._ws = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main listener loop ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class VoiceCommanderConfig:
|
||||||
|
host: str = "192.168.1.100"
|
||||||
|
port: int = 9090
|
||||||
|
model: str = "base"
|
||||||
|
wake_threshold: float = 0.6
|
||||||
|
record_sec: float = CMD_RECORD_SEC
|
||||||
|
no_tts: bool = False
|
||||||
|
debug: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceCommander:
|
||||||
|
"""
|
||||||
|
Main voice command loop.
|
||||||
|
|
||||||
|
State machine:
|
||||||
|
IDLE → record short clip → check for wake word
|
||||||
|
WAKE_DETECTED → record command clip → transcribe → parse → publish → confirm
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: VoiceCommanderConfig):
|
||||||
|
self.cfg = config
|
||||||
|
self._running = False
|
||||||
|
|
||||||
|
# STT
|
||||||
|
self._stt = WhisperSTT(model_size=config.model)
|
||||||
|
|
||||||
|
# Publisher
|
||||||
|
if ROS2_AVAILABLE:
|
||||||
|
logging.info("Using ROS2 publisher backend.")
|
||||||
|
self._pub: ROS2Publisher | WebSocketPublisher = ROS2Publisher()
|
||||||
|
elif WS_AVAILABLE:
|
||||||
|
logging.info("Using WebSocket publisher backend (%s:%d).", config.host, config.port)
|
||||||
|
self._pub = WebSocketPublisher(config.host, config.port)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
"No publisher backend available. "
|
||||||
|
"Install rclpy (ROS2) or websocket-client: pip install websocket-client"
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── lifecycle ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def start(self) -> None:
|
||||||
|
self._running = True
|
||||||
|
logging.info("Voice commander started. Listening for '%s'...", WAKE_WORD)
|
||||||
|
if not self.cfg.no_tts:
|
||||||
|
tts_speak("Hey Salty is ready")
|
||||||
|
try:
|
||||||
|
self._listen_loop()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logging.info("Interrupted.")
|
||||||
|
finally:
|
||||||
|
self.stop()
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._running = False
|
||||||
|
self._pub.shutdown()
|
||||||
|
logging.info("Voice commander stopped.")
|
||||||
|
|
||||||
|
# ── main loop ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _listen_loop(self) -> None:
|
||||||
|
"""Continuously poll for wake word then capture command."""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
wake_audio = os.path.join(tmpdir, "wake.aac")
|
||||||
|
cmd_audio = os.path.join(tmpdir, "cmd.aac")
|
||||||
|
|
||||||
|
while self._running:
|
||||||
|
# 1. Record short clip for wake word detection
|
||||||
|
logging.debug("Recording %.1fs for wake word...", WAKE_RECORD_SEC)
|
||||||
|
if not record_audio(WAKE_RECORD_SEC, wake_audio):
|
||||||
|
logging.debug("Wake clip recording failed, retrying.")
|
||||||
|
time.sleep(0.5)
|
||||||
|
continue
|
||||||
|
|
||||||
|
wake_text = self._stt.transcribe(wake_audio)
|
||||||
|
if not wake_text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not contains_wake_word(wake_text, self.cfg.wake_threshold):
|
||||||
|
logging.debug("No wake word in: '%s'", wake_text)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 2. Wake word detected — acknowledge and record command
|
||||||
|
logging.info("Wake word detected! Recording command...")
|
||||||
|
if not self.cfg.no_tts:
|
||||||
|
tts_speak("Yes?")
|
||||||
|
|
||||||
|
if not record_audio(self.cfg.record_sec, cmd_audio):
|
||||||
|
logging.warning("Command clip recording failed.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
cmd_text = self._stt.transcribe(cmd_audio)
|
||||||
|
if not cmd_text:
|
||||||
|
logging.info("No speech detected after wake word.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 3. Parse and dispatch
|
||||||
|
cmd = parse_command(cmd_text)
|
||||||
|
logging.info("Parsed command: %s (from '%s')", cmd.value, cmd_text)
|
||||||
|
|
||||||
|
self._pub.publish(cmd, cmd_text)
|
||||||
|
|
||||||
|
# 4. TTS confirmation
|
||||||
|
if not self.cfg.no_tts:
|
||||||
|
tts_speak(TTS_CONFIRMATIONS[cmd])
|
||||||
|
|
||||||
|
|
||||||
|
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="SaltyBot phone voice command interface (Issue #553)"
|
||||||
|
)
|
||||||
|
parser.add_argument("--host", default="192.168.1.100",
|
||||||
|
help="Jetson IP/hostname (default: 192.168.1.100)")
|
||||||
|
parser.add_argument("--port", type=int, default=9090,
|
||||||
|
help="rosbridge WebSocket port (default: 9090)")
|
||||||
|
parser.add_argument("--model", default="base",
|
||||||
|
choices=["tiny", "base", "small"],
|
||||||
|
help="Whisper model size (default: base)")
|
||||||
|
parser.add_argument("--threshold", type=float, default=0.6,
|
||||||
|
help="Wake word match threshold 0.0-1.0 (default: 0.6)")
|
||||||
|
parser.add_argument("--record-sec", type=float, default=CMD_RECORD_SEC,
|
||||||
|
help=f"Seconds to record command (default: {CMD_RECORD_SEC})")
|
||||||
|
parser.add_argument("--no-tts", action="store_true",
|
||||||
|
help="Disable TTS confirmation")
|
||||||
|
parser.add_argument("--debug", action="store_true",
|
||||||
|
help="Verbose logging")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.DEBUG if args.debug else logging.INFO,
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
|
)
|
||||||
|
|
||||||
|
if not WHISPER_AVAILABLE:
|
||||||
|
logging.error("openai-whisper not installed. Run: pip install openai-whisper")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
cfg = VoiceCommanderConfig(
|
||||||
|
host=args.host,
|
||||||
|
port=args.port,
|
||||||
|
model=args.model,
|
||||||
|
wake_threshold=args.threshold,
|
||||||
|
record_sec=args.record_sec,
|
||||||
|
no_tts=args.no_tts,
|
||||||
|
debug=args.debug,
|
||||||
|
)
|
||||||
|
|
||||||
|
VoiceCommander(cfg).start()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
x
Reference in New Issue
Block a user