- Dockerfile.social: social-bot container with faster-whisper, llama-cpp-python (CUDA), piper-tts, insightface, pyannote.audio, OpenWakeWord, pyaudio - scripts/convert_models.sh: TRT FP16 conversion for SCRFD-10GF, ArcFace-R100, ECAPA-TDNN; CTranslate2 setup for Whisper; Piper voice download; benchmark suite - config/asound.conf: ALSA USB mic (card1) + USB speaker (card2) config - models/README.md: version-pinned model table, /models/ layout, perf targets - systemd/: saltybot-social.service + saltybot.target + install_systemd.sh - docker-compose.yml: saltybot-social service with GPU, audio device passthrough, NVMe volume mounts for /models and /social_db Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
340 lines
12 KiB
Bash
340 lines
12 KiB
Bash
#!/usr/bin/env bash
# convert_models.sh — Convert social-bot ONNX models to TensorRT FP16 engines
#
# Models converted:
#   SCRFD-10GF    — face detection (InsightFace)
#   ArcFace-R100  — face recognition (InsightFace)
#   Whisper-small — STT via faster-whisper (handled by CTranslate2, not TRT)
#   ECAPA-TDNN    — speaker embedding (SpeechBrain export)
#   Piper TTS     — neural TTS (ONNX native, no TRT needed)
#
# Usage: ./scripts/convert_models.sh [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]
#   (fixed: the old comment advertised a "--model" flag, but the script takes a
#    bare positional target, and supports more targets than it listed)
# Requirements: TensorRT 8.6+, trtexec in PATH, ONNX models in /models/onnx/
#
# Note: Whisper uses faster-whisper's CTranslate2 which handles its own GPU
# optimization. Piper uses ONNX Runtime directly (already optimized).
#
set -euo pipefail

# Paths are overridable from the environment for non-default layouts.
MODEL_DIR="${MODEL_DIR:-/models}"
ONNX_DIR="${MODEL_DIR}/onnx"
ENGINE_DIR="${MODEL_DIR}/engines"
# trtexec workspace pool in MiB; override for GPUs with less free memory.
WORKSPACE_MB="${WORKSPACE_MB:-4096}"
TARGET="${1:-all}"

mkdir -p "${ENGINE_DIR}"
|
|
|
|
# Timestamped progress line on stdout: "[HH:MM:SS] message".
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"; }
# Print an error message to stderr and abort the whole script with status 1.
err() {
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}
|
# Fail fast (via err) when the TensorRT CLI is not on PATH.
check_trtexec() {
  if ! command -v trtexec >/dev/null 2>&1; then
    err "trtexec not found. Install TensorRT or add to PATH."
  fi
}
|
# ── Download ONNX sources ──────────────────────────────────────────────────────
# Fetch or export the three ONNX models that get TRT-converted below.
# Idempotent: each model is skipped when its target file already exists.
download_models() {
  log "Downloading ONNX model sources..."
  # BUGFIX: only ENGINE_DIR is created at script start; everything below
  # writes into ONNX_DIR, so make sure it exists first.
  mkdir -p "${ONNX_DIR}"

  # SCRFD-10GF (InsightFace model zoo), with an insightface-cache fallback.
  SCRFD_ONNX="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
  if [[ ! -f "${SCRFD_ONNX}" ]]; then
    log "Downloading SCRFD-10GF..."
    if ! wget -q --show-progress -O "${SCRFD_ONNX}" \
        "https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_10g_bnkps.onnx"; then
      # BUGFIX: wget -O creates the output file even on failure; remove the
      # partial download so the [[ -f ]] guard doesn't skip it on re-run.
      rm -f "${SCRFD_ONNX}"
      python3 -c "
import insightface, shutil, os
app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)
src = os.path.expanduser('~/.insightface/models/buffalo_l/det_10g.onnx')
if os.path.exists(src): shutil.copy(src, '${SCRFD_ONNX}')
"
    fi
  fi

  # ArcFace recognition weights via the insightface buffalo_l pack.
  # NOTE(review): buffalo_l ships w600k_r50.onnx (ResNet-50 backbone), yet the
  # file is stored under an "_r100" name — confirm which backbone is intended.
  ARCFACE_ONNX="${ONNX_DIR}/arcface_r100.onnx"
  if [[ ! -f "${ARCFACE_ONNX}" ]]; then
    log "Downloading ArcFace-R100 via insightface..."
    python3 -c "
import insightface, shutil, os
app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)
src = os.path.expanduser('~/.insightface/models/buffalo_l/w600k_r50.onnx')
if os.path.exists(src): shutil.copy(src, '${ARCFACE_ONNX}')
"
  fi

  # ECAPA-TDNN speaker embedding: export SpeechBrain's pretrained model to ONNX.
  ECAPA_ONNX="${ONNX_DIR}/ecapa_tdnn.onnx"
  if [[ ! -f "${ECAPA_ONNX}" ]]; then
    log "Exporting ECAPA-TDNN to ONNX..."
    python3 - <<'PYEOF'
import torch
from speechbrain.pretrained import EncoderClassifier

model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="/models/speechbrain_ecapa"
)
model.eval()

dummy = torch.randn(1, 16000)  # 1s of 16 kHz audio
with torch.no_grad():
    # NOTE(review): encode_batch.__self__ is the whole EncoderClassifier; its
    # forward() may not match encode_batch() semantics — verify the exported
    # graph actually emits speaker embeddings.
    torch.onnx.export(
        model.encode_batch.__self__,
        dummy,
        "/models/onnx/ecapa_tdnn.onnx",
        opset_version=12,
        input_names=["waveform"],
        output_names=["embedding"],
        dynamic_axes={"waveform": {1: "audio_len"}}
    )
print("ECAPA-TDNN exported to /models/onnx/ecapa_tdnn.onnx")
PYEOF
  fi

  log "ONNX models ready in ${ONNX_DIR}"
}
|
# ── SCRFD face detection ───────────────────────────────────────────────────────
# Build the FP16 TensorRT engine for SCRFD-10GF face detection
# (640x640 input, dynamic batch 1–4, optimized for batch 1).
convert_scrfd() {
  local ONNX="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
  local ENGINE="${ENGINE_DIR}/scrfd_10g_fp16.engine"

  # BUGFIX message: the target is "download", not a "--download" flag.
  [[ -f "${ONNX}" ]] || err "SCRFD ONNX not found at ${ONNX}. Run the 'download' target first."

  log "Converting SCRFD-10GF → TRT FP16..."
  # --workspace and --explicitBatch are deprecated in TensorRT 8.6 (the
  # script's stated minimum): --memPoolSize replaces --workspace, and ONNX
  # networks are always explicit-batch.
  trtexec \
    --onnx="${ONNX}" \
    --saveEngine="${ENGINE}" \
    --fp16 \
    --memPoolSize=workspace:"${WORKSPACE_MB}"M \
    --minShapes=input.1:1x3x640x640 \
    --optShapes=input.1:1x3x640x640 \
    --maxShapes=input.1:4x3x640x640 \
    2>&1 | tail -5

  log "SCRFD engine saved: ${ENGINE}"
  benchmark_engine "${ENGINE}" "input.1:1x3x640x640"
}
|
# ── ArcFace recognition ────────────────────────────────────────────────────────
# Build the FP16 TensorRT engine for ArcFace face recognition
# (112x112 aligned crops, dynamic batch 1–16, optimized for batch 4).
convert_arcface() {
  local ONNX="${ONNX_DIR}/arcface_r100.onnx"
  local ENGINE="${ENGINE_DIR}/arcface_r100_fp16.engine"

  [[ -f "${ONNX}" ]] || err "ArcFace ONNX not found at ${ONNX}."

  log "Converting ArcFace-R100 → TRT FP16..."
  # --workspace and --explicitBatch are deprecated in TensorRT 8.6 (the
  # script's stated minimum): --memPoolSize replaces --workspace, and ONNX
  # networks are always explicit-batch.
  trtexec \
    --onnx="${ONNX}" \
    --saveEngine="${ENGINE}" \
    --fp16 \
    --memPoolSize=workspace:"${WORKSPACE_MB}"M \
    --minShapes=input.1:1x3x112x112 \
    --optShapes=input.1:4x3x112x112 \
    --maxShapes=input.1:16x3x112x112 \
    2>&1 | tail -5

  log "ArcFace engine saved: ${ENGINE}"
  benchmark_engine "${ENGINE}" "input.1:4x3x112x112"
}
|
# ── ECAPA-TDNN speaker embedding ───────────────────────────────────────────────
# Build the FP16 TensorRT engine for the ECAPA-TDNN speaker-embedding model
# (dynamic audio length 0.5s–2s at 16 kHz, optimized for 1s clips).
convert_ecapa() {
  local ONNX="${ONNX_DIR}/ecapa_tdnn.onnx"
  local ENGINE="${ENGINE_DIR}/ecapa_tdnn_fp16.engine"

  [[ -f "${ONNX}" ]] || err "ECAPA-TDNN ONNX not found at ${ONNX}."

  log "Converting ECAPA-TDNN → TRT FP16..."
  # --workspace and --explicitBatch are deprecated in TensorRT 8.6 (the
  # script's stated minimum): --memPoolSize replaces --workspace, and ONNX
  # networks are always explicit-batch.
  # NOTE: the "waveform" tensor name must match the ONNX export performed
  # in download_models().
  trtexec \
    --onnx="${ONNX}" \
    --saveEngine="${ENGINE}" \
    --fp16 \
    --memPoolSize=workspace:"${WORKSPACE_MB}"M \
    --minShapes=waveform:1x8000 \
    --optShapes=waveform:1x16000 \
    --maxShapes=waveform:1x32000 \
    2>&1 | tail -5

  log "ECAPA-TDNN engine saved: ${ENGINE}"
  benchmark_engine "${ENGINE}" "waveform:1x16000"
}
|
# ── Whisper STT (CTranslate2 — no TRT conversion needed) ─────────────────────
# Download + warm up faster-whisper "small". CTranslate2 performs its own GPU
# optimization, so there is no TensorRT step for Whisper.
setup_whisper() {
  log "Setting up faster-whisper (CTranslate2 format)..."
  # NOTE(review): hard-codes /models (not ${MODEL_DIR}) to match the python
  # heredoc below, which cannot expand shell variables.
  local WHISPER_DIR="/models/whisper-small-ct2"

  if [[ ! -d "${WHISPER_DIR}" ]]; then
    python3 - <<'PYEOF'
from faster_whisper import WhisperModel
import numpy as np

# Download and cache the small model in CTranslate2 float16 format
# (the old comment said int8, contradicting compute_type below).
# Already GPU-optimized by CTranslate2 — no TRT conversion needed.
model = WhisperModel("small", device="cuda", compute_type="float16",
                     download_root="/models")
print("faster-whisper 'small' model ready at /models/")
# Run one warmup inference on 1s of silence
audio = np.zeros(16000, dtype=np.float32)
segments, _ = model.transcribe(audio)
list(segments)  # consume generator so decoding actually runs
print("Whisper warmup complete")
PYEOF
    # BUGFIX: faster-whisper caches under its own directory layout and never
    # creates WHISPER_DIR, so the guard above re-downloaded and re-warmed on
    # every run. Create the directory as a completion stamp.
    mkdir -p "${WHISPER_DIR}"
  else
    log "Whisper already downloaded, skipping."
  fi
}
|
# ── Piper TTS (ONNX Runtime — no TRT conversion needed) ──────────────────────
# Fetch the en_US-lessac-medium Piper voice (model + config JSON) from
# Hugging Face. Piper runs through ONNX Runtime directly.
setup_piper() {
  log "Setting up Piper TTS voice models..."
  local PIPER_DIR="${MODEL_DIR}/piper"
  mkdir -p "${PIPER_DIR}"

  # en_US-lessac-medium: natural, good quality, ~60ms on Orin
  if [[ ! -f "${PIPER_DIR}/en_US-lessac-medium.onnx" ]]; then
    python3 - <<'PYEOF'
import urllib.request, os

base = "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
# NOTE(review): hard-codes /models/piper; keep in sync with the shell-side
# ${MODEL_DIR}/piper (heredoc is quoted, so no shell expansion here).
piper_dir = "/models/piper"

for fname in ["en_US-lessac-medium.onnx", "en_US-lessac-medium.onnx.json"]:
    dest = os.path.join(piper_dir, fname)
    if not os.path.exists(dest):
        print(f"Downloading {fname}...")
        # BUGFIX: download to a temp name and rename atomically so an
        # interrupted transfer never leaves a partial file that the
        # existence checks would then skip on the next run.
        tmp = dest + ".part"
        urllib.request.urlretrieve(base + fname, tmp)
        os.replace(tmp, dest)
        print(f" → {dest}")
print("Piper voice ready.")
PYEOF
  else
    log "Piper voice already downloaded, skipping."
  fi
}
|
# ── Benchmark helper ───────────────────────────────────────────────────────────
# Run trtexec timing on a built engine and print a short latency summary.
#   $1 - path to .engine file
#   $2 - input shape spec, e.g. "input.1:1x3x640x640"
benchmark_engine() {
  local ENGINE="$1"
  local SHAPE="$2"
  log "Benchmarking ${ENGINE##*/}..."
  # BUGFIX: grep exits 1 when no line matches; under `set -o pipefail` that
  # aborted the whole script mid-benchmark. Treat "no timing lines" as
  # non-fatal while still letting a trtexec failure propagate.
  trtexec \
    --loadEngine="${ENGINE}" \
    --shapes="${SHAPE}" \
    --warmUp=3 \
    --avgRuns=20 \
    --useSpinWait \
    2>&1 | { grep -E "mean|median|throughput|Latency" || true; } | head -5
}
|
# ── Full benchmark suite ───────────────────────────────────────────────────────
# Print platform/toolchain info, latency-benchmark every built TRT engine,
# then benchmark the two non-TRT models (Whisper via CTranslate2 on CUDA,
# Piper via ONNX Runtime). Requires the models to already be set up.
run_benchmarks() {
  log "=== Social-bot model benchmark suite ==="
  # /proc/device-tree/model exists on Jetson boards; fall back to a label.
  log "Platform: $(cat /proc/device-tree/model 2>/dev/null || echo 'Orin')"
  log "TensorRT: $(python3 -c 'import tensorrt as trt; print(trt.__version__)' 2>/dev/null)"
  log "CUDA: $(nvcc --version 2>/dev/null | grep release | awk '{print $6}' | tr -d ,)"
  echo

  # Map each engine file to the input shape it was built for; unknown
  # engines are skipped.
  for ENGINE in "${ENGINE_DIR}"/*.engine; do
    [[ -f "${ENGINE}" ]] || continue   # guards the unmatched-glob literal
    NAME="${ENGINE##*/}"
    case "${NAME}" in
      scrfd*) SHAPE="input.1:1x3x640x640" ;;
      arcface*)SHAPE="input.1:4x3x112x112" ;;
      ecapa*) SHAPE="waveform:1x16000" ;;
      *) SHAPE="" ;;
    esac
    [[ -z "${SHAPE}" ]] && continue
    echo "── ${NAME} ──"
    benchmark_engine "${ENGINE}" "${SHAPE}"
    echo
  done

  log "Whisper (CTranslate2 GPU):"
  python3 - <<'PYEOF'
import time, numpy as np
from faster_whisper import WhisperModel
model = WhisperModel("small", device="cuda", compute_type="float16", download_root="/models")
audio = np.random.randn(16000).astype(np.float32)
# warmup
# NOTE(review): transcribe() returns (segments, info); [0] takes the segments
# generator, and list() forces the actual decoding.
for _ in range(3):
    list(model.transcribe(audio)[0])
# benchmark
times = []
for _ in range(10):
    t = time.perf_counter()
    list(model.transcribe(audio)[0])
    times.append((time.perf_counter() - t) * 1000)
print(f" 1s audio: mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF

  log "Piper TTS (ONNX Runtime):"
  python3 - <<'PYEOF'
import time
from piper import PiperVoice
voice = PiperVoice.load("/models/piper/en_US-lessac-medium.onnx")
text = "Hello, I am Salty, your personal robot companion."
# warmup
for _ in range(3):
    list(voice.synthesize_stream_raw(text))
# benchmark: time full synthesis of one fixed sentence
times = []
for _ in range(10):
    t = time.perf_counter()
    audio_bytes = b"".join(voice.synthesize_stream_raw(text))
    times.append((time.perf_counter() - t) * 1000)
print(f" '{text[:30]}...': mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF
}
|
# ── tegrastats snapshot ────────────────────────────────────────────────────────
# Print a ~2s tegrastats sample plus GPU memory stats. Uses nvidia-smi when
# available and falls back to pycuda (nvidia-smi is absent on some Jetsons).
check_gpu_health() {
  log "GPU / system health check:"
  tegrastats --interval 1000 &
  TPID=$!
  sleep 2
  # BUGFIX: under `set -e`, a failing kill (tegrastats missing or already
  # exited) aborted the whole script — tolerate it and reap the job.
  kill "${TPID}" 2>/dev/null || true
  wait "${TPID}" 2>/dev/null || true
  echo
  log "GPU memory:"
  nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu \
    --format=csv,noheader,nounits 2>/dev/null || \
    python3 -c "import pycuda.driver as drv; drv.init(); d=drv.Device(0); c=d.make_context(); print('Free:', drv.mem_get_info()[0]//1024//1024, 'MB'); c.pop()"
}
|
# ── Main ───────────────────────────────────────────────────────────────────────
# Dispatch on the positional target (defaults to "all" via TARGET above).
check_trtexec

case "${TARGET}" in
  all)
    download_models
    convert_scrfd
    convert_arcface
    convert_ecapa
    setup_whisper
    setup_piper
    check_gpu_health
    run_benchmarks
    ;;
  scrfd)     download_models; convert_scrfd ;;
  arcface)   download_models; convert_arcface ;;
  ecapa)     download_models; convert_ecapa ;;
  whisper)   setup_whisper ;;
  piper)     setup_piper ;;
  benchmark) run_benchmarks ;;
  health)    check_gpu_health ;;
  download)  download_models ;;
  *)
    echo "Usage: $0 [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]"
    exit 1
    ;;
esac

log "Done."
|