#!/usr/bin/env bash
# convert_models.sh — Convert social-bot ONNX models to TensorRT FP16 engines
#
# Models converted:
#   SCRFD-10GF    — face detection (InsightFace)
#   ArcFace-R100  — face recognition (InsightFace)
#   Whisper-small — STT via faster-whisper (handled by CTranslate2, not TRT)
#   ECAPA-TDNN    — speaker embedding (SpeechBrain export)
#   Piper TTS     — neural TTS (ONNX native, no TRT needed)
#
# Usage: ./scripts/convert_models.sh [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]
# Requirements: TensorRT 8.6+, trtexec in PATH, ONNX models in ${MODEL_DIR}/onnx/
#
# Note: Whisper uses faster-whisper's CTranslate2 which handles its own GPU
# optimization. Piper uses ONNX Runtime directly (already optimized).
#
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/models}"
export MODEL_DIR   # read by the embedded Python helpers below
ONNX_DIR="${MODEL_DIR}/onnx"
ENGINE_DIR="${MODEL_DIR}/engines"
readonly WORKSPACE_MB=4096
TARGET="${1:-all}"

# Both directories must exist before any download or conversion step:
# wget -O into a missing ONNX_DIR would fail immediately.
mkdir -p "${ONNX_DIR}" "${ENGINE_DIR}"

log() { echo "[$(date +%H:%M:%S)] $*"; }
err() { echo "[ERROR] $*" >&2; exit 1; }

check_trtexec() {
  command -v trtexec &>/dev/null || \
    err "trtexec not found. Install TensorRT or add to PATH."
}

# ── Download ONNX sources ──────────────────────────────────────────────────────

download_models() {
  log "Downloading ONNX model sources..."

  # SCRFD-10GF (InsightFace model zoo). Primary: GitHub release asset;
  # fallback: copy from the insightface package's local model cache.
  local scrfd_onnx="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
  if [[ ! -f "${scrfd_onnx}" ]]; then
    log "Downloading SCRFD-10GF..."
    if ! wget -q --show-progress -O "${scrfd_onnx}" \
        "https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_10g_bnkps.onnx"; then
      # Drop the zero-byte/partial file wget leaves behind, otherwise the
      # existence check above would skip the download on the next run.
      rm -f -- "${scrfd_onnx}"
      # Destination path passed via argv, not string interpolation, so
      # unusual characters in MODEL_DIR cannot break the Python source.
      python3 - "${scrfd_onnx}" <<'PYEOF'
import os
import shutil
import sys
import insightface

app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)  # triggers download of the buffalo_l model pack
src = os.path.expanduser('~/.insightface/models/buffalo_l/det_10g.onnx')
if os.path.exists(src):
    shutil.copy(src, sys.argv[1])
PYEOF
    fi
  fi

  # ArcFace recognition model (from the InsightFace buffalo_l pack).
  # NOTE(review): buffalo_l ships w600k_r50.onnx (ResNet-50 backbone), yet
  # the target file is named arcface_r100 — confirm which backbone is intended.
  local arcface_onnx="${ONNX_DIR}/arcface_r100.onnx"
  if [[ ! -f "${arcface_onnx}" ]]; then
    log "Downloading ArcFace-R100 via insightface..."
    python3 - "${arcface_onnx}" <<'PYEOF'
import os
import shutil
import sys
import insightface

app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)
src = os.path.expanduser('~/.insightface/models/buffalo_l/w600k_r50.onnx')
if os.path.exists(src):
    shutil.copy(src, sys.argv[1])
PYEOF
  fi

  # ECAPA-TDNN speaker embedding (SpeechBrain pretrained → ONNX export).
  local ecapa_onnx="${ONNX_DIR}/ecapa_tdnn.onnx"
  if [[ ! -f "${ecapa_onnx}" ]]; then
    log "Exporting ECAPA-TDNN to ONNX..."
    python3 - <<'PYEOF'
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Paths derive from MODEL_DIR so an overridden MODEL_DIR is respected
# (the heredoc is quoted, so shell expansion is unavailable here).
model_dir = os.environ.get("MODEL_DIR", "/models")
out_path = os.path.join(model_dir, "onnx", "ecapa_tdnn.onnx")

model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir=os.path.join(model_dir, "speechbrain_ecapa"),
)
model.eval()
dummy = torch.randn(1, 16000)  # 1 s of 16 kHz audio
with torch.no_grad():
    torch.onnx.export(
        model.encode_batch.__self__,  # underlying nn.Module of the bound method
        dummy,
        out_path,
        opset_version=12,
        input_names=["waveform"],
        output_names=["embedding"],
        dynamic_axes={"waveform": {1: "audio_len"}},
    )
print(f"ECAPA-TDNN exported to {out_path}")
PYEOF
  fi

  log "ONNX models ready in ${ONNX_DIR}"
}

# ── TensorRT build helper ──────────────────────────────────────────────────────

#######################################
# Build an FP16 TensorRT engine with dynamic input shapes.
# Arguments:
#   $1 - ONNX model path
#   $2 - output engine path
#   $3 - min shape spec (name:dims)
#   $4 - opt shape spec
#   $5 - max shape spec
#######################################
build_fp16_engine() {
  local onnx="$1" engine="$2" min_shape="$3" opt_shape="$4" max_shape="$5"
  # TRT >= 8.4: --workspace is deprecated in favour of --memPoolSize
  # (value in MiB); explicit batch is implied for ONNX networks, so the
  # deprecated --explicitBatch flag is omitted.
  trtexec \
    --onnx="${onnx}" \
    --saveEngine="${engine}" \
    --fp16 \
    --memPoolSize=workspace:"${WORKSPACE_MB}" \
    --minShapes="${min_shape}" \
    --optShapes="${opt_shape}" \
    --maxShapes="${max_shape}" \
    2>&1 | tail -5
}

# ── SCRFD face detection ───────────────────────────────────────────────────────

convert_scrfd() {
  local onnx="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
  local engine="${ENGINE_DIR}/scrfd_10g_fp16.engine"
  [[ -f "${onnx}" ]] || err "SCRFD ONNX not found at ${onnx}. Run with 'download' first."
  log "Converting SCRFD-10GF → TRT FP16..."
  build_fp16_engine "${onnx}" "${engine}" \
    "input.1:1x3x640x640" "input.1:1x3x640x640" "input.1:4x3x640x640"
  log "SCRFD engine saved: ${engine}"
  benchmark_engine "${engine}" "input.1:1x3x640x640"
}

# ── ArcFace recognition ────────────────────────────────────────────────────────

convert_arcface() {
  local onnx="${ONNX_DIR}/arcface_r100.onnx"
  local engine="${ENGINE_DIR}/arcface_r100_fp16.engine"
  [[ -f "${onnx}" ]] || err "ArcFace ONNX not found at ${onnx}."
  log "Converting ArcFace-R100 → TRT FP16..."
  # Batch up to 16 faces per frame; opt profile tuned for 4.
  build_fp16_engine "${onnx}" "${engine}" \
    "input.1:1x3x112x112" "input.1:4x3x112x112" "input.1:16x3x112x112"
  log "ArcFace engine saved: ${engine}"
  benchmark_engine "${engine}" "input.1:4x3x112x112"
}

# ── ECAPA-TDNN speaker embedding ───────────────────────────────────────────────

convert_ecapa() {
  local onnx="${ONNX_DIR}/ecapa_tdnn.onnx"
  local engine="${ENGINE_DIR}/ecapa_tdnn_fp16.engine"
  [[ -f "${onnx}" ]] || err "ECAPA-TDNN ONNX not found at ${onnx}."
  log "Converting ECAPA-TDNN → TRT FP16..."
  # 0.5 s – 2 s of 16 kHz audio; opt profile at 1 s.
  build_fp16_engine "${onnx}" "${engine}" \
    "waveform:1x8000" "waveform:1x16000" "waveform:1x32000"
  log "ECAPA-TDNN engine saved: ${engine}"
  benchmark_engine "${engine}" "waveform:1x16000"
}

# ── Whisper STT (CTranslate2 — no TRT conversion needed) ───────────────────────

setup_whisper() {
  log "Setting up faster-whisper (CTranslate2 format)..."
  local whisper_dir="${MODEL_DIR}/whisper-small-ct2"
  if [[ ! -d "${whisper_dir}" ]]; then
    python3 - <<'PYEOF'
import os
import numpy as np
from faster_whisper import WhisperModel

model_dir = os.environ.get("MODEL_DIR", "/models")
# Download and cache the small model in CTranslate2 format (FP16 on GPU).
# CTranslate2 handles its own GPU optimization — no TRT conversion needed.
model = WhisperModel("small", device="cuda", compute_type="float16",
                     download_root=model_dir)
print(f"faster-whisper 'small' model ready at {model_dir}/")
# One warmup inference so the first real request isn't slow.
audio = np.zeros(16000, dtype=np.float32)
segments, _ = model.transcribe(audio)
list(segments)  # consume the lazy generator to actually run inference
print("Whisper warmup complete")
PYEOF
  else
    log "Whisper already downloaded, skipping."
  fi
}

# ── Piper TTS (ONNX Runtime — no TRT conversion needed) ────────────────────────

setup_piper() {
  log "Setting up Piper TTS voice models..."
  local piper_dir="${MODEL_DIR}/piper"
  mkdir -p "${piper_dir}"
  # en_US-lessac-medium: natural, good quality, ~60 ms synthesis on Orin.
  if [[ ! -f "${piper_dir}/en_US-lessac-medium.onnx" ]]; then
    python3 - <<'PYEOF'
import os
import urllib.request

base = "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
piper_dir = os.path.join(os.environ.get("MODEL_DIR", "/models"), "piper")
for fname in ["en_US-lessac-medium.onnx", "en_US-lessac-medium.onnx.json"]:
    dest = os.path.join(piper_dir, fname)
    if not os.path.exists(dest):
        print(f"Downloading {fname}...")
        urllib.request.urlretrieve(base + fname, dest)
        print(f"  → {dest}")
print("Piper voice ready.")
PYEOF
  else
    log "Piper voice already downloaded, skipping."
  fi
}

# ── Benchmark helper ───────────────────────────────────────────────────────────

#######################################
# Run a short latency benchmark on a built engine.
# Arguments:
#   $1 - engine path
#   $2 - input shape spec (name:dims)
#######################################
benchmark_engine() {
  local engine="$1"
  local shape="$2"
  log "Benchmarking ${engine##*/}..."
  # '|| true': an unexpected trtexec log format makes grep exit non-zero,
  # which under pipefail would abort the whole conversion run.
  trtexec \
    --loadEngine="${engine}" \
    --shapes="${shape}" \
    --warmUp=3 \
    --avgRuns=20 \
    --useSpinWait \
    2>&1 | grep -E "mean|median|throughput|Latency" | head -5 || true
}

# ── Full benchmark suite ───────────────────────────────────────────────────────

run_benchmarks() {
  log "=== Social-bot model benchmark suite ==="
  log "Platform: $(cat /proc/device-tree/model 2>/dev/null || echo 'Orin')"
  log "TensorRT: $(python3 -c 'import tensorrt as trt; print(trt.__version__)' 2>/dev/null)"
  log "CUDA: $(nvcc --version 2>/dev/null | grep release | awk '{print $6}' | tr -d ,)"
  echo

  local engine name shape
  for engine in "${ENGINE_DIR}"/*.engine; do
    [[ -f "${engine}" ]] || continue   # glob may not match anything
    name="${engine##*/}"
    case "${name}" in
      scrfd*)   shape="input.1:1x3x640x640" ;;
      arcface*) shape="input.1:4x3x112x112" ;;
      ecapa*)   shape="waveform:1x16000" ;;
      *)        shape="" ;;
    esac
    [[ -z "${shape}" ]] && continue
    echo "── ${name} ──"
    benchmark_engine "${engine}" "${shape}"
    echo
  done

  log "Whisper (CTranslate2 GPU):"
  python3 - <<'PYEOF'
import os
import time
import numpy as np
from faster_whisper import WhisperModel

model = WhisperModel("small", device="cuda", compute_type="float16",
                     download_root=os.environ.get("MODEL_DIR", "/models"))
audio = np.random.randn(16000).astype(np.float32)
# warmup
for _ in range(3):
    list(model.transcribe(audio)[0])
# benchmark
times = []
for _ in range(10):
    t = time.perf_counter()
    list(model.transcribe(audio)[0])
    times.append((time.perf_counter() - t) * 1000)
print(f"  1s audio: mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF

  log "Piper TTS (ONNX Runtime):"
  python3 - <<'PYEOF'
import os
import time
from piper import PiperVoice

voice_path = os.path.join(os.environ.get("MODEL_DIR", "/models"),
                          "piper", "en_US-lessac-medium.onnx")
voice = PiperVoice.load(voice_path)
text = "Hello, I am Salty, your personal robot companion."
# warmup
for _ in range(3):
    list(voice.synthesize_stream_raw(text))
# benchmark
times = []
for _ in range(10):
    t = time.perf_counter()
    audio_bytes = b"".join(voice.synthesize_stream_raw(text))
    times.append((time.perf_counter() - t) * 1000)
print(f"  '{text[:30]}...': mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF
}

# ── tegrastats snapshot ────────────────────────────────────────────────────────

check_gpu_health() {
  log "GPU / system health check:"
  if command -v tegrastats &>/dev/null; then
    tegrastats --interval 1000 &
    local tpid=$!
    sleep 2
    # kill can race with tegrastats exiting on its own; under 'set -e' a
    # bare failing kill would abort the script, so never let it be fatal.
    kill "${tpid}" 2>/dev/null || true
    wait "${tpid}" 2>/dev/null || true
  else
    log "tegrastats not available (non-Jetson platform?), skipping."
  fi
  echo
  log "GPU memory:"
  nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu \
    --format=csv,noheader,nounits 2>/dev/null || \
    python3 -c "import pycuda.driver as drv; drv.init(); d=drv.Device(0); c=d.make_context(); print('Free:', drv.mem_get_info()[0]//1024//1024, 'MB'); c.pop()"
}

# ── Main ───────────────────────────────────────────────────────────────────────

check_trtexec

case "${TARGET}" in
  all)
    download_models
    convert_scrfd
    convert_arcface
    convert_ecapa
    setup_whisper
    setup_piper
    check_gpu_health
    run_benchmarks
    ;;
  scrfd)     download_models; convert_scrfd ;;
  arcface)   download_models; convert_arcface ;;
  ecapa)     download_models; convert_ecapa ;;
  whisper)   setup_whisper ;;
  piper)     setup_piper ;;
  benchmark) run_benchmarks ;;
  health)    check_gpu_health ;;
  download)  download_models ;;
  *)
    echo "Usage: $0 [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]"
    exit 1
    ;;
esac

log "Done."