Compare commits
1 Commits
5043578934
...
71d6ce610b
| Author | SHA1 | Date | |
|---|---|---|---|
| 71d6ce610b |
@ -1,119 +0,0 @@
|
|||||||
# Social-bot container — JetPack 6 + TensorRT + audio AI stack
# Extends the base ROS2 Humble container with social-bot dependencies.
#
# Deps: faster-whisper, llama-cpp-python (CUDA), piper-tts, insightface,
#       pyannote.audio, OpenWakeWord, Silero VAD, pyaudio, sounddevice
#
# Build: docker build -f Dockerfile.social -t saltybot/social:latest .
# Run:   docker compose -f docker-compose.yml up -d saltybot-social

FROM nvcr.io/nvidia/l4t-jetpack:r36.2.0

LABEL maintainer="sl-jetson"
LABEL description="Social-bot AI stack — speech, LLM, TTS, face recognition on Orin Nano Super"
LABEL jetpack="6.0"
LABEL l4t="r36.2.0"

# Build-time only (ARG, not ENV): baking DEBIAN_FRONTEND=noninteractive into
# the runtime environment changes apt behavior for anyone using the container
# interactively.
ARG DEBIAN_FRONTEND=noninteractive

ENV ROS_DISTRO=humble
ENV ROS_ROOT=/opt/ros/${ROS_DISTRO}
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Fail pipelines loudly (needed for the curl | gpg keyring install below).
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# ── Locale ────────────────────────────────────────────────────────────────────
RUN apt-get update && apt-get install -y --no-install-recommends \
        locales tzdata \
    && locale-gen en_US.UTF-8 \
    && rm -rf /var/lib/apt/lists/*
ENV LANG=en_US.UTF-8

# ── System deps ───────────────────────────────────────────────────────────────
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential cmake git wget curl ca-certificates \
        python3-dev python3-pip python3-setuptools python3-wheel \
        # Audio hardware
        alsa-utils libasound2-dev pulseaudio pulseaudio-utils \
        portaudio19-dev libsndfile1-dev libsoundio-dev \
        # USB audio / ReSpeaker support
        usbutils libusb-1.0-0-dev \
        # CUDA / TensorRT tools
        cuda-toolkit-12-2 libcudnn8 libcudnn8-dev \
        # Misc
        htop tmux nano ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# ── ROS2 Humble ───────────────────────────────────────────────────────────────
RUN curl -sSL https://raw.githubusercontent.com/ros/rosdistro/master/ros.asc \
        | gpg --dearmor -o /usr/share/keyrings/ros-archive-keyring.gpg && \
    echo "deb [arch=arm64 signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] \
        http://packages.ros.org/ros2/ubuntu jammy main" \
        > /etc/apt/sources.list.d/ros2.list && \
    apt-get update && apt-get install -y --no-install-recommends \
        ros-humble-ros-base \
        python3-colcon-common-extensions \
        python3-rosdep \
    && rm -rf /var/lib/apt/lists/*

# ── Python AI deps (core) ─────────────────────────────────────────────────────
# faster-whisper:  CTranslate2 backend, Orin GPU accelerated
# llama-cpp-python: GGUF quantized LLM, CUDA offload (built separately below)
# piper-tts:       fast neural TTS, CPU/GPU
# insightface:     SCRFD face detection + ArcFace recognition
# pyannote.audio:  speaker diarization + ECAPA-TDNN embeddings
RUN pip3 install --no-cache-dir \
        "faster-whisper>=1.0.0" \
        "ctranslate2>=4.0.0" \
        "openai-whisper>=20231117" \
        "piper-tts>=1.2.0" \
        "insightface>=0.7.3" \
        "onnxruntime-gpu>=1.17.0" \
        "pyannote.audio>=3.1.0" \
        "speechbrain>=1.0.0"

# ── llama-cpp-python with CUDA ─────────────────────────────────────────────────
# Build from source with CUDA offload for the Orin GPU (SM 8.7).
# Fixed: the CMake variable is CMAKE_CUDA_ARCHITECTURES — the previous
# -DCUDA_ARCHITECTURES spelling is not a CMake variable and was silently
# ignored, so the build fell back to default architectures.
RUN CMAKE_ARGS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=87" \
    pip3 install --no-cache-dir llama-cpp-python==0.2.85 --no-binary llama-cpp-python

# ── Wake word / VAD ───────────────────────────────────────────────────────────
RUN pip3 install --no-cache-dir \
        "openwakeword>=0.6.0" \
        "silero-vad>=5.1.0" \
        "webrtcvad-wheels>=2.0.14"

# ── Audio I/O ─────────────────────────────────────────────────────────────────
RUN pip3 install --no-cache-dir \
        "pyaudio>=0.2.14" \
        "sounddevice>=0.4.6" \
        "soundfile>=0.12.1" \
        "numpy>=1.24.0"

# ── TensorRT Python bindings ──────────────────────────────────────────────────
# TensorRT itself ships with JetPack; pycuda is only needed for custom kernels.
RUN pip3 install --no-cache-dir "pycuda>=2022.2.2"

# ── MQTT for SOUL/agent communication ─────────────────────────────────────────
RUN pip3 install --no-cache-dir "paho-mqtt>=2.0.0"

# ── ROS2 Python deps ──────────────────────────────────────────────────────────
# NOTE: rclpy is provided by the apt package ros-humble-ros-base above.
# Installing "rclpy" from PyPI does not provide working ROS bindings and can
# shadow the apt-installed module, so it is intentionally NOT pip-installed.
RUN pip3 install --no-cache-dir \
        "transforms3d>=0.4.1"

# ── Model directory ───────────────────────────────────────────────────────────
RUN mkdir -p /models/onnx /models/engines /models/gguf /models/piper
ENV MODEL_DIR=/models
ENV PIPER_VOICE_DIR=/models/piper

# ── ALSA config for USB mic + speaker ─────────────────────────────────────────
COPY config/asound.conf /etc/asound.conf

# ── Workspace ─────────────────────────────────────────────────────────────────
RUN mkdir -p /ros2_ws/src
WORKDIR /ros2_ws

COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
CMD ["bash"]
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
# ALSA configuration for the social-bot audio path.
# Expects the USB mic array (ReSpeaker or similar) at card 1 and the USB
# speaker at card 2; verify indices with `arecord -l` / `aplay -l` and
# adjust the hw:N,0 slaves below if they differ.

# USB microphone input — resampled/converted by the plug layer to the
# 16 kHz mono S16_LE stream the speech pipeline expects.
pcm.usb_mic {
    type plug
    slave {
        pcm "hw:1,0"
        rate 16000
        channels 1
        format S16_LE
    }
}

# USB speaker output — 22.05 kHz mono S16_LE (Piper TTS native rate).
pcm.usb_speaker {
    type plug
    slave {
        pcm "hw:2,0"
        rate 22050
        channels 1
        format S16_LE
    }
}

# Asymmetric default device: capture and playback go to different cards.
pcm.!default {
    type asym
    playback.pcm "usb_speaker"
    capture.pcm "usb_mic"
}

# Default mixer controls follow the microphone card.
ctl.!default {
    type hw
    card 1
}
|
|
||||||
@ -302,61 +302,6 @@ services:
|
|||||||
fence_active:=true
|
fence_active:=true
|
||||||
"
|
"
|
||||||
|
|
||||||
# ── Social-bot AI stack (speech + LLM + TTS + face recognition) ─────────────
# Issue #88: Orin dev environment for social-bot
# Start: docker compose up -d saltybot-social
# Logs:  docker compose logs -f saltybot-social
  saltybot-social:
    image: saltybot/social:latest
    build:
      context: .
      dockerfile: Dockerfile.social
    container_name: saltybot-social
    restart: unless-stopped
    runtime: nvidia
    network_mode: host
    depends_on:
      - stm32-bridge
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      # Fixed: "audio" is not a valid NVIDIA driver capability (valid values
      # are compute, utility, graphics, video, display, ngx, all) and an
      # unknown capability makes nvidia-container-runtime refuse to start the
      # container. Audio access comes from the /dev/snd passthrough under
      # `devices:` below, not from the driver capabilities.
      - NVIDIA_DRIVER_CAPABILITIES=all
      - ROS_DOMAIN_ID=42
      - RMW_IMPLEMENTATION=rmw_cyclonedds_cpp
      - MODEL_DIR=/models
      - PIPER_VOICE_DIR=/models/piper
      - LLAMA_N_GPU_LAYERS=20
      # Audio device routing (override if mic/speaker cards differ)
      - ALSA_MIC_CARD=1
      - ALSA_SPEAKER_CARD=2
    volumes:
      - ./ros2_ws/src:/ros2_ws/src:rw
      - ./config:/config:ro
      # Persistent model storage on NVMe (survives container restarts)
      - /mnt/nvme/saltybot/models:/models:rw
      # Enrollment database (face embeddings, voice prints, conversation history)
      - /mnt/nvme/saltybot/social_db:/social_db:rw
      # SOUL.md personality file
      - /mnt/nvme/saltybot/soul:/soul:ro
    devices:
      # USB mic array (ReSpeaker 2-Mic / 4-Mic or compatible)
      - /dev/snd:/dev/snd
      # USB webcam / face camera (optional, RealSense preferred)
      - /dev/bus/usb:/dev/bus/usb
    group_add:
      - audio
    cap_add:
      - SYS_NICE  # allow real-time thread priority for audio
    ulimits:
      rtprio: 95
      memlock: -1
    command: >
      bash -c "
      source /opt/ros/humble/setup.bash &&
      source /ros2_ws/install/local_setup.bash 2>/dev/null || true &&
      ros2 launch saltybot_social social_bot.launch.py
      "
|
|
||||||
|
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
saltybot-maps:
|
saltybot-maps:
|
||||||
driver: local
|
driver: local
|
||||||
|
|||||||
@ -1,79 +0,0 @@
|
|||||||
# Social-bot Model Directory
|
|
||||||
|
|
||||||
## Layout
|
|
||||||
|
|
||||||
```
|
|
||||||
/models/
|
|
||||||
├── onnx/ # Source ONNX models (version-pinned)
|
|
||||||
│ ├── scrfd_10g_bnkps.onnx # Face detection — InsightFace SCRFD-10GF
|
|
||||||
│ ├── arcface_r100.onnx # Face recognition — ArcFace R100 (buffalo_l)
|
|
||||||
│ └── ecapa_tdnn.onnx # Speaker embedding — ECAPA-TDNN (SpeechBrain export)
|
|
||||||
│
|
|
||||||
├── engines/ # TensorRT FP16 compiled engines
|
|
||||||
│ ├── scrfd_10g_fp16.engine # SCRFD → TRT FP16 (640×640)
|
|
||||||
│ ├── arcface_r100_fp16.engine # ArcFace → TRT FP16 (112×112)
|
|
||||||
│ └── ecapa_tdnn_fp16.engine # ECAPA-TDNN → TRT FP16 (variable len)
|
|
||||||
│
|
|
||||||
├── whisper-small-ct2/ # faster-whisper CTranslate2 format (auto-downloaded)
|
|
||||||
│ ├── model.bin
|
|
||||||
│ └── tokenizer.json
|
|
||||||
│
|
|
||||||
├── piper/ # Piper TTS voice models
|
|
||||||
│ ├── en_US-lessac-medium.onnx
|
|
||||||
│ └── en_US-lessac-medium.onnx.json
|
|
||||||
│
|
|
||||||
├── gguf/ # Quantized LLM (llama-cpp-python)
|
|
||||||
│ └── phi-3-mini-4k-instruct-q4_k_m.gguf # ~2.2GB — Phi-3-mini Q4_K_M
|
|
||||||
│
|
|
||||||
└── speechbrain_ecapa/ # SpeechBrain pretrained checkpoint cache
|
|
||||||
```
|
|
||||||
|
|
||||||
## Model Versions
|
|
||||||
|
|
||||||
| Model | Version | Source | Size |
|
|
||||||
|---|---|---|---|
|
|
||||||
| SCRFD-10GF | InsightFace 0.7 | GitHub releases | 17MB |
|
|
||||||
| ArcFace (buffalo_l, w600k_r50 — ResNet-50 backbone despite the `arcface_r100.onnx` filename) | InsightFace buffalo_l | Auto via insightface | 166MB |
|
|
||||||
| ECAPA-TDNN | SpeechBrain spkrec-ecapa-voxceleb | HuggingFace | 87MB |
|
|
||||||
| Whisper small | faster-whisper 1.0+ | CTranslate2 hub | 488MB |
|
|
||||||
| Piper en_US-lessac-medium | Rhasspy piper-voices | HuggingFace | 63MB |
|
|
||||||
| Phi-3-mini-4k Q4_K_M | microsoft/Phi-3-mini-4k-instruct | GGUF / HuggingFace | 2.2GB |
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# From within the social container:
|
|
||||||
/scripts/convert_models.sh all # download + convert all models
|
|
||||||
/scripts/convert_models.sh benchmark # run latency benchmark suite
|
|
||||||
/scripts/convert_models.sh health # check GPU memory
|
|
||||||
```
|
|
||||||
|
|
||||||
## Performance Targets (Orin Nano Super, JetPack 6, FP16)
|
|
||||||
|
|
||||||
| Model | Input | Target | Typical |
|
|
||||||
|---|---|---|---|
|
|
||||||
| SCRFD-10GF | 640×640 | <15ms | ~8ms |
|
|
||||||
| ArcFace R100 | 4×112×112 | <5ms | ~3ms |
|
|
||||||
| ECAPA-TDNN | 1s audio | <20ms | ~12ms |
|
|
||||||
| Whisper small | 1s audio | <300ms | ~180ms |
|
|
||||||
| Piper lessac-medium | 10 words | <200ms | ~60ms |
|
|
||||||
| Phi-3-mini Q4_K_M | prompt | <500ms TTFT | ~350ms |
|
|
||||||
|
|
||||||
## LLM Download
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Download Phi-3-mini GGUF manually (2.2GB):
|
|
||||||
wget -O /models/gguf/phi-3-mini-4k-instruct-q4_k_m.gguf \
|
|
||||||
"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"
|
|
||||||
|
|
||||||
# Or use llama-cpp-python's built-in download:
|
|
||||||
python3 -c "
|
|
||||||
from llama_cpp import Llama
|
|
||||||
llm = Llama.from_pretrained(
|
|
||||||
repo_id='microsoft/Phi-3-mini-4k-instruct-gguf',
|
|
||||||
filename='Phi-3-mini-4k-instruct-q4.gguf',
|
|
||||||
cache_dir='/models/gguf',
|
|
||||||
n_gpu_layers=20
|
|
||||||
)
|
|
||||||
"
|
|
||||||
```
|
|
||||||
@ -1,339 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# convert_models.sh — Convert social-bot ONNX models to TensorRT FP16 engines
#
# Models converted:
#   SCRFD-10GF    — face detection (InsightFace)
#   ArcFace-R100  — face recognition (InsightFace)
#   Whisper-small — STT via faster-whisper (handled by CTranslate2, not TRT)
#   ECAPA-TDNN    — speaker embedding (SpeechBrain export)
#   Piper TTS     — neural TTS (ONNX native, no TRT needed)
#
# Usage: ./scripts/convert_models.sh [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]
#   (Fixed: the previous comment advertised a "--model" flag, but the script
#    parses a single positional target — see the case statement at the bottom.)
# Requirements: TensorRT 8.6+, trtexec in PATH, ONNX models in /models/onnx/
#
# Note: Whisper uses faster-whisper's CTranslate2 which handles its own GPU
#       optimization. Piper uses ONNX Runtime directly (already optimized).
#
set -euo pipefail

# Directory layout; MODEL_DIR may be overridden via the environment.
MODEL_DIR="${MODEL_DIR:-/models}"
ONNX_DIR="${MODEL_DIR}/onnx"
ENGINE_DIR="${MODEL_DIR}/engines"
WORKSPACE_MB=4096        # trtexec workspace memory pool, in MB
TARGET="${1:-all}"       # positional target; defaults to the full pipeline

mkdir -p "${ENGINE_DIR}"

# Timestamped stdout logging / fatal-error helper (err exits the script).
log() { echo "[$(date +%H:%M:%S)] $*"; }
err() { echo "[ERROR] $*" >&2; exit 1; }
|
|
||||||
|
|
||||||
# Abort early with a clear message when trtexec is not on PATH.
check_trtexec() {
    if ! command -v trtexec &>/dev/null; then
        err "trtexec not found. Install TensorRT or add to PATH."
    fi
}
|
|
||||||
|
|
||||||
# ── Download ONNX sources ──────────────────────────────────────────────────────
# Fetches or exports the three ONNX models into ${ONNX_DIR}, skipping any file
# that already exists. Requires network access (GitHub / HuggingFace) on the
# first run; subsequent runs are idempotent.
download_models() {
    log "Downloading ONNX model sources..."

    # SCRFD-10GF (InsightFace model zoo)
    SCRFD_ONNX="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
    if [[ ! -f "${SCRFD_ONNX}" ]]; then
        log "Downloading SCRFD-10GF..."
        # Primary path: the direct GitHub release asset. Fallback: let the
        # insightface package download the buffalo_l model pack and copy its
        # detector (det_10g.onnx) into place.
        # NOTE(review): the heredoc-style -c string is double-quoted, so
        # ${SCRFD_ONNX} is expanded by the shell before python runs — intended.
        wget -q --show-progress -O "${SCRFD_ONNX}" \
            "https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_10g_bnkps.onnx" || \
        python3 -c "
import insightface, shutil, os
app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)
src = os.path.expanduser('~/.insightface/models/buffalo_l/det_10g.onnx')
if os.path.exists(src): shutil.copy(src, '${SCRFD_ONNX}')
"
    fi

    # ArcFace-R100 (InsightFace buffalo_l)
    # NOTE(review): buffalo_l's recognition model is w600k_r50 (a ResNet-50
    # backbone) — the arcface_r100.onnx destination name is misleading.
    ARCFACE_ONNX="${ONNX_DIR}/arcface_r100.onnx"
    if [[ ! -f "${ARCFACE_ONNX}" ]]; then
        log "Downloading ArcFace-R100 via insightface..."
        python3 -c "
import insightface, shutil, os
app = insightface.app.FaceAnalysis(providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0)
src = os.path.expanduser('~/.insightface/models/buffalo_l/w600k_r50.onnx')
if os.path.exists(src): shutil.copy(src, '${ARCFACE_ONNX}')
"
    fi

    # ECAPA-TDNN speaker embedding (SpeechBrain pretrained)
    ECAPA_ONNX="${ONNX_DIR}/ecapa_tdnn.onnx"
    if [[ ! -f "${ECAPA_ONNX}" ]]; then
        log "Exporting ECAPA-TDNN to ONNX..."
        # Quoted heredoc ('PYEOF'): no shell expansion inside the python code.
        python3 - <<'PYEOF'
import torch
# NOTE(review): speechbrain>=1.0 moved pretrained interfaces to
# speechbrain.inference; confirm this import path against the pinned version.
from speechbrain.pretrained import EncoderClassifier
import os

model = EncoderClassifier.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
savedir="/models/speechbrain_ecapa"
)
model.eval()

dummy = torch.randn(1, 16000) # 1s of audio
with torch.no_grad():
    # NOTE(review): encode_batch.__self__ is the bound classifier instance;
    # exporting it relies on its forward() accepting a raw waveform — confirm.
    torch.onnx.export(
model.encode_batch.__self__,
dummy,
"/models/onnx/ecapa_tdnn.onnx",
opset_version=12,
input_names=["waveform"],
output_names=["embedding"],
dynamic_axes={"waveform": {1: "audio_len"}}
)
print("ECAPA-TDNN exported to /models/onnx/ecapa_tdnn.onnx")
PYEOF
    fi

    log "ONNX models ready in ${ONNX_DIR}"
}
|
|
||||||
|
|
||||||
# ── SCRFD face detection ───────────────────────────────────────────────────────
# Builds a dynamic-batch (1–4) FP16 TensorRT engine for SCRFD-10GF at
# 640×640 input, then benchmarks it at batch 1.
convert_scrfd() {
    local ONNX="${ONNX_DIR}/scrfd_10g_bnkps.onnx"
    local ENGINE="${ENGINE_DIR}/scrfd_10g_fp16.engine"

    # Fixed: the old message told the user to "Run with --download first",
    # but "--download" is not a valid argument — the positional target is
    # "download" (see the case statement at the bottom of this script).
    [[ -f "${ONNX}" ]] || err "SCRFD ONNX not found at ${ONNX}. Run '$0 download' first."

    log "Converting SCRFD-10GF → TRT FP16..."
    # NOTE(review): --workspace and --explicitBatch are deprecated on TRT 8.6+
    # (--memPoolSize=workspace:... is the replacement); kept for compatibility
    # with the existing toolchain — confirm against the installed TRT version.
    trtexec \
        --onnx="${ONNX}" \
        --saveEngine="${ENGINE}" \
        --fp16 \
        --workspace=${WORKSPACE_MB} \
        --minShapes=input.1:1x3x640x640 \
        --optShapes=input.1:1x3x640x640 \
        --maxShapes=input.1:4x3x640x640 \
        --explicitBatch \
        2>&1 | tail -5

    log "SCRFD engine saved: ${ENGINE}"
    benchmark_engine "${ENGINE}" "input.1:1x3x640x640"
}
|
|
||||||
|
|
||||||
# ── ArcFace recognition ────────────────────────────────────────────────────────
# Builds a dynamic-batch (1–16, optimized for 4) FP16 TensorRT engine for the
# ArcFace recognition model at 112×112 input, then benchmarks it at batch 4.
convert_arcface() {
    local ONNX="${ONNX_DIR}/arcface_r100.onnx"
    local ENGINE="${ENGINE_DIR}/arcface_r100_fp16.engine"

    [[ -f "${ONNX}" ]] || err "ArcFace ONNX not found at ${ONNX}."

    log "Converting ArcFace-R100 → TRT FP16..."
    # Batch 4 as the optimization point: typical number of faces per frame.
    trtexec \
        --onnx="${ONNX}" \
        --saveEngine="${ENGINE}" \
        --fp16 \
        --workspace=${WORKSPACE_MB} \
        --minShapes=input.1:1x3x112x112 \
        --optShapes=input.1:4x3x112x112 \
        --maxShapes=input.1:16x3x112x112 \
        --explicitBatch \
        2>&1 | tail -5

    log "ArcFace engine saved: ${ENGINE}"
    benchmark_engine "${ENGINE}" "input.1:4x3x112x112"
}
|
|
||||||
|
|
||||||
# ── ECAPA-TDNN speaker embedding ───────────────────────────────────────────────
# Builds a variable-length FP16 TensorRT engine for the exported ECAPA-TDNN
# (0.5–2 s of 16 kHz audio; optimized for 1 s), then benchmarks at 1 s.
convert_ecapa() {
    local ONNX="${ONNX_DIR}/ecapa_tdnn.onnx"
    local ENGINE="${ENGINE_DIR}/ecapa_tdnn_fp16.engine"

    [[ -f "${ONNX}" ]] || err "ECAPA-TDNN ONNX not found at ${ONNX}."

    log "Converting ECAPA-TDNN → TRT FP16..."
    # Shapes are raw sample counts at 16 kHz: 8000 = 0.5s, 16000 = 1s, 32000 = 2s.
    trtexec \
        --onnx="${ONNX}" \
        --saveEngine="${ENGINE}" \
        --fp16 \
        --workspace=${WORKSPACE_MB} \
        --minShapes=waveform:1x8000 \
        --optShapes=waveform:1x16000 \
        --maxShapes=waveform:1x32000 \
        --explicitBatch \
        2>&1 | tail -5

    log "ECAPA-TDNN engine saved: ${ENGINE}"
    benchmark_engine "${ENGINE}" "waveform:1x16000"
}
|
|
||||||
|
|
||||||
# ── Whisper STT (CTranslate2 — no TRT conversion needed) ─────────────────────
# Downloads/caches the faster-whisper "small" model and runs one warmup
# transcription on the GPU. CTranslate2 manages its own GPU optimization.
setup_whisper() {
    log "Setting up faster-whisper (CTranslate2 format)..."
    # NOTE(review): this guard path is hard-coded while download_root below is
    # "/models" — confirm faster-whisper actually caches under this exact
    # directory, otherwise the download runs on every invocation.
    local WHISPER_DIR="/models/whisper-small-ct2"

    if [[ ! -d "${WHISPER_DIR}" ]]; then
        python3 - <<'PYEOF'
from faster_whisper import WhisperModel
import os

# Download and cache the small model in CTranslate2 float16 format
# This is already GPU-optimized by CTranslate2 — no TRT conversion needed
model = WhisperModel("small", device="cuda", compute_type="float16",
download_root="/models")
print("faster-whisper 'small' model ready at /models/")
# Run one warmup inference (1 s of silence) so the first real request is fast
import numpy as np
audio = np.zeros(16000, dtype=np.float32)
segments, _ = model.transcribe(audio)
list(segments) # consume generator
print("Whisper warmup complete")
PYEOF
    else
        log "Whisper already downloaded, skipping."
    fi
}
|
|
||||||
|
|
||||||
# ── Piper TTS (ONNX Runtime — no TRT conversion needed) ──────────────────────
# Downloads the en_US-lessac-medium Piper voice (model + config JSON) into
# ${MODEL_DIR}/piper if not already present. Piper runs via ONNX Runtime.
setup_piper() {
    log "Setting up Piper TTS voice models..."
    local PIPER_DIR="${MODEL_DIR}/piper"
    mkdir -p "${PIPER_DIR}"

    # Download en_US-lessac-medium (natural, good quality, ~60ms on Orin)
    if [[ ! -f "${PIPER_DIR}/en_US-lessac-medium.onnx" ]]; then
        # NOTE(review): the python below hard-codes /models/piper while the
        # shell uses ${MODEL_DIR}/piper — these diverge if MODEL_DIR is overridden.
        python3 - <<'PYEOF'
import urllib.request, os

base = "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
piper_dir = "/models/piper"

# Fetch the voice model and its matching config; skip files already present.
for fname in ["en_US-lessac-medium.onnx", "en_US-lessac-medium.onnx.json"]:
    dest = os.path.join(piper_dir, fname)
    if not os.path.exists(dest):
        print(f"Downloading {fname}...")
        urllib.request.urlretrieve(base + fname, dest)
        print(f" → {dest}")
print("Piper voice ready.")
PYEOF
    else
        log "Piper voice already downloaded, skipping."
    fi
}
|
|
||||||
|
|
||||||
# ── Benchmark helper ───────────────────────────────────────────────────────────
# Runs a short trtexec latency benchmark of ENGINE ($1) at SHAPE ($2) and
# prints only the latency/throughput summary lines.
benchmark_engine() {
    local ENGINE="$1"
    local SHAPE="$2"
    log "Benchmarking ${ENGINE##*/}..."
    # Fixed: under `set -o pipefail`, grep exits 1 when no line matches, which
    # previously aborted the entire script on any unexpected trtexec output.
    # The trailing `|| true` makes a benchmark with no summary lines non-fatal.
    trtexec \
        --loadEngine="${ENGINE}" \
        --shapes="${SHAPE}" \
        --warmUp=3 \
        --avgRuns=20 \
        --useSpinWait \
        2>&1 | grep -E "mean|median|throughput|Latency" | head -5 || true
}
|
|
||||||
|
|
||||||
# ── Full benchmark suite ───────────────────────────────────────────────────────
# Prints platform/toolchain info, benchmarks every TRT engine found in
# ${ENGINE_DIR} with a per-model input shape, then times Whisper (CTranslate2)
# and Piper (ONNX Runtime) end-to-end in python.
run_benchmarks() {
    log "=== Social-bot model benchmark suite ==="
    # /proc/device-tree/model exists on Jetson/Tegra; fall back to a label.
    log "Platform: $(cat /proc/device-tree/model 2>/dev/null || echo 'Orin')"
    log "TensorRT: $(python3 -c 'import tensorrt as trt; print(trt.__version__)' 2>/dev/null)"
    log "CUDA: $(nvcc --version 2>/dev/null | grep release | awk '{print $6}' | tr -d ,)"
    echo

    # Map each engine file to its benchmark input shape by filename prefix;
    # engines with no known shape are skipped.
    for ENGINE in "${ENGINE_DIR}"/*.engine; do
        [[ -f "${ENGINE}" ]] || continue
        NAME="${ENGINE##*/}"
        case "${NAME}" in
            scrfd*) SHAPE="input.1:1x3x640x640" ;;
            arcface*)SHAPE="input.1:4x3x112x112" ;;
            ecapa*) SHAPE="waveform:1x16000" ;;
            *) SHAPE="" ;;
        esac
        [[ -z "${SHAPE}" ]] && continue
        echo "── ${NAME} ──"
        benchmark_engine "${ENGINE}" "${SHAPE}"
        echo
    done

    log "Whisper (CTranslate2 GPU):"
    # 3 warmup runs, then mean/min wall-clock over 10 transcriptions of 1 s
    # of random audio. transcribe() returns (segments, info); list() forces
    # the lazy segment generator so the full decode is actually timed.
    python3 - <<'PYEOF'
import time, numpy as np
from faster_whisper import WhisperModel
model = WhisperModel("small", device="cuda", compute_type="float16", download_root="/models")
audio = np.random.randn(16000).astype(np.float32)
# warmup
for _ in range(3):
    list(model.transcribe(audio)[0])
# benchmark
times = []
for _ in range(10):
    t = time.perf_counter()
    list(model.transcribe(audio)[0])
    times.append((time.perf_counter() - t) * 1000)
print(f" 1s audio: mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF

    log "Piper TTS (ONNX Runtime):"
    # Same 3-warmup / 10-run scheme; joining the raw stream forces full synthesis.
    python3 - <<'PYEOF'
import time
from piper import PiperVoice
voice = PiperVoice.load("/models/piper/en_US-lessac-medium.onnx")
text = "Hello, I am Salty, your personal robot companion."
# warmup
for _ in range(3):
    list(voice.synthesize_stream_raw(text))
# benchmark
times = []
for _ in range(10):
    t = time.perf_counter()
    audio_bytes = b"".join(voice.synthesize_stream_raw(text))
    times.append((time.perf_counter() - t) * 1000)
print(f" '{text[:30]}...': mean={sum(times)/len(times):.1f}ms, min={min(times):.1f}ms")
PYEOF
}
|
|
||||||
|
|
||||||
# ── tegrastats snapshot ────────────────────────────────────────────────────────
# Prints a ~2 s tegrastats sample plus GPU memory usage. nvidia-smi is not
# available on Jetson/Tegra, so it falls back to a pycuda query.
check_gpu_health() {
    log "GPU / system health check:"
    tegrastats --interval 1000 &
    TPID=$!
    sleep 2
    # Fixed: under `set -e` a failing kill (tegrastats already exited, or not
    # installed so the background job died immediately) aborted the whole
    # script — 2>/dev/null only silenced the message, not the exit status.
    kill ${TPID} 2>/dev/null || true
    wait ${TPID} 2>/dev/null || true   # reap the background job
    echo
    log "GPU memory:"
    nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu \
        --format=csv,noheader,nounits 2>/dev/null || \
    python3 -c "import pycuda.driver as drv; drv.init(); d=drv.Device(0); c=d.make_context(); print('Free:', drv.mem_get_info()[0]//1024//1024, 'MB'); c.pop()"
}
|
|
||||||
|
|
||||||
# ── Main ───────────────────────────────────────────────────────────────────────
# Dispatch on the single positional target. trtexec is required for every
# conversion path, so verify it up front.
check_trtexec

case "${TARGET}" in
    all)
        # Full pipeline: fetch sources, build every TRT engine, prime the
        # runtime-managed models, then health-check and benchmark.
        download_models
        convert_scrfd
        convert_arcface
        convert_ecapa
        setup_whisper
        setup_piper
        check_gpu_health
        run_benchmarks
        ;;
    scrfd)     download_models; convert_scrfd ;;
    arcface)   download_models; convert_arcface ;;
    ecapa)     download_models; convert_ecapa ;;
    whisper)   setup_whisper ;;
    piper)     setup_piper ;;
    benchmark) run_benchmarks ;;
    health)    check_gpu_health ;;
    download)  download_models ;;
    *)
        echo "Usage: $0 [all|scrfd|arcface|ecapa|whisper|piper|benchmark|health|download]"
        exit 1
        ;;
esac

log "Done."
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# install_systemd.sh — Install saltybot systemd services on Orin
# Run as root: sudo ./systemd/install_systemd.sh
#
# Copies the repo to /opt/saltybot/jetson, installs the saltybot systemd
# units, and enables them (does not start them — see the final log lines).
set -euo pipefail

# Resolve paths relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="$(dirname "${SCRIPT_DIR}")"
SYSTEMD_DIR="/etc/systemd/system"
DEPLOY_DIR="/opt/saltybot/jetson"

log() { echo "[install_systemd] $*"; }

# Installing into /etc/systemd and /opt requires root.
[[ "$(id -u)" == "0" ]] || { echo "Run as root"; exit 1; }

# Deploy repo to /opt/saltybot/jetson
# NOTE(review): rsync without --delete leaves stale files from previous
# deploys in place — confirm that is the intended behavior.
log "Deploying to ${DEPLOY_DIR}..."
mkdir -p "${DEPLOY_DIR}"
rsync -a --exclude='.git' --exclude='__pycache__' \
    "${REPO_DIR}/" "${DEPLOY_DIR}/"

# Install service files
log "Installing systemd units..."
cp "${SCRIPT_DIR}/saltybot.target" "${SYSTEMD_DIR}/"
cp "${SCRIPT_DIR}/saltybot-social.service" "${SYSTEMD_DIR}/"

# Reload and enable (enabled ≠ started; services start on next boot or
# explicit systemctl start)
systemctl daemon-reload
systemctl enable saltybot.target
systemctl enable saltybot-social.service

log "Services installed. Start with:"
log "  systemctl start saltybot-social"
log "  journalctl -fu saltybot-social"
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
# saltybot-social.service — wraps the saltybot-social docker compose service
# in a systemd unit so it starts on boot as part of saltybot.target.
[Unit]
Description=Saltybot Social-Bot Stack (speech + LLM + TTS + face recognition)
Documentation=https://gitea.vayrette.com/seb/saltylab-firmware
# sound.target: wait for ALSA so the USB mic/speaker passthrough is ready.
After=docker.service network-online.target sound.target
Requires=docker.service
# PartOf: stopping/restarting saltybot.target propagates to this unit.
PartOf=saltybot.target

[Service]
# Type=simple: `docker compose up` (foreground) is the main process.
Type=simple
Restart=on-failure
RestartSec=10s
# Generous start timeout — first start may pull images / load large models.
TimeoutStartSec=120s
TimeoutStopSec=30s

User=root
WorkingDirectory=/opt/saltybot/jetson

# Pull latest image before start (optional — comment out for air-gapped deploy)
# The leading "-" makes a failed pull non-fatal (e.g. offline).
ExecStartPre=-/usr/bin/docker compose -f docker-compose.yml pull saltybot-social

# Start only the social service (not the entire stack)
ExecStart=/usr/bin/docker compose -f docker-compose.yml up --no-recreate saltybot-social

ExecStop=/usr/bin/docker compose -f docker-compose.yml stop saltybot-social

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=saltybot-social

[Install]
WantedBy=saltybot.target multi-user.target
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
# saltybot.target — umbrella target grouping all saltybot services
# (units declare PartOf=saltybot.target to start/stop as a set).
[Unit]
Description=Saltybot Full Stack Target
Documentation=https://gitea.vayrette.com/seb/saltylab-firmware
After=docker.service network-online.target
Requires=docker.service

[Install]
WantedBy=multi-user.target
|
|
||||||
Loading…
x
Reference in New Issue
Block a user