#!/bin/bash
# ARM64 + CUDA setup for NVIDIA DGX Spark

# Strict mode:
#   -e           abort on the first command that fails
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any stage fails, not just the last one
set -euo pipefail

echo "Setting up ARM64 environment..."

# 1. PyTorch stack (ARM64-compatible, CUDA 12.1 builds)
#    Packages are resolved against the PyTorch cu121 wheel index instead of
#    the default package index.
echo "[1/4] Installing PyTorch..."
pytorch_index="https://download.pytorch.org/whl/cu121"
pytorch_pkgs=(
  torch==2.3.0+cu121
  torchvision==0.18.0+cu121
  torchaudio==2.3.0
)
pip install "${pytorch_pkgs[@]}" --index-url "$pytorch_index"

# 2. BitsAndBytes, compiled from source (no ARM64 wheels)
#    The build toolchain comes from apt first; --no-binary then makes pip use
#    the source distribution rather than a prebuilt wheel.
echo "[2/4] Building BitsAndBytes..."
bnb_build_deps=(build-essential cmake libopenblas-dev)
sudo apt-get update
sudo apt-get install -y "${bnb_build_deps[@]}"
pip install --no-binary bitsandbytes bitsandbytes==0.43.0

# 3. Install Transformers stack (all versions pinned for reproducibility)
echo "[3/4] Installing dependencies..."
# scikit-learn is pinned to 1.4.1.post1 rather than 1.4.1: an exact "==1.4.1"
# specifier does not match a post-release, so the install fails if only the
# .post1 build is published for that patch level.
# NOTE(review): PyPI appears to list 1.4.1.post1 but no plain 1.4.1 — confirm
# against the scikit-learn release history.
pip install transformers==4.40.0 \
  datasets==2.18.0 \
  peft==0.10.0 \
  accelerate==0.28.0 \
  sentencepiece==0.2.0 \
  scikit-learn==1.4.1.post1

# 4. Configure vLLM for ARM64
#    The two settings are exported for the rest of this script and persisted
#    to ~/.bashrc so future login shells pick them up.
echo "[4/4] Configuring vLLM..."
export VLLM_USE_TRITON_FLASH_ATTN=0
export VLLM_ATTENTION_BACKEND=TORCH_SDPA

# Append each export only if that exact line is not already present, so
# re-running this script does not pile up duplicate entries in ~/.bashrc.
# grep's stderr is silenced because ~/.bashrc may not exist yet; in that case
# the redirection below creates it.
for vllm_setting in \
  'export VLLM_USE_TRITON_FLASH_ATTN=0' \
  'export VLLM_ATTENTION_BACKEND=TORCH_SDPA'; do
  if ! grep -qxF -- "$vllm_setting" ~/.bashrc 2>/dev/null; then
    printf '%s\n' "$vllm_setting" >> ~/.bashrc
  fi
done

# NOTE(review): vLLM releases typically pin their own torch version; this
# install may replace the torch build installed in step 1 — confirm with
# `pip check` / `pip list` after the run.
pip install vllm==0.4.0.post1

# Verify: import the freshly installed packages and report their versions.
# The CUDA line prints whatever torch.cuda.is_available() returns; it is
# informational only and does not abort the script when no GPU is visible.
torch_probe="import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.cuda.is_available()}')"
bnb_probe="import bitsandbytes; print(f'BitsAndBytes: {bitsandbytes.__version__}')"
python -c "$torch_probe"
python -c "$bnb_probe"

# Blank separator line followed by the completion banner.
printf '\n%s\n' "✓ ARM64 setup complete!"