ibm-granite
/

granitelib-rag-gpt-oss-r1.0

Model card Files Files and versions

granitelib-rag-gpt-oss-r1.0 / run_vllm.sh

frreiss's picture

import (#1)

e76b4fb 2 months ago

1.36 kB

	#! /bin/bash

	################################################################################
	# Shell script that starts a copy of vLLM with a base model plus all the
	# available LoRA adapters in this repository.
	#
	# To run this script:
	# 1. Install an appropriate build of vLLM for your machine (`pip install vllm`)
	# 2. Install the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`)
	# 3. Download the intrinsics library by running:
	# hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
	# 4. Edit the constants BASE_MODEL_NAME, BASE_MODEL_ORG, and PORT as needed
	# 5. Run this script from the root of your local copy of rag-intrinsics-lib.
	################################################################################

	# User-editable constants: the base model to serve and the port to listen on.
	BASE_MODEL_NAME=gpt-oss-20b
	BASE_MODEL_ORG=openai
	PORT=55555

	# Exported so that the vllm child process started below sees it.
	# NOTE(review): this is a fixed placeholder value; change it before exposing
	# the server beyond your own machine.
	export VLLM_API_KEY=rag_intrinsics_1234

	#######################################
	# Find all LoRA adapters for the target base model.
	# An adapter is any entry <name> of the current directory that contains the
	# directory ./<name>/${BASE_MODEL_NAME}/lora.
	# Globals:   BASE_MODEL_NAME (read)
	#            LORA_MODULES (written): array of "<name>=<path>" entries, one
	#            array element per adapter so that names containing whitespace
	#            are never word-split.
	# Arguments: none
	# Returns:   0 always; LORA_MODULES is empty when nothing was found.
	#######################################
	find_lora_modules() {
	  local item name
	  LORA_MODULES=()
	  for item in ./*; do
	    # Remove the "./"
	    name=${item##*/}
	    if [[ -d "./${name}/${BASE_MODEL_NAME}/lora" ]]; then
	      LORA_MODULES+=("${name}=./${name}/${BASE_MODEL_NAME}/lora")
	    fi
	  done
	}

	#######################################
	# Build the vllm command line, print it, and run it.
	# Globals:   BASE_MODEL_NAME, BASE_MODEL_ORG, PORT (read)
	#            LORA_MODULES (written via find_lora_modules)
	# Outputs:   the shell-quoted command on stdout; diagnostics on stderr.
	# Returns:   1 if no adapter was found, 127 if vllm is not on PATH,
	#            otherwise the exit status of vllm.
	#######################################
	main() {
	  local -a cmd

	  find_lora_modules

	  # --lora-modules needs at least one value; fail with a clear message
	  # instead of handing vllm an incomplete command line.
	  if (( ${#LORA_MODULES[@]} == 0 )); then
	    printf 'error: no ./<adapter>/%s/lora directories found in %s\n' \
	      "${BASE_MODEL_NAME}" "${PWD}" >&2
	    printf 'Run this script from the root of your local copy of the intrinsics library.\n' >&2
	    return 1
	  fi

	  if ! command -v vllm >/dev/null 2>&1; then
	    printf 'error: vllm not found on PATH (try: pip install vllm)\n' >&2
	    return 127
	  fi

	  # The command is kept in an array so every argument reaches vllm exactly
	  # as written, regardless of spaces or glob characters in adapter names.
	  cmd=(
	    vllm serve "${BASE_MODEL_ORG}/${BASE_MODEL_NAME}"
	    --port "${PORT}"
	    --gpu-memory-utilization 0.45
	    --max-model-len 8192
	    --enable-lora
	    --max_lora_rank 64
	    --lora-modules "${LORA_MODULES[@]}"
	  )

	  # Log the command in a form that can be pasted back into a shell.
	  printf '%q ' "${cmd[@]}"
	  printf '\n'

	  "${cmd[@]}"
	}

	# Last command of the script: its status becomes the script's exit status.
	main "$@"