---
base_model:
- BAAI/bge-m3
---

```python
import numpy as np
from typing import List, Union, Dict
from collections import defaultdict
from transformers import AutoTokenizer
from scipy.sparse import vstack, csr_matrix
import onnxruntime as ort


class ONNXInferenceModel:
    def __init__(
        self,
        model_path: str,
        tokenizer_path: str,
        max_length: int = 8192,
        use_fp16: bool = True,
        device: str = "cuda",
    ):
        self.max_length = max_length
        self.use_fp16 = use_fp16
        self.device = device

        providers = [
            (
                "CUDAExecutionProvider",
                {
                    "device_id": 0,
                    "arena_extend_strategy": "kSameAsRequested",
                    "gpu_mem_limit": 5 * 1024 * 1024 * 1024,
                    "cudnn_conv_algo_search": "EXHAUSTIVE",
                    "do_copy_in_default_stream": True,
                },
            )
        ]

        so = ort.SessionOptions()
        so.enable_mem_pattern = True
        so.enable_mem_reuse = True
        so.add_session_config_entry("memory.enable_memory_arena_shrinkage", "cpu:0;gpu:0")
        so.add_session_config_entry("session.use_device_allocator_for_initializers", "1")
        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        self.session = ort.InferenceSession(model_path, providers=providers, sess_options=so)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

    def _process_token_weights(self, token_weights: np.ndarray, input_ids: list) -> csr_matrix:
        # Collapse per-token weights into one weight per vocabulary id,
        # keeping the maximum weight when a token occurs more than once.
        result = defaultdict(float)
        unused_tokens = {
            self.tokenizer.cls_token_id,
            self.tokenizer.eos_token_id,
            self.tokenizer.pad_token_id,
            self.tokenizer.unk_token_id,
        }
        for w, idx in zip(token_weights, input_ids):
            if idx not in unused_tokens and w > 0:
                result[idx] = max(result[idx], float(w))
        indices = list(result.keys())
        data = list(result.values())
        return csr_matrix(
            (data, ([0] * len(indices), indices)),
            shape=(1, self.tokenizer.vocab_size),
            dtype=np.float64,
        )

    def encode(
        self,
        sentences: Union[List[str], str],
        batch_size: int = 12,
        return_dense: bool = True,
        return_sparse: bool = False,
    ) -> Dict[str, Union[List[np.ndarray], csr_matrix]]:
        if isinstance(sentences, str):
            sentences = [sentences]

        dense_embeddings = []
        sparse_embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i : i + batch_size]
            # Tokenize straight to NumPy: ONNX Runtime consumes host arrays,
            # so there is no need to round-trip through torch tensors.
            inputs = self.tokenizer(
                batch,
                padding="longest",
                truncation=True,
                max_length=self.max_length,
                return_tensors="np",
            )
            ort_inputs = dict(inputs)
            ort_outputs = self.session.run(None, ort_inputs)

            if return_dense:
                batch_dense = ort_outputs[0]
                if self.use_fp16:
                    batch_dense = batch_dense.astype(np.float16)
                dense_embeddings.extend(batch_dense)

            if return_sparse:
                sparse_vecs = ort_outputs[1]
                for j, input_ids in enumerate(inputs["input_ids"]):
                    sparse_embeddings.append(
                        self._process_token_weights(sparse_vecs[j], input_ids.tolist())
                    )

        result = {}
        if return_dense:
            result["dense"] = dense_embeddings
        if return_sparse:
            result["sparse"] = vstack(sparse_embeddings)
        return result


model_path = "bge-m3.onnx"      # path to the exported ONNX model file
tokenizer_path = "BAAI/bge-m3"  # tokenizer matching the exported model

onnx_model = ONNXInferenceModel(model_path=model_path, tokenizer_path=tokenizer_path, use_fp16=True)

sentences = ["Hi"]
embeddings = onnx_model.encode(sentences, return_dense=True, return_sparse=True)
print(embeddings)
# {'dense': [array([-0.0251, 0.03464, -0.04285, ..., -0.02548, 0.004963, -0.034], dtype=float16)],
#  'sparse': <1x250002 sparse matrix of type '<class 'numpy.float64'>'
#             with 7 stored elements in Compressed Sparse Row format>}
```
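The sparse output is a CSR row over the tokenizer's vocabulary, which is hard to inspect directly. A minimal sketch for decoding one row back into readable token weights via the model's tokenizer; the helper name `sparse_to_tokens` is ours, not part of any library:

```python
from scipy.sparse import csr_matrix

def sparse_to_tokens(row: csr_matrix, tokenizer) -> dict:
    """Map the nonzero entries of one CSR row back to token strings."""
    ids = row.indices.tolist()
    weights = row.data.tolist()
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # Sort by descending weight for readability.
    return dict(sorted(zip(tokens, weights), key=lambda kv: -kv[1]))

# Decode the sparse row for the first sentence encoded above;
# the printed weights depend on the exported model.
row = embeddings["sparse"].getrow(0)
print(sparse_to_tokens(row, onnx_model.tokenizer))
```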
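The two outputs can also be combined into a hybrid relevance score in the spirit of the BGE-M3 recipe: cosine similarity on the dense vectors plus a dot product of the sparse lexical weights. A sketch under stated assumptions, reusing the `onnx_model` instance from above; the query/passage strings and the 0.4 weighting of the sparse score are illustrative, not tuned values:

```python
import numpy as np

# Hypothetical query/passage pair for illustration.
query = "What is BGE-M3?"
passage = "BGE-M3 is a multilingual embedding model supporting dense and sparse retrieval."

q = onnx_model.encode(query, return_dense=True, return_sparse=True)
p = onnx_model.encode(passage, return_dense=True, return_sparse=True)

# Dense score: cosine similarity between the two sentence embeddings.
q_dense = np.asarray(q["dense"][0], dtype=np.float32)
p_dense = np.asarray(p["dense"][0], dtype=np.float32)
dense_score = float(q_dense @ p_dense / (np.linalg.norm(q_dense) * np.linalg.norm(p_dense)))

# Sparse score: dot product of the lexical weight vectors; both CSR rows
# share the tokenizer vocabulary as their index space.
sparse_score = float((q["sparse"] @ p["sparse"].T).toarray()[0, 0])

hybrid_score = dense_score + 0.4 * sparse_score
print(dense_score, sparse_score, hybrid_score)
```

Because both sparse rows live in the same vocabulary-sized index space, the dot product only rewards tokens that appear in both texts, so it behaves like a lexical-overlap signal alongside the dense semantic score.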