SeanLee97 committed on
Commit
3ea9d4d
·
1 Parent(s): 7c3c865

Integrate with Sentence Transformers v5.4 (#9)

Browse files

- Integrate with Sentence Transformers v5.4.0 (062adb650ef980633805fa8f6fbc9481e6559fef)
- Add missing </b> (6cb650209ce3f8a3d65162af17b3f1084e15d18e)
- Rename CausalScoreHead to LogitScore (462e1b8c1348562525b2096966ee58151e5b352a)
- Move 'message_format' into 'modality_config' under 'message' -> 'format' (2f26065a0f46dd5ea2809cb2f93cedb359d77093)
- Add missing README changes (6f9c43608d5a9063a85df115b3e40398ce5f6a5e)
- Merge branch 'main' of https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v2 (3472e9d1489bb31112962bf8faa59faa855081d6)

1_LogitScore/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "true_token_id": 16,
3
+ "false_token_id": 15
4
+ }
README.md CHANGED
@@ -121,6 +121,8 @@ language_bcp47:
121
  - zh-Hant
122
  license: apache-2.0
123
  pipeline_tag: text-ranking
 
 
124
  ---
125
 
126
  <br><br>
@@ -134,7 +136,7 @@ pipeline_tag: text-ranking
134
  </p>
135
 
136
  <p align="center">
137
- <sup> 🍞 Looking for a simple end-to-end retrieval solution? Meet Omni, our multimodal and multilingual model. <a href="https://mixedbread.com"><b>Get in touch for access.</a> </sup>
138
  </p>
139
 
140
  # 🍞 mxbai-rerank-base-v2 (a.k.a ProRank-0.5B)
@@ -160,6 +162,40 @@ We have two models:
160
 
161
  ## ⚙️ Usage
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  1. Install mxbai-rerank
164
 
165
  ```bash
 
121
  - zh-Hant
122
  license: apache-2.0
123
  pipeline_tag: text-ranking
124
+ tags:
125
+ - sentence-transformers
126
  ---
127
 
128
  <br><br>
 
136
  </p>
137
 
138
  <p align="center">
139
+ <sup> 🍞 Looking for a simple end-to-end retrieval solution? Meet Omni, our multimodal and multilingual model. <a href="https://mixedbread.com"><b>Get in touch for access.</b></a> </sup>
140
  </p>
141
 
142
  # 🍞 mxbai-rerank-base-v2 (a.k.a ProRank-0.5B)
 
162
 
163
  ## ⚙️ Usage
164
 
165
+ ### Using Sentence Transformers
166
+
167
+ Install Sentence Transformers:
168
+ ```bash
169
+ pip install sentence_transformers
170
+ ```
171
+
172
+ ```python
173
+ from sentence_transformers import CrossEncoder
174
+
175
+ model = CrossEncoder("mixedbread-ai/mxbai-rerank-base-v2")
176
+
177
+ query = "Who wrote 'To Kill a Mockingbird'?"
178
+ documents = [
179
+ "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.",
180
+ "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.",
181
+ "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.",
182
+ "Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.",
183
+ "The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.",
184
+ "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan.",
185
+ ]
186
+
187
+ pairs = [(query, doc) for doc in documents]
188
+ scores = model.predict(pairs)
189
+ print(scores)
190
+ # [9.750735 1.3697281 8.080784 2.6611633 0.8729458 0.39884853]
191
+
192
+ rankings = model.rank(query, documents)
193
+ print(rankings)
194
+ # [{'corpus_id': 0, 'score': np.float32(9.750735)}, {'corpus_id': 2, 'score': np.float32(8.080784)}, {'corpus_id': 3, 'score': np.float32(2.6611633)}, {'corpus_id': 1, 'score': np.float32(1.3697281)}, {'corpus_id': 4, 'score': np.float32(0.8729458)}, {'corpus_id': 5, 'score': np.float32(0.39884853)}]
195
+ ```
196
+
197
+ ### Using mxbai-rerank
198
+
199
  1. Install mxbai-rerank
200
 
201
  ```bash
chat_template.jinja ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <|im_start|>system
2
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
3
+ <|im_start|>user
4
+ query: {{ (messages | selectattr("role", "eq", "query") | first).content }}
5
+ document: {{ (messages | selectattr("role", "eq", "document") | first).content }}
6
+ You are a search relevance expert who evaluates how well documents match search queries. For each query-document pair, carefully analyze the semantic relationship between them, then provide your binary relevance judgment (0 for not relevant, 1 for relevant).
7
+ Relevance:<|im_end|>
8
+ <|im_start|>assistant
9
+
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "CrossEncoder",
3
+ "__version__": {
4
+ "sentence_transformers": "5.4.0",
5
+ "transformers": "4.52.0",
6
+ "pytorch": "2.7.0+cu126"
7
+ },
8
+ "prompts": {},
9
+ "default_prompt_name": null
10
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.transformer.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_LogitScore",
12
+ "type": "sentence_transformers.cross_encoder.modules.logit_score.LogitScore"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "text-generation",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ },
8
+ "message": {
9
+ "method": "forward",
10
+ "method_output_name": "logits",
11
+ "format": "flat"
12
+ }
13
+ },
14
+ "module_output_name": "causal_logits"
15
+ }
tokenizer_config.json CHANGED
@@ -195,7 +195,6 @@
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
198
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
  "clean_up_tokenization_spaces": false,
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
 
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
 
198
  "clean_up_tokenization_spaces": false,
199
  "eos_token": "<|im_end|>",
200
  "errors": "replace",