| """Custom Transformer module for jina-reranker-m0 that fixes image ordering for image-image pairs. | |
| The Qwen2VL processor extracts images from messages in iteration order. ST creates messages | |
| as [query_msg, doc_msg], but the chat template renders doc-first. For single-image pairs this | |
| is fine, but for image-image pairs the two images get swapped. This module swaps the pair | |
| elements so the processor extracts images in doc-first order, matching the template rendering. | |
| Since both elements render as identical <|image_pad|> tokens, the role swap is invisible. | |
| """ | |
| from __future__ import annotations | |
| from typing import Any | |
| from PIL import Image | |
| from sentence_transformers.base.modality import is_image_url_or_path | |
| from sentence_transformers.base.modules.transformer import Transformer | |
def _is_image(item: Any) -> bool:
    """Tell whether *item* should be treated as an image input.

    An item counts as an image when it is a ``PIL.Image.Image`` instance, or
    when it is a string for which ``is_image_url_or_path`` reports a match.
    Any other value is not an image.
    """
    if isinstance(item, Image.Image):
        return True
    # Only strings are handed to the URL/path check; everything else is rejected.
    return isinstance(item, str) and is_image_url_or_path(item)
class JinaRerankerTransformer(Transformer):
    """Transformer variant that reorders image-image pairs before preprocessing.

    The parent ``preprocess`` receives each image-image pair with its two
    elements exchanged, so that images are extracted document-first, in the
    same order the chat template renders them. All other inputs are forwarded
    unchanged.
    """

    def preprocess(
        self,
        inputs: list,
        prompt: str | None = None,
        **kwargs,
    ) -> dict[str, Any]:
        """Reverse image-image pairs, then delegate to the parent ``preprocess``.

        Args:
            inputs: Items to preprocess. A two-element list or tuple whose
                elements are both images (per ``_is_image``) is replaced by a
                tuple with the elements in reverse order; every other item is
                passed through as-is.
            prompt: Forwarded unchanged to the parent implementation.
            **kwargs: Forwarded unchanged to the parent implementation.

        Returns:
            Whatever the parent ``preprocess`` returns for the reordered inputs.
        """

        def _doc_first(entry):
            # Only exact pairs (list or tuple of length 2) are candidates.
            is_pair = isinstance(entry, (list, tuple)) and len(entry) == 2
            if is_pair and _is_image(entry[0]) and _is_image(entry[1]):
                query_image, doc_image = entry
                # Always emit a tuple, even when the pair arrived as a list.
                return (doc_image, query_image)
            return entry

        reordered = [_doc_first(entry) for entry in inputs]
        return super().preprocess(reordered, prompt=prompt, **kwargs)