baidu
/

ERNIE-4.5-VL-28B-A3B-Base-PT

@@ -17,7 +17,6 @@
 import copy
 import io
 import os
-import re
 import math
 import random
 import requests
@@ -28,14 +27,13 @@ import threading
 import uuid
 import decord
 from shutil import copyfile
-from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
-from typing import Any, Dict, List, Union
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
@@ -52,7 +50,6 @@ from transformers.tokenization_utils_base import (
     PaddingStrategy,
     TextInput,
 )
-from transformers.utils import logging
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
@@ -82,132 +79,419 @@ from transformers.image_utils import (
 logger = logging.get_logger(__name__)
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = 28,
-    min_pixels: int = 4 * 28 * 28,
-    max_pixels: int = 16384 * 28 * 28,
-):
-    """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    MAX_RATIO = 200
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        if height > width:
-            new_width = max(factor, round_by_factor(width, factor))
-            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
-        else:
-            new_height = max(factor, round_by_factor(height, factor))
-            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
-        logger.info(
-            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
-              resize to {max(new_height, new_width) / min(new_height, new_width)}"
-        )
-        height = new_height
-        width = new_width
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-    return h_bar, w_bar
-def is_scaled_image(image: np.ndarray) -> bool:
-    """
-    Checks to see whether the pixel values have already been rescaled to [0, 1].
-    """
-    if image.dtype == np.uint8:
-        return False
-    # It's possible the image has pixel values in [0, 255] but is of floating type
-    return np.min(image) >= 0 and np.max(image) <= 1
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-    Returns:
-        list: A list of images.
-    """
-    if (
-        isinstance(images, (list, tuple))
-        and isinstance(images[0], (list, tuple))
-        and is_valid_image(images[0][0])
-    ):
-        return [img for img_list in images for img in img_list]
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-    elif is_valid_image(images):
-        return [images]
-    raise ValueError(f"Could not make batched images from {images}")
-# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
-def make_batched_videos(videos) -> List[VideoInput]:
-    """dummy"""
-    if (
-        isinstance(videos, (list, tuple))
-        and isinstance(videos[0], (list, tuple))
-        and is_valid_image(videos[0][0])
-    ):
-        return videos
-    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
-        if isinstance(videos[0], Image.Image):
-            return [videos]
-        elif len(videos[0].shape) == 4:
-            return [list(video) for video in videos]
-    elif is_valid_image(videos) and len(videos.shape) == 4:
-        return [list(videos)]
     raise ValueError(f"Could not make batched video from {videos}")
-class Ernie_45T_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a adaptive image processor that dynamically resizes images based on the original images.
@@ -289,7 +573,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
@@ -298,7 +582,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
@@ -618,298 +902,15 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)
-class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
-    """
-    Ernie4_5_VLTokenizer
-    """
-    vocab_files_names = {
-        "vocab_file": "tokenizer.model",
-    }
-    # Model input names expected by the tokenizer
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
-    # Padding side (where to add padding tokens)
-    padding_side = "right"
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        cls_token="<cls>",
-        eos_token="</s>",
-        mask_token="<mask:0>",
-        pad_token="<pad>",
-        sep_token="<sep>",
-        unk_token="<unk>",
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        """
-        Initialize the Ernie4_5_VLTokenizer
-        Args:
-            vocab_file (str): Path to the tokenizer vocabulary model.
-            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
-            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
-            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
-            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
-            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
-            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
-            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
-            additional_special_tokens (List[str], optional): Additional special tokens to use.
-                Defaults to `["<mask:1>", "<mask:7>"]`.
-            **kwargs (dict): Additional keyword arguments passed along to the superclass.
-        """
-        # Store vocabulary file path
-        self.vocab_file = vocab_file
-        # Initialize SentencePiece processor
-        self.sp_model = spm.SentencePieceProcessor()
-        # Load the vocabulary model
-        self.sp_model.Load(vocab_file)
-        # Set default additional special tokens if none provided
-        if additional_special_tokens is None:
-            additional_special_tokens = ["<mask:1>", "<mask:7>"]
-        super().__init__(
-            bos_token=bos_token,
-            cls_token=cls_token,
-            eos_token=eos_token,
-            mask_token=mask_token,
-            pad_token=pad_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-    @property
-    def space_token(self):
-        """Return the space token"""
-        return "<mask:1>"
-    @property
-    def space_token_id(self):
-        """Return the ID of the space token"""
-        return self.sp_model.piece_to_id("<mask:1>")
-    @property
-    def gend_token(self):
-        """Return the gender token"""
-        return "<mask:7>"
-    @property
-    def gend_token_id(self):
-        """Return the ID of the gender token"""
-        return self.sp_model.piece_to_id("<mask:7>")
-    @property
-    def im_start_id(self):
-        """Return the ID of the image start token"""
-        return self.sp_model.piece_to_id("<|im_start|>")
-    @property
-    def im_end_id(self):
-        """Return the ID of the image end token"""
-        return self.sp_model.piece_to_id("<|im_end|>")
-    @property
-    def vocab_size(self):
-        """Return the size of the vocabulary"""
-        return self.sp_model.vocab_size()
-    def get_vocab(self):
-        """Return the vocabulary as a dictionary mapping tokens to IDs"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-    def _tokenize(self, text):
-        """Tokenize the input text into pieces"""
-        return self.sp_model.encode_as_pieces(text)
-    def _convert_token_to_id(self, token):
-        """Convert a token to its corresponding ID"""
-        return self.sp_model.piece_to_id(token)
-    def _convert_id_to_token(self, id):
-        """Convert an ID to its corresponding token"""
-        return self.sp_model.id_to_piece(id)
-    def convert_tokens_to_string(self, tokens):
-        """Convert a sequence of tokens back to a string"""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # Handle special tokens differently
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        # Add any remaining sub-tokens
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-    def prepare_for_model(self, *args, **kwargs):
-        """Prepare the tokenized inputs for the model"""
-        # Remove add_special_tokens if present (not supported)
-        if "add_special_tokens" in kwargs:
-            kwargs.pop("add_special_tokens")
-        return super().prepare_for_model(*args, **kwargs)
-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-        Args:
-            save_directory (`str`): The directory to save the vocabulary to
-            filename_prefix (`str`, optional): Prefix to add to the filename
-        Returns:
-            `Tuple(str)`: Paths to the saved files
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        # Construct output vocabulary file path
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + self.vocab_files_names["vocab_file"],
-        )
-        # Copy or create vocabulary file
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-        return (out_vocab_file,)
-    def _decode(self, *args, **kwargs):
-        """Decode token_id back to text"""
-        # Remove some parameters that aren't used
-        kwargs.pop("clean_up_tokenization_spaces", None)
-        kwargs.pop("spaces_between_special_tokens", None)
-        # Call parent decode method with specific parameters
-        return super()._decode(
-            *args,
-            **kwargs,
-            clean_up_tokenization_spaces=False,
-            spaces_between_special_tokens=False,
-        )
-    def _pad(
-        self,
-        encoded_inputs: Dict,
-        max_length: Optional[int] = None,
-        padding_strategy=PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """Pad the encoded inputs to the specified length"""
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-        if return_attention_mask:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            if padding_strategy == PaddingStrategy.LONGEST:
-                max_length = len(required_input)
-            # Adjust max_length if needed for multiple of padding
-            if (
-                max_length is not None
-                and pad_to_multiple_of is not None
-                and (max_length % pad_to_multiple_of != 0)
-            ):
-                max_length = (
-                    (max_length // pad_to_multiple_of) + 1
-                ) * pad_to_multiple_of
-            # Check if padding is needed
-            needs_to_be_padded = (
-                padding_strategy != PaddingStrategy.DO_NOT_PAD
-                and len(required_input) != max_length
-            )
-            # Handle attention mask if present
-            if (
-                "attention_mask" in encoded_inputs
-                and encoded_inputs["attention_mask"] is not None
-            ):
-                attention_mask = encoded_inputs.pop("attention_mask")
-                if isinstance(attention_mask, torch.Tensor):
-                    attention_mask = attention_mask.numpy()
-                elif isinstance(attention_mask, list):
-                    attention_mask = np.array(attention_mask)
-                elif not isinstance(attention_mask, np.ndarray):
-                    raise ValueError(
-                        f"Unexpected type {type(attention_mask)} of attention_mask, "
-                    )
-            else:
-                # Create default attention mask if none provided
-                attention_mask = np.tril(
-                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
-                )
-                attention_mask = np.expand_dims(attention_mask, axis=0)
-            # Perform padding if needed
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if self.padding_side == "right":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(0, difference)]
-                    else:
-                        pad_width = [(0, 0), (0, difference), (0, difference)]
-                elif self.padding_side == "left":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(difference, 0)]
-                    else:
-                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
-                else:
-                    raise ValueError(
-                        "Invalid padding strategy:" + str(self.padding_side)
-                    )
-                attention_mask = np.pad(
-                    attention_mask,
-                    pad_width=pad_width,
-                    mode="constant",
-                    constant_values=0,
-                )
-        # Call parent padding method
-        encoded_inputs = super()._pad(
-            encoded_inputs,
-            max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=False,
-        )
-        # Add attention mask back if needed
-        if return_attention_mask:
-            encoded_inputs["attention_mask"] = attention_mask.tolist()
-        return encoded_inputs
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
 def is_gif(data: bytes) -> bool:
@@ -1380,7 +1381,7 @@ def render_frame_timestamp(frame, timestamp, font_rate=0.1):
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
-class Ernie_45T_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
@@ -1527,11 +1528,11 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
     def __call__(
         self,
-        text: List[str],
-        images: List[Image.Image],
-        videos: List[List[Image.Image]],
         **kwargs,
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
@@ -1547,6 +1548,9 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
             "pic_cnt": 0,
             "video_cnt": 0,
         }
         texts = text[0]
         new_video_seg = True
@@ -1811,4 +1815,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
         return list(tokenizer_input_names) + list(image_processor_input_names)
-__all__ = ["Ernie_45T_VLImageProcessor", "Ernie4_5_VLTokenizer", "Ernie_45T_VLProcessor"]

 import copy
 import io
 import os
 import math
 import random
 import requests
 import uuid
 import decord
 from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
     PaddingStrategy,
     TextInput,
 )
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
 logger = logging.get_logger(__name__)
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """
+    SentencePiece-backed tokenizer for ERNIE 4.5 VL.
+    The vocabulary is loaded from a `tokenizer.model` file. `_pad` is overridden so that
+    attention masks which are not 1-D (e.g. the default `(1, seq, seq)` lower-triangular
+    mask built here) are padded along both of their trailing axes.
+    """
+    vocab_files_names = {
+        "vocab_file": "tokenizer.model",
+    }
+    # Model input names expected by the tokenizer
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    # Padding side (where to add padding tokens)
+    padding_side = "right"
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<cls>",
+        eos_token="</s>",
+        mask_token="<mask:0>",
+        pad_token="<pad>",
+        sep_token="<sep>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        """
+        Initialize the Ernie4_5_VLTokenizer
+        Args:
+            vocab_file (str): Path to the tokenizer vocabulary model.
+            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
+            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
+            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
+            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
+            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
+            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
+            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
+            additional_special_tokens (List[str], optional): Additional special tokens to use.
+                Defaults to `["<mask:1>", "<mask:7>"]`.
+            **kwargs (dict): Additional keyword arguments passed along to the superclass.
+        """
+        # The SentencePiece model is loaded before super().__init__() runs, so it is
+        # already available to any base-class code that queries the vocabulary.
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+        # Set default additional special tokens if none provided
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+    @property
+    def space_token(self):
+        """Return the space token (`<mask:1>`)"""
+        return "<mask:1>"
+    @property
+    def space_token_id(self):
+        """Return the SentencePiece ID of the space token"""
+        return self.sp_model.piece_to_id("<mask:1>")
+    @property
+    def gend_token(self):
+        """Return the `gend` token (`<mask:7>`)"""
+        return "<mask:7>"
+    @property
+    def gend_token_id(self):
+        """Return the SentencePiece ID of the `gend` token"""
+        return self.sp_model.piece_to_id("<mask:7>")
+    @property
+    def im_start_id(self):
+        """Return the SentencePiece ID of `<|im_start|>`"""
+        return self.sp_model.piece_to_id("<|im_start|>")
+    @property
+    def im_end_id(self):
+        """Return the SentencePiece ID of `<|im_end|>`"""
+        return self.sp_model.piece_to_id("<|im_end|>")
+    @property
+    def vocab_size(self):
+        """Return the size of the SentencePiece vocabulary"""
+        return self.sp_model.vocab_size()
+    def get_vocab(self):
+        """Return the vocabulary as a dictionary mapping tokens to IDs, including added tokens"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Tokenize the input text into SentencePiece pieces"""
+        return self.sp_model.encode_as_pieces(text)
+    def _convert_token_to_id(self, token):
+        """Convert a token to its corresponding ID"""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, id):
+        """Convert an ID to its corresponding token"""
+        return self.sp_model.id_to_piece(id)
+    def convert_tokens_to_string(self, tokens):
+        """
+        Convert a sequence of tokens back to a string.
+        Runs of ordinary pieces are decoded by SentencePiece; special tokens are
+        emitted verbatim between those runs.
+        """
+        # Snapshot the special tokens once instead of re-reading the property per token.
+        special_tokens = set(self.all_special_tokens)
+        pieces = []
+        pending = []
+        for token in tokens:
+            if token in special_tokens:
+                # Flush the pieces collected so far, then keep the special token as-is.
+                pieces.append(self.sp_model.decode(pending))
+                pieces.append(token)
+                pending = []
+            else:
+                pending.append(token)
+        # Decode whatever follows the last special token.
+        pieces.append(self.sp_model.decode(pending))
+        return "".join(pieces)
+    def prepare_for_model(self, *args, **kwargs):
+        """Prepare the tokenized inputs for the model, ignoring `add_special_tokens`"""
+        # add_special_tokens is not supported here, so it is dropped before delegating.
+        kwargs.pop("add_special_tokens", None)
+        return super().prepare_for_model(*args, **kwargs)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`): The directory to save the vocabulary to
+            filename_prefix (`str`, optional): Prefix to add to the filename
+        Returns:
+            `Tuple(str)`: Paths to the saved files. `None` is returned (after logging an
+            error) when `save_directory` is not an existing directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        # Construct output vocabulary file path
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
+        # Copy the original model file when it still exists; otherwise re-serialize
+        # the in-memory SentencePiece model.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def _decode(self, *args, **kwargs):
+        """Decode token ids back to text, always with cleanup and special-token spacing disabled"""
+        # Caller-supplied values for these two options are discarded on purpose.
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+    def _pad(
+        self,
+        encoded_inputs: Dict,
+        max_length: Optional[int] = None,
+        padding_strategy=PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> dict:
+        """
+        Pad the encoded inputs to the specified length.
+        The attention mask is padded here (supporting masks that are not 1-D), while all
+        other fields are padded by the parent implementation.
+        Args:
+            encoded_inputs (Dict): Encoded fields for a single example.
+            max_length (int, optional): Target length after padding.
+            padding_strategy: A `PaddingStrategy` value; `DO_NOT_PAD` disables padding.
+            pad_to_multiple_of (int, optional): Round `max_length` up to a multiple of this value.
+            return_attention_mask (bool, optional): Whether to return an attention mask.
+                Defaults to whether `"attention_mask"` is listed in `model_input_names`.
+            **kwargs: Only `padding_side` is honoured; when given and not `None` it
+                overrides `self.padding_side` for this call.
+        Returns:
+            dict: The padded inputs, with `attention_mask` as nested lists when requested.
+        Raises:
+            ValueError: If the attention mask has an unsupported type or the padding
+                side is neither `"right"` nor `"left"`.
+        """
+        # A caller may pass `padding_side` per call (presumably newer transformers
+        # releases do). Use it so the attention mask is padded on the same side as
+        # the fields handled by the parent class.
+        side_override = kwargs.get("padding_side")
+        padding_side = self.padding_side if side_override is None else side_override
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        if return_attention_mask:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            if padding_strategy == PaddingStrategy.LONGEST:
+                max_length = len(required_input)
+            # Adjust max_length if needed for multiple of padding
+            if (
+                max_length is not None
+                and pad_to_multiple_of is not None
+                and (max_length % pad_to_multiple_of != 0)
+            ):
+                max_length = (
+                    (max_length // pad_to_multiple_of) + 1
+                ) * pad_to_multiple_of
+            # Check if padding is needed
+            needs_to_be_padded = (
+                padding_strategy != PaddingStrategy.DO_NOT_PAD
+                and len(required_input) != max_length
+            )
+            # Handle attention mask if present
+            if (
+                "attention_mask" in encoded_inputs
+                and encoded_inputs["attention_mask"] is not None
+            ):
+                attention_mask = encoded_inputs.pop("attention_mask")
+                if isinstance(attention_mask, torch.Tensor):
+                    # detach()/cpu() keep the conversion valid for tensors that
+                    # require grad or live on an accelerator.
+                    attention_mask = attention_mask.detach().cpu().numpy()
+                elif isinstance(attention_mask, list):
+                    attention_mask = np.array(attention_mask)
+                elif not isinstance(attention_mask, np.ndarray):
+                    raise ValueError(
+                        f"Unexpected type {type(attention_mask)} of attention_mask, "
+                    )
+            else:
+                # Default mask: lower-triangular (seq, seq) with a leading axis of size 1.
+                attention_mask = np.tril(
+                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
+                )
+                attention_mask = np.expand_dims(attention_mask, axis=0)
+            # Perform padding if needed
+            if needs_to_be_padded:
+                difference = max_length - len(required_input)
+                if padding_side == "right":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(0, difference)]
+                    else:
+                        pad_width = [(0, 0), (0, difference), (0, difference)]
+                elif padding_side == "left":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(difference, 0)]
+                    else:
+                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
+                else:
+                    raise ValueError("Invalid padding side: " + str(padding_side))
+                attention_mask = np.pad(
+                    attention_mask,
+                    pad_width=pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+        # Forward the side override only when one was supplied, so parent
+        # implementations without a `padding_side` parameter keep working.
+        parent_kwargs = {}
+        if side_override is not None:
+            parent_kwargs["padding_side"] = side_override
+        # The parent pads every other field; the mask is handled above.
+        encoded_inputs = super()._pad(
+            encoded_inputs,
+            max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=False,
+            **parent_kwargs,
+        )
+        # Add attention mask back if needed
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = attention_mask.tolist()
+        return encoded_inputs
+def round_by_factor(number: int, factor: int) -> int:
+    """Round `number` to the nearest multiple of `factor` (ties follow Python's built-in `round`)."""
+    multiples = round(number / factor)
+    return factor * multiples
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Round `number` up to the next multiple of `factor` (unchanged if already a multiple)."""
+    multiples = math.ceil(number / factor)
+    return factor * multiples
+def floor_by_factor(number: int, factor: int) -> int:
+    """Round `number` down to the previous multiple of `factor` (unchanged if already a multiple)."""
+    multiples = math.floor(number / factor)
+    return factor * multiples
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 4 * 28 * 28,
+    max_pixels: int = 16384 * 28 * 28,
+):
+    """
+    Compute a target `(height, width)` for resizing an image.
+    The result satisfies:
+    1. Both dimensions are multiples of `factor`.
+    2. The pixel count lies within [`min_pixels`, `max_pixels`].
+    3. The aspect ratio stays as close to the input as those constraints allow;
+       inputs more elongated than 200:1 are first clamped to that ratio.
+    Raises:
+        ValueError: If the computed size still falls outside the pixel budget.
+    """
+    MAX_RATIO = 200
+    long_side, short_side = max(height, width), min(height, width)
+    if long_side / short_side > MAX_RATIO:
+        # Snap the short side to the grid, then derive the long side from the ratio cap.
+        if height > width:
+            new_width = max(factor, round_by_factor(width, factor))
+            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
+        else:
+            new_height = max(factor, round_by_factor(height, factor))
+            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
+        logger.info(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
+              resize to {max(new_height, new_width) / min(new_height, new_width)}"
+        )
+        height, width = new_height, new_width
+    # First guess: nearest grid-aligned size, never smaller than one factor per side.
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    area = h_bar * w_bar
+    if area > max_pixels:
+        # Too large: scale both sides down by the same ratio, rounding down.
+        shrink = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / shrink, factor)
+        w_bar = floor_by_factor(width / shrink, factor)
+    elif area < min_pixels:
+        # Too small: scale both sides up by the same ratio, rounding up.
+        grow = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * grow, factor)
+        w_bar = ceil_by_factor(width * grow, factor)
+    if not (min_pixels <= h_bar * w_bar <= max_pixels):
+        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
+    return h_bar, w_bar
+def is_scaled_image(image: np.ndarray) -> bool:
+    """
+    Reports whether the pixel values of `image` already lie within [0, 1].
+
+    A `uint8` array is always treated as unscaled; any other dtype is judged by its
+    actual minimum and maximum values.
+    """
+    if image.dtype != np.uint8:
+        # A floating-point image may still hold values in [0, 255], so inspect the data itself.
+        return np.min(image) >= 0 and np.max(image) <= 1
+    return False
+def make_batched_images(images) -> List[ImageInput]:
+    """
+    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.
+
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image(s): a single image, a list of images, or a list of image lists.
+
+    Returns:
+        list: A flat list of images. A nested list is flattened in order, a flat list is
+        returned as is, and a single image is wrapped in a one-element list.
+
+    Raises:
+        ValueError: If `images` is empty or matches none of the supported layouts.
+    """
+    # Only index into non-empty sequences so that `[]` / `[[]]` end in the ValueError
+    # below instead of an IndexError.
+    if isinstance(images, (list, tuple)) and images:
+        first = images[0]
+        if isinstance(first, (list, tuple)) and first and is_valid_image(first[0]):
+            return [img for img_list in images for img in img_list]
+        if is_valid_image(first):
+            return images
+    if is_valid_image(images):
+        return [images]
+    raise ValueError(f"Could not make batched images from {images}")
+# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
+def make_batched_videos(videos) -> List[VideoInput]:
+    """
+    Accepts videos in several layouts and makes a list of videos for preprocessing.
+
+    Supported layouts:
+        - a list of videos, each a list of frames: returned as is;
+        - a flat list of `PIL.Image.Image` frames: treated as one video and wrapped in a list;
+        - a list of 4-dimensional arrays: each array is turned into a list of its frames;
+        - a single 4-dimensional array: turned into one video given as a list of its frames.
+
+    Args:
+        videos: The input video(s) in one of the layouts above.
+
+    Returns:
+        list: A list of videos.
+
+    Raises:
+        ValueError: If `videos` is empty or matches none of the supported layouts.
+    """
+    # Only index into non-empty sequences so that `[]` / `[[]]` end in the ValueError
+    # below instead of an IndexError.
+    if isinstance(videos, (list, tuple)) and videos:
+        first = videos[0]
+        if isinstance(first, (list, tuple)) and first and is_valid_image(first[0]):
+            return videos
+        if is_valid_image(first):
+            if isinstance(first, Image.Image):
+                return [videos]
+            elif len(first.shape) == 4:
+                return [list(video) for video in videos]
+    elif is_valid_image(videos) and len(videos.shape) == 4:
+        return [list(videos)]
     raise ValueError(f"Could not make batched video from {videos}")
+class Ernie4_5_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a adaptive image processor that dynamically resizes images based on the original images.
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
         return BatchFeature(data=data, tensor_type=return_tensors)
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
+# Fetch the font next to this module on first import if it is not there yet.
+if not os.path.exists(FONT_PATH):
+    # Bound the wait so an unreachable host cannot hang the import forever.
+    ttf = requests.get(
+        "https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf",
+        timeout=60,
+    )
+    # Fail here on an HTTP error rather than saving an error page as the font file,
+    # which would then be reused on every later import.
+    ttf.raise_for_status()
+    with open(FONT_PATH, "wb") as font_file:
+        font_file.write(ttf.content)
 def is_gif(data: bytes) -> bool:
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
+class Ernie4_5_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
     def __call__(
         self,
+        text: Union[str, List[str]],
+        images: List[Image.Image] = [],
+        videos: List[List[Image.Image]] = [],
         **kwargs,
+    ) -> BatchFeature:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
             "pic_cnt": 0,
             "video_cnt": 0,
         }
+        if not isinstance(text, list):
+            text = [text]
         texts = text[0]
         new_video_seg = True
         return list(tokenizer_input_names) + list(image_processor_input_names)
+# Names exported by a star-import of this module.
+__all__ = [
+    "Ernie4_5_VLTokenizer",
+    "Ernie4_5_VLImageProcessor",
+    "Ernie4_5_VLProcessor",
+]