OliBomby committed on
Commit
63af269
·
verified ·
1 Parent(s): 17137e3

Add CM3P model

Browse files
audio_feature_extractor/preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "chunk_length": 30,
3
  "dither": 0.0,
4
  "feature_extractor_type": "WhisperFeatureExtractor",
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
4
+ },
5
  "chunk_length": 30,
6
  "dither": 0.0,
7
  "feature_extractor_type": "WhisperFeatureExtractor",
beatmap_parser/preprocessor_config.json CHANGED
@@ -8,6 +8,9 @@
8
  "add_sv": true,
9
  "add_timing": true,
10
  "add_timing_points": true,
 
 
 
11
  "feature_extractor_type": "CM3PBeatmapParser",
12
  "mania_bpm_normalized_scroll_speed": true,
13
  "processor_class": "CM3PProcessor",
 
8
  "add_sv": true,
9
  "add_timing": true,
10
  "add_timing_points": true,
11
+ "auto_map": {
12
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
13
+ },
14
  "feature_extractor_type": "CM3PBeatmapParser",
15
  "mania_bpm_normalized_scroll_speed": true,
16
  "processor_class": "CM3PProcessor",
beatmap_tokenizer/tokenizer_config.json CHANGED
@@ -87,6 +87,9 @@
87
  "[AUDIO_EOS]",
88
  "[AUDIO]"
89
  ],
 
 
 
90
  "bos_token": "[BOS]",
91
  "clean_up_tokenization_spaces": false,
92
  "cls_token": "[CLS]",
 
87
  "[AUDIO_EOS]",
88
  "[AUDIO]"
89
  ],
90
+ "auto_map": {
91
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
92
+ },
93
  "bos_token": "[BOS]",
94
  "clean_up_tokenization_spaces": false,
95
  "cls_token": "[CLS]",
metadata_tokenizer/tokenizer_config.json CHANGED
@@ -162,6 +162,9 @@
162
  "[SCROLL_SPEED_RATIO_UNK]",
163
  "[TAG_UNK]"
164
  ],
 
 
 
165
  "bos_token": "[BOS]",
166
  "clean_up_tokenization_spaces": false,
167
  "cls_token": "[CLS]",
 
162
  "[SCROLL_SPEED_RATIO_UNK]",
163
  "[TAG_UNK]"
164
  ],
165
+ "auto_map": {
166
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
167
+ },
168
  "bos_token": "[BOS]",
169
  "clean_up_tokenization_spaces": false,
170
  "cls_token": "[CLS]",
processing_cm3p.py CHANGED
@@ -7,11 +7,14 @@ from pathlib import Path
7
  from typing import Optional, Union, IO, TypedDict
8
 
9
  import numpy as np
 
10
  from pandas import Series
11
  from slider import Beatmap, HoldNote
12
  from transformers import WhisperFeatureExtractor, AutoProcessor, BatchEncoding
13
- from transformers.tokenization_utils_base import TruncationStrategy
 
14
  from transformers.utils import is_torch_available, PaddingStrategy, PROCESSOR_NAME, logging
 
15
 
16
  from .configuration_cm3p import CM3PConfig
17
  from .parsing_cm3p import CM3PBeatmapParser, load_beatmap, get_song_length
@@ -132,6 +135,7 @@ class CM3PTokenizerKwargs(TypedDict, total=False):
132
  class CM3PBeatmapKwargs(CM3PTokenizerKwargs, total=False):
133
  window_length_sec: float
134
  window_stride_sec: float
 
135
 
136
 
137
  class CM3PAudioKwargs(AudioKwargs, total=False):
@@ -139,6 +143,7 @@ class CM3PAudioKwargs(AudioKwargs, total=False):
139
  hop_length: Optional[int]
140
  window_size: Optional[int]
141
  audio_length_per_tok: Optional[int]
 
142
 
143
 
144
  # noinspection PyTypedDict
@@ -166,6 +171,7 @@ class CM3PProcessorKwargs(CommonKwargs, CM3PBeatmapKwargs, CM3PTokenizerKwargs,
166
  "hop_length": 160,
167
  "window_size": 400,
168
  "audio_length_per_tok": 8,
 
169
  },
170
  "common_kwargs": {
171
  "return_tensors": "pt",
@@ -558,7 +564,7 @@ class CM3PProcessor(ProcessorMixin):
558
  **beatmap_kwargs,
559
  )
560
 
561
- if audio is not None:
562
  data = dict(beatmap_encoding)
563
  data["input_features"] = self._retrieve_input_features(batch_audio, **audio_kwargs)
564
  beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
@@ -572,7 +578,7 @@ class CM3PProcessor(ProcessorMixin):
572
  },
573
  tensor_type=return_tensors,
574
  )
575
- if audio is not None:
576
  data = dict(beatmap_encoding)
577
  data["input_features"] = torch.zeros((0, self.audio_feature_extractor.feature_size, max_source_positions), dtype=torch.float) if return_tensors == "pt" else []
578
  beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
@@ -651,36 +657,91 @@ class CM3PProcessor(ProcessorMixin):
651
  return self.beatmap_tokenizer.decode(*args, **kwargs)
652
 
653
  def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
 
 
 
 
 
 
 
 
654
  os.makedirs(save_directory, exist_ok=True)
655
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  for attribute_name in self.attributes:
657
  attribute = getattr(self, attribute_name)
658
- # Include the processor class in the attribute config so this processor can then be reloaded with the
659
- # `AutoProcessor` API.
 
660
  if hasattr(attribute, "_set_processor_class"):
661
  # noinspection PyProtectedMember
662
  attribute._set_processor_class(self.__class__.__name__)
 
663
  attribute.save_pretrained(os.path.join(save_directory, attribute_name))
664
 
 
 
 
 
 
 
 
 
665
  output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
666
- self.to_json_file(output_processor_file)
667
- # noinspection PyUnresolvedReferences
668
- logger.warning_once(f"processor saved in {output_processor_file}")
669
 
670
- if push_to_hub:
671
- commit_message = kwargs.pop("commit_message", None)
672
- repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
673
- repo_id = self._create_repo(repo_id, **kwargs)
674
- files_timestamps = self._get_files_timestamps(save_directory)
 
675
 
 
 
676
  self._upload_modified_files(
677
  save_directory,
678
  repo_id,
679
  files_timestamps,
680
  commit_message=commit_message,
681
  token=kwargs.get("token"),
 
 
 
682
  )
683
 
 
 
684
  return [output_processor_file]
685
 
686
  @classmethod
@@ -700,6 +761,76 @@ class CM3PProcessor(ProcessorMixin):
700
 
701
  return args
702
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  AutoProcessor.register(CM3PConfig, CM3PProcessor)
704
 
705
  __all__ = ["CM3PProcessor", "get_metadata"]
 
7
  from typing import Optional, Union, IO, TypedDict
8
 
9
  import numpy as np
10
+ from huggingface_hub.errors import HfHubHTTPError
11
  from pandas import Series
12
  from slider import Beatmap, HoldNote
13
  from transformers import WhisperFeatureExtractor, AutoProcessor, BatchEncoding
14
+ from transformers.dynamic_module_utils import custom_object_save
15
+ from transformers.tokenization_utils_base import TruncationStrategy, PreTrainedTokenizerBase
16
  from transformers.utils import is_torch_available, PaddingStrategy, PROCESSOR_NAME, logging
17
+ from huggingface_hub import CommitOperationAdd, create_branch, create_commit
18
 
19
  from .configuration_cm3p import CM3PConfig
20
  from .parsing_cm3p import CM3PBeatmapParser, load_beatmap, get_song_length
 
135
  class CM3PBeatmapKwargs(CM3PTokenizerKwargs, total=False):
136
  window_length_sec: float
137
  window_stride_sec: float
138
+ min_window_length_sec: float
139
 
140
 
141
  class CM3PAudioKwargs(AudioKwargs, total=False):
 
143
  hop_length: Optional[int]
144
  window_size: Optional[int]
145
  audio_length_per_tok: Optional[int]
146
+ device: Optional[str]
147
 
148
 
149
  # noinspection PyTypedDict
 
171
  "hop_length": 160,
172
  "window_size": 400,
173
  "audio_length_per_tok": 8,
174
+ "device": "cpu",
175
  },
176
  "common_kwargs": {
177
  "return_tensors": "pt",
 
564
  **beatmap_kwargs,
565
  )
566
 
567
+ if all(a is not None for a in audio):
568
  data = dict(beatmap_encoding)
569
  data["input_features"] = self._retrieve_input_features(batch_audio, **audio_kwargs)
570
  beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
 
578
  },
579
  tensor_type=return_tensors,
580
  )
581
+ if all(a is not None for a in audio):
582
  data = dict(beatmap_encoding)
583
  data["input_features"] = torch.zeros((0, self.audio_feature_extractor.feature_size, max_source_positions), dtype=torch.float) if return_tensors == "pt" else []
584
  beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
 
657
  return self.beatmap_tokenizer.decode(*args, **kwargs)
658
 
659
  def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
660
+ """
661
+ Save processor and its sub-components, with support for AutoProcessor remote code.
662
+
663
+ This is a lightly adapted version of ProcessorMixin.save_pretrained:
664
+ - child attributes are saved into subfolders (audio_feature_extractor/, beatmap_parser/, ...);
665
+ - when self._auto_class is set (via register_for_auto_class), custom_object_save is used
666
+ so that auto_map and dynamic modules are written correctly.
667
+ """
668
  os.makedirs(save_directory, exist_ok=True)
669
 
670
+ # Handle Hub integration (same as ProcessorMixin / your existing code)
671
+ if push_to_hub:
672
+ commit_message = kwargs.pop("commit_message", None)
673
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
674
+ repo_id = self._create_repo(repo_id, **kwargs)
675
+ files_timestamps = self._get_files_timestamps(save_directory)
676
+ else:
677
+ commit_message = None
678
+ repo_id = None
679
+ files_timestamps = None
680
+
681
+ # If we have a custom processor registered for an Auto class,
682
+ # save its code and dependencies as a dynamic module and
683
+ # populate the auto_map field in processor_config.json.
684
+ if self._auto_class is not None:
685
+ attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
686
+
687
+ # For tokenizers, we pass their init_kwargs; for other objects, we pass the object itself.
688
+ configs = []
689
+ for a in attrs:
690
+ if isinstance(a, PreTrainedTokenizerBase):
691
+ configs.append(a.init_kwargs)
692
+ else:
693
+ configs.append(a)
694
+
695
+ # Include the processor itself so its class is exported.
696
+ configs.append(self)
697
+
698
+ custom_object_save(self, save_directory, config=configs)
699
+
700
+ # Save each sub-component into its own subfolder
701
  for attribute_name in self.attributes:
702
  attribute = getattr(self, attribute_name)
703
+
704
+ # Include the processor class in the attribute config so this
705
+ # processor can then be reloaded with the AutoProcessor API.
706
  if hasattr(attribute, "_set_processor_class"):
707
  # noinspection PyProtectedMember
708
  attribute._set_processor_class(self.__class__.__name__)
709
+
710
  attribute.save_pretrained(os.path.join(save_directory, attribute_name))
711
 
712
+ # Clean up temporary auto_map injected into tokenizers, if any
713
+ if self._auto_class is not None:
714
+ for attribute_name in self.attributes:
715
+ attribute = getattr(self, attribute_name)
716
+ if isinstance(attribute, PreTrainedTokenizerBase) and "auto_map" in attribute.init_kwargs:
717
+ del attribute.init_kwargs["auto_map"]
718
+
719
+ # Write processor_config.json (or equivalent)
720
  output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
721
+ processor_dict = self.to_dict()
 
 
722
 
723
+ # If processor_dict only contains processor_class, we skip writing the file,
724
+ # matching the upstream behavior; otherwise we save it.
725
+ if set(processor_dict.keys()) != {"processor_class"}:
726
+ self.to_json_file(output_processor_file)
727
+ # noinspection PyUnresolvedReferences
728
+ logger.warning_once(f"processor saved in {output_processor_file}")
729
 
730
+ # If requested, upload the modified files to the Hub
731
+ if push_to_hub:
732
  self._upload_modified_files(
733
  save_directory,
734
  repo_id,
735
  files_timestamps,
736
  commit_message=commit_message,
737
  token=kwargs.get("token"),
738
+ create_pr=kwargs.get("create_pr", False),
739
+ revision=kwargs.get("revision"),
740
+ commit_description=kwargs.get("commit_description"),
741
  )
742
 
743
+ if set(processor_dict.keys()) == {"processor_class"}:
744
+ return []
745
  return [output_processor_file]
746
 
747
  @classmethod
 
761
 
762
  return args
763
 
764
+ def _upload_modified_files(
765
+ self,
766
+ working_dir: Union[str, os.PathLike],
767
+ repo_id: str,
768
+ files_timestamps: dict[str, float],
769
+ commit_message: Optional[str] = None,
770
+ token: Optional[Union[bool, str]] = None,
771
+ create_pr: bool = False,
772
+ revision: Optional[str] = None,
773
+ commit_description: Optional[str] = None,
774
+ ):
775
+ """
776
+ Uploads all modified files in `working_dir` to `repo_id`, based on `files_timestamps`.
777
+ """
778
+ working_dir = Path(working_dir)
779
+
780
+ if commit_message is None:
781
+ commit_message = "Upload CM3P processor"
782
+ modified_files = [
783
+ f
784
+ for f in working_dir.iterdir()
785
+ if str(f) not in files_timestamps or f.stat().st_mtime > files_timestamps[str(f)]
786
+ ]
787
+
788
+ # filter for actual files + folders at the root level
789
+ modified_files = [
790
+ f
791
+ for f in modified_files
792
+ if f.is_file() or f.is_dir()
793
+ ]
794
+
795
+ operations = []
796
+ # upload standalone files
797
+ for file in modified_files:
798
+ if file.is_dir():
799
+ # go over individual files of folder
800
+ for f in file.iterdir():
801
+ operations.append(
802
+ CommitOperationAdd(
803
+ path_or_fileobj=f, path_in_repo=f.relative_to(working_dir).as_posix()
804
+ )
805
+ )
806
+ else:
807
+ operations.append(
808
+ CommitOperationAdd(path_or_fileobj=file, path_in_repo=file.relative_to(working_dir).as_posix())
809
+ )
810
+
811
+ if revision is not None and not revision.startswith("refs/pr"):
812
+ try:
813
+ create_branch(repo_id=repo_id, branch=revision, token=token, exist_ok=True)
814
+ except HfHubHTTPError as e:
815
+ if e.response.status_code == 403 and create_pr:
816
+ # If we are creating a PR on a repo we don't have access to, we can't create the branch.
817
+ # so let's assume the branch already exists. If it's not the case, an error will be raised when
818
+ # calling `create_commit` below.
819
+ pass
820
+ else:
821
+ raise
822
+
823
+ logger.info(f"Uploading the following files to {repo_id}: {','.join([f.relative_to(working_dir).as_posix() for f in modified_files])}")
824
+ return create_commit(
825
+ repo_id=repo_id,
826
+ operations=operations,
827
+ commit_message=commit_message,
828
+ commit_description=commit_description,
829
+ token=token,
830
+ create_pr=create_pr,
831
+ revision=revision,
832
+ )
833
+
834
  AutoProcessor.register(CM3PConfig, CM3PProcessor)
835
 
836
  __all__ = ["CM3PProcessor", "get_metadata"]
processor_config.json CHANGED
@@ -1,33 +1,33 @@
1
- {
2
- "default_kwargs": {
3
- "audio_kwargs": {
4
- "audio_length_per_tok": 8,
5
- "hop_length": 160,
6
- "max_source_positions": 1600,
7
- "pad_to_multiple_of": 256000,
8
- "padding": false,
9
- "sampling_rate": 16000,
10
- "truncation": false,
11
- "window_size": 400
12
- },
13
- "beatmap_kwargs": {
14
- "max_length": 2000,
15
- "padding": "longest",
16
- "truncation": "longest_first",
17
- "window_length_sec": 16.0,
18
- "window_stride_sec": 16.0
19
- },
20
- "common_kwargs": {
21
- "return_tensors": "pt"
22
- },
23
- "metadata_kwargs": {
24
- "max_length": 128,
25
- "padding": "longest",
26
- "truncation": "longest_first"
27
- }
28
- },
29
- "processor_class": "CM3PProcessor",
30
- "auto_map": {
31
- "AutoProcessor": "processing_cm3p.CM3PProcessor"
32
- }
33
- }
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
4
+ },
5
+ "default_kwargs": {
6
+ "audio_kwargs": {
7
+ "audio_length_per_tok": 8,
8
+ "hop_length": 160,
9
+ "max_source_positions": 1600,
10
+ "pad_to_multiple_of": 256000,
11
+ "padding": false,
12
+ "sampling_rate": 16000,
13
+ "truncation": false,
14
+ "window_size": 400
15
+ },
16
+ "beatmap_kwargs": {
17
+ "max_length": 2000,
18
+ "padding": "longest",
19
+ "truncation": "longest_first",
20
+ "window_length_sec": 16.0,
21
+ "window_stride_sec": 16.0
22
+ },
23
+ "common_kwargs": {
24
+ "return_tensors": "pt"
25
+ },
26
+ "metadata_kwargs": {
27
+ "max_length": 128,
28
+ "padding": "longest",
29
+ "truncation": "longest_first"
30
+ }
31
+ },
32
+ "processor_class": "CM3PProcessor"
33
+ }