Upload folder using huggingface_hub
Browse files- nanochat_tokenizer.py +12 -2
nanochat_tokenizer.py
CHANGED
|
@@ -32,16 +32,26 @@ class NanochatTokenizer(PreTrainedTokenizer):
|
|
| 32 |
if os.path.isfile(local_file):
|
| 33 |
vocab_file = local_file
|
| 34 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# Download from HF Hub
|
| 36 |
try:
|
| 37 |
vocab_file = hf_hub_download(
|
| 38 |
-
repo_id=self.name_or_path,
|
| 39 |
filename="tokenizer.pkl",
|
| 40 |
repo_type="model"
|
| 41 |
)
|
| 42 |
except Exception as e:
|
| 43 |
raise ValueError(
|
| 44 |
-
f"Could not find or download tokenizer.pkl for {self.name_or_path}: {e}"
|
| 45 |
)
|
| 46 |
|
| 47 |
if vocab_file is None or not os.path.isfile(vocab_file):
|
|
|
|
| 32 |
if os.path.isfile(local_file):
|
| 33 |
vocab_file = local_file
|
| 34 |
else:
|
| 35 |
+
# Extract repo ID from cache path if needed
|
| 36 |
+
repo_id = self.name_or_path
|
| 37 |
+
if "models--" in str(repo_id):
|
| 38 |
+
# Cache path format: .../models--namespace--repo_name/snapshots/...
|
| 39 |
+
parts = str(repo_id).split("models--")
|
| 40 |
+
if len(parts) > 1:
|
| 41 |
+
# Get the models--namespace--repo_name part
|
| 42 |
+
repo_part = parts[1].split("/")[0]
|
| 43 |
+
repo_id = repo_part.replace("--", "/")
|
| 44 |
+
|
| 45 |
# Download from HF Hub
|
| 46 |
try:
|
| 47 |
vocab_file = hf_hub_download(
|
| 48 |
+
repo_id=repo_id,
|
| 49 |
filename="tokenizer.pkl",
|
| 50 |
repo_type="model"
|
| 51 |
)
|
| 52 |
except Exception as e:
|
| 53 |
raise ValueError(
|
| 54 |
+
f"Could not find or download tokenizer.pkl for {repo_id}: {e}"
|
| 55 |
)
|
| 56 |
|
| 57 |
if vocab_file is None or not os.path.isfile(vocab_file):
|