{
  "model_type": "distilbert",
  "architectures": [
    "Hecto"
  ],
  "hidden_size": 768,
  "num_labels": 4,
  "id2label": {
    "0": "World",
    "1": "Sports",
    "2": "Business",
    "3": "Sci/Tech"
  },
  "label2id": {
    "World": 0,
    "Sports": 1,
    "Business": 2,
    "Sci/Tech": 3
  },
  "moe_type": "heterogeneous",
  "experts": {
    "expert_0": {
      "type": "ffnn",
      "layers": [
        256,
        128
      ],
      "activation": "tanh"
    },
    "expert_1": {
      "type": "gru",
      "input_dim": 256,
      "hidden_dim": 128,
      "bidirectional": false
    }
  },
  "gating": {
    "type": "top1",
    "temperature": 1.5,
    "mlp_dims": [
      256,
      128,
      2
    ],
    "regularization": {
      "entropy_loss": true,
      "load_balancing": true
    }
  },
  "encoder": {
    "base_model": "distilbert-base-uncased",
    "freeze_encoder": false
  },
  "transformers_version": "4.41.1"
}