LLAMA_520M_CONFIG_DICT = dict(
    vocab_size=8,
    max_position_embeddings=131072,
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=30,
    num_attention_heads=16,
    attn_implementation="sdpa",
    head_dim=64,
    tie_word_embeddings=False,
    hidden_act="silu",
    attention_bias=False,
    attention_dropout=0.0,
    initializer_range=0.02,
    mlp_bias=False,
    model_type="llama",
    num_key_value_heads=16,
    pretraining_tp=1,
    rms_norm_eps=1e-05,
    rope_scaling=dict(
        factor=8.0,
        high_freq_factor=4.0,
        low_freq_factor=1.0,
        original_max_position_embeddings=8192,
        rope_type="llama3",
    ),
    rope_theta=500000.0,
    torch_dtype="bfloat16",
    use_cache=True,
)
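

# A minimal usage sketch, assuming the Hugging Face `transformers` package is
# available; the helper name `build_llama_520m` is illustrative, not defined
# elsewhere in this module. It turns the dict above into a LlamaConfig and a
# randomly initialized LlamaForCausalLM.
def build_llama_520m():
    from transformers import LlamaConfig, LlamaForCausalLM

    # Keys the config class does not consume directly are kept as extra
    # attributes on the resulting config object.
    config = LlamaConfig(**LLAMA_520M_CONFIG_DICT)
    return LlamaForCausalLM(config)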
|
|
GPT2_MEDIUM_CONFIG = {
    "activation_function": "gelu_new",
    "architectures": [
        "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_ctx": 8196,
    "n_embd": 1024,
    "hidden_size": 1024,
    "n_head": 16,
    "n_layer": 24,
    "n_positions": 8196,
    "n_special": 0,
    "predict_special_tokens": True,
    "resid_pdrop": 0.1,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "task_specific_params": {
        "text-generation": {
            "do_sample": True,
            "max_length": 50
        }
    },
    "vocab_size": 50276,
}
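

# A similar sketch for the GPT-2 config above (again assuming `transformers`;
# the helper name is illustrative). The dict's extra keys are stored as plain
# attributes on the config object.
def build_gpt2_medium():
    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config(**GPT2_MEDIUM_CONFIG)
    return GPT2LMHeadModel(config)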
|
|
LLAMA_CONFIGS = {
    "Llama_520M": LLAMA_520M_CONFIG_DICT,
    "GPT2_medium": GPT2_MEDIUM_CONFIG,
}
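

# A hedged sketch of how the registry above might be consumed (the helper name
# `get_config` is illustrative, and it assumes `transformers`): look up a config
# dict by name and dispatch on its "model_type" field to build the matching
# Hugging Face config class, e.g. get_config("Llama_520M").
def get_config(name: str):
    from transformers import AutoConfig

    cfg = dict(LLAMA_CONFIGS[name])      # copy so the registry entry stays untouched
    model_type = cfg.pop("model_type")   # e.g. "llama" or "gpt2"
    return AutoConfig.for_model(model_type, **cfg)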
|
|