LLAMA_520M_CONFIG_DICT = dict(
    # Arbitrary small number that won't cause problems when loading.
    # These params are unused due to custom input layers.
    vocab_size=8,
    # Default params needed for loading most pretrained 1B weights.
    max_position_embeddings=131072,
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=30,
    num_attention_heads=16,
    attn_implementation="sdpa",
    head_dim=64,
    tie_word_embeddings=False,
    hidden_act="silu",
    attention_bias=False,
    attention_dropout=0.0,
    initializer_range=0.02,
    mlp_bias=False,
    model_type="llama",
    num_key_value_heads=16,
    pretraining_tp=1,
    rms_norm_eps=1e-05,
    rope_scaling=dict(
        factor=8.0,
        high_freq_factor=4.0,
        low_freq_factor=1.0,
        original_max_position_embeddings=8192,
        rope_type="llama3",
    ),
    rope_theta=500000.0,
    torch_dtype="bfloat16",
    use_cache=True,
)

GPT2_MEDIUM_CONFIG = {
    "activation_function": "gelu_new",
    "architectures": ["GPT2LMHeadModel"],
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_ctx": 8196,
    "n_embd": 1024,
    "hidden_size": 1024,
    "n_head": 16,
    "n_layer": 24,
    "n_positions": 8196,
    "n_special": 0,
    "predict_special_tokens": True,
    "resid_pdrop": 0.1,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "task_specific_params": {
        "text-generation": {
            "do_sample": True,
            "max_length": 50,
        }
    },
    "vocab_size": 50276,
}

LLAMA_CONFIGS = {
    "Llama_520M": LLAMA_520M_CONFIG_DICT,
    "GPT2_medium": GPT2_MEDIUM_CONFIG,
}
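

# Usage sketch (assumption, not part of the original file): the keys above
# follow the Hugging Face `transformers` config schema, so each dict can be
# unpacked into the matching config class to build a randomly initialized
# model. Extra keys are stored as plain attributes by `PretrainedConfig`, so
# the metadata fields (e.g. "architectures") are harmless. The `__main__`
# guard keeps this illustration from running on import.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, GPT2Config, LlamaConfig

    # Build config objects from the dicts above.
    llama_config = LlamaConfig(**LLAMA_520M_CONFIG_DICT)
    gpt2_config = GPT2Config(**GPT2_MEDIUM_CONFIG)

    # Instantiate fresh (untrained) models as a sanity check on the sizes.
    # Note the Llama config's vocab_size=8 is a placeholder; the real project
    # swaps in custom input layers, so this count reflects the backbone only.
    llama_model = AutoModelForCausalLM.from_config(llama_config)
    gpt2_model = AutoModelForCausalLM.from_config(gpt2_config)

    print(f"Llama backbone params: {llama_model.num_parameters() / 1e6:.0f}M")
    print(f"GPT-2 medium params:   {gpt2_model.num_parameters() / 1e6:.0f}M")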