Text Generation
Transformers
mistral
Mistral_Star
Mistral_Quiet
Mistral
Mixtral
Question-Answer
Token-Classification
Sequence-Classification
SpydazWeb-AI
chemistry
biology
legal
code
climate
medical
text-generation-inference
custom_code
Instructions to use LeroyDyer/QuietStar_Project with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use LeroyDyer/QuietStar_Project with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="LeroyDyer/QuietStar_Project", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("LeroyDyer/QuietStar_Project", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("LeroyDyer/QuietStar_Project", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use LeroyDyer/QuietStar_Project with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "LeroyDyer/QuietStar_Project"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LeroyDyer/QuietStar_Project",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker
docker model run hf.co/LeroyDyer/QuietStar_Project
- SGLang
How to use LeroyDyer/QuietStar_Project with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "LeroyDyer/QuietStar_Project" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LeroyDyer/QuietStar_Project",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "LeroyDyer/QuietStar_Project" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LeroyDyer/QuietStar_Project",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
- Docker Model Runner
How to use LeroyDyer/QuietStar_Project with Docker Model Runner:
docker model run hf.co/LeroyDyer/QuietStar_Project
| ## CREATE MODEL FROM SCRATCH | |
| ## TO BE REMOVED | |
| # pip install reportlab | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig,AutoConfig | |
| import time | |
| import torch | |
| torch.backends.cuda.matmul.allow_tf32 = True | |
| import random | |
| from datasets import load_dataset | |
| from transformers import TrainingArguments | |
| from trl import SFTTrainer | |
| from peft import LoraConfig | |
| # from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model | |
| from torch.nn import CrossEntropyLoss | |
# Turn on autograd anomaly detection for the whole run.
torch.autograd.set_detect_anomaly(True)

# One shared seed feeds both torch's RNG and Python's `random` module.
random_seed = 42
torch.manual_seed(random_seed)
random.seed(random_seed)

# Pick the device for this process: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Module-level defaults that model_init() falls back to when the caller
# supplies no override for the matching key.
n_ahead_talk_global, n_passes_global, n_ahead_global = 4, 2, 8
n_examples = 0
def model_init(params):
    """Load the base checkpoint and attach the thought-token configuration.

    Args:
        params: ``None`` to use the defaults, a plain ``dict`` of overrides,
            or an object exposing the overrides as a ``.params`` mapping
            (presumably a hyper-parameter search trial; confirm against the
            caller). Recognised keys: ``n_ahead``, ``n_ahead_talk``,
            ``n_passes``, ``gumbel_temperature``, ``use_start_thought_token``,
            ``use_end_thought_token``, ``include_policy_loss``,
            ``gumbel_detach``, ``merged_talk_heads``, ``residual_think_head``
            and ``optimize_lm_head_only_at_start``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer has the enabled thought
        markers registered as special tokens, and the model's embeddings are
        resized to match.
    """
    # When True, the look-ahead/pass defaults below collapse to 1.
    original = False

    if params is None:
        params = {}
    else:
        # Accept a trial-like object (``.params``) as well as a plain dict.
        params = getattr(params, "params", params)

    # Resolve every setting from the overrides, falling back to defaults.
    n_ahead = params.get("n_ahead", n_ahead_global if not original else 1)
    n_ahead_talk = params.get("n_ahead_talk", n_ahead_talk_global if not original else 1)
    n_passes = params.get("n_passes", n_passes_global if not original else 1)
    gumbel_temperature = params.get("gumbel_temperature", 1)
    use_start_thought_token = params.get("use_start_thought_token", True)
    use_end_thought_token = params.get("use_end_thought_token", True)
    include_policy_loss = params.get("include_policy_loss", True)
    gumbel_detach = params.get("gumbel_detach", True)
    merged_talk_heads = params.get("merged_talk_heads", True)
    residual_think_head = params.get("residual_think_head", False)
    optimize_lm_head_only_at_start = params.get("optimize_lm_head_only_at_start", False)

    model_id = "LeroyDyer/_Spydaz_Web_AI_V2_Aligned"
    tokenizer_id = model_id

    print("Loading model")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        max_thoughts=n_ahead + n_ahead_talk + 1,
        merged_talk_heads=merged_talk_heads,
        merged_lm_and_talk_heads=False,
        merged_lm_and_think_heads=True,
        use_concat_talk_head=True,
        use_shallow_think=True,
        use_shallow_talk=False,
        use_complex_think_head=False,
        use_complex_talk_head=True,
        use_weighted_talk_head=True,
        trust_remote_code=True,
        device_map="auto",
    )
    print("Loaded model")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Apply the requested thought-token switches BEFORE consulting them, so
    # the special tokens registered below follow ``params`` rather than
    # whatever defaults the remotely loaded model class carries.
    model.use_start_thought_token = use_start_thought_token
    model.use_end_thought_token = use_end_thought_token

    special_tokens_to_add = []
    if model.use_start_thought_token:
        special_tokens_to_add.append("<|startthought|>")
    if model.use_end_thought_token:
        special_tokens_to_add.append("<|endthought|>")
    if special_tokens_to_add:
        tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
        # The vocabulary grew, so the embedding matrix has to grow with it.
        model.resize_token_embeddings(len(tokenizer))
    model.tokenizer = tokenizer

    # Debug aid: show the embedding modules after the resize.
    for name, module in model.named_modules():
        if "embed" in name:
            print(module, flush=True)

    # Copy the remaining settings onto the model instance.
    model.gumbel_detach = gumbel_detach
    model.include_policy_loss = include_policy_loss
    model.n_ahead = n_ahead
    model.n_ahead_talk = n_ahead_talk
    model.n_passes = n_passes
    model.residual_think_head = residual_think_head
    model.optimize_lm_head_only_at_start = optimize_lm_head_only_at_start
    model.gumbel_temperature = gumbel_temperature
    model.original_mode = original
    model.config_params = params
    return model, tokenizer
# Build the model/tokenizer pair with default settings, then write both
# into the local "IModel" folder (tokenizer first, then the model).
model, tokenizer = model_init(None)
model  # notebook-style echo; has no effect when run as a script
for _artifact in (tokenizer, model):
    _artifact.save_pretrained("IModel")
| import os | |
| import huggingface_hub | |
| from huggingface_hub import notebook_login | |
| from huggingface_hub import create_repo, HfApi | |
| from huggingface_hub import hf_hub_download | |
| from huggingface_hub import create_repo, HfApi | |
| from huggingface_hub import snapshot_download | |
# Read the write token from the environment instead of keeping a literal in
# source. Falls back to the original empty string when HF_TOKEN is unset.
WRITE_TOKEN = os.environ.get("HF_TOKEN", "")
username = "LeroyDyer"
huggingface_hub.login(WRITE_TOKEN)
api = HfApi(token=WRITE_TOKEN)

MODEL_NAME = "_Spydaz_Web_AI_MistralStar"
Folderinput = "IModel"
# Built once so create and upload can never target different repos.
repo_id = f"{username}/{MODEL_NAME}"

# Create the (possibly already existing) empty model repo.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)
# Upload everything saved under the local folder.
api.upload_folder(
    repo_id=repo_id,
    folder_path=Folderinput,
)
| import huggingface_hub | |
| from trl import SFTTrainer | |
| from transformers import TrainingArguments | |
| from datasets import load_dataset | |
| from unsloth import FastLanguageModel | |
| import torch | |
import os  # local import so this section stays runnable on its own

# Read the write token from the environment instead of keeping a literal in
# source. Falls back to the original empty string when HF_TOKEN is unset.
WRITE_TOKEN = os.environ.get("HF_TOKEN", "")
username = "LeroyDyer"
huggingface_hub.login(WRITE_TOKEN)

MODEL_ID = "LeroyDyer/_Spydaz_Web_AI_MistralStar"
max_seq_length = 1512  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Load the checkpoint uploaded by the previous stage of this script.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=64,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=644993,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)
# Prompt template with three positional slots: instruction, input, response.
# NOTE(review): the sections are separated by single newlines here; confirm
# whether blank lines between sections were intended and lost upstream.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
    "### Instruction:\n"
    "{}\n"
    "### Input:\n"
    "{}\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker appended to every formatted training sample.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of records into prompt strings for training.

    Args:
        examples: Mapping with parallel "instruction", "input" and "output"
            sequences (one entry per record).

    Returns:
        A dict with a single "text" key holding one string per record: the
        module-level ``alpaca_prompt`` filled with the record's fields,
        followed by ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    texts = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in records
    ]
    return {"text": texts}
| from datasets import load_dataset | |
# Take the first 1000 training rows, shuffle them with a fixed seed, then
# add the formatted "text" column in batched mode.
dataset = (
    load_dataset("gate369/Alpaca-Star", split="train[:1000]")
    .shuffle(seed=9969)
    .map(formatting_prompts_func, batched=True)
)
| from trl import SFTTrainer | |
| from transformers import TrainingArguments | |
| from unsloth import is_bfloat16_supported | |
| from unsloth import UnslothTrainer, UnslothTrainingArguments | |
# Precision: bf16 when the hardware reports support for it, fp16 otherwise.
_bf16_ok = is_bfloat16_supported()

# Optimisation and checkpointing settings for the fine-tuning run.
_training_args = UnslothTrainingArguments(
    per_device_train_batch_size=10,
    gradient_accumulation_steps=8,
    warmup_ratio=0.1,
    num_train_epochs=2,
    learning_rate=2e-4,
    embedding_learning_rate=2e-5,
    output_dir="outputs",
    save_strategy="steps",
    save_steps=50,
    fp16=not _bf16_ok,
    bf16=_bf16_ok,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.00,
    lr_scheduler_type="cosine",
    seed=3607,
)

# Trainer over the "text" column produced by formatting_prompts_func.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=_training_args,
)

# Run training and keep the returned statistics.
trainer_stats = trainer.train()
import os  # local import so this section stays runnable on its own

# Token used for hub pushes: the script's WRITE_TOKEN when it is set,
# otherwise HF_TOKEN from the environment (empty string if neither exists,
# which matches the original literal).
HUB_TOKEN = WRITE_TOKEN or os.environ.get("HF_TOKEN", "")

# Named switches replace the bare `if True:` / `if False:` toggles, so it is
# clear at a glance which exports actually run.
SAVE_MERGED_16BIT = False
PUSH_MERGED_16BIT = True
SAVE_MERGED_4BIT = False
PUSH_MERGED_4BIT = True
SAVE_LORA_ONLY = False
PUSH_LORA_ONLY = False

# Merge to 16bit
if SAVE_MERGED_16BIT:
    model.save_pretrained_merged("LCARS_AI_015", tokenizer, save_method="merged_16bit")
if PUSH_MERGED_16BIT:
    model.push_to_hub_merged(
        "_Spydaz_Web_AI_STAR_Aligned", tokenizer,
        save_method="merged_16bit", token=HUB_TOKEN,
    )

# Merge to 4bit
if SAVE_MERGED_4BIT:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_4bit_forced")
if PUSH_MERGED_4BIT:
    model.push_to_hub_merged(
        "_Spydaz_Web_AI_STAR_Aligned_4_BIT", tokenizer,
        save_method="merged_4bit_forced", token=HUB_TOKEN,
    )

# Just LoRA adapters
if SAVE_LORA_ONLY:
    model.save_pretrained_merged("model", tokenizer, save_method="lora")
if PUSH_LORA_ONLY:
    model.push_to_hub_merged("Test_Lora", tokenizer, save_method="lora", token=HUB_TOKEN)