import os
import requests
import numpy as np
import pickle
from datasets import load_dataset
from tqdm import tqdm
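
# Builds a small multi-domain character-level dataset (Python code, math Q/A,
# TinyStories, classic literature) and writes nanoGPT-style train.bin /
# val.bin plus a meta.pkl vocabulary into this script's directory.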

DATA_CACHE_DIR = os.path.dirname(os.path.abspath(__file__))
SEPARATOR = "\n\n<|END_OF_DOCUMENT|>\n\n"  # boundary marker between documents
TARGET_SIZE_PER_DOMAIN = 4_000_000  # target characters collected per domain


def get_python_data(target_chars):
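    """Stream Python files from The Stack (smol) until ~target_chars characters are collected."""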
    print("🔹 Downloading Python code (The Stack)...")
    dataset = load_dataset("bigcode/the-stack-smol", data_dir="data/python", split="train", streaming=True)

    text_accum = []
    current_len = 0
    # tqdm total is only a rough progress estimate (~100 chars per sample)
    for sample in tqdm(dataset, desc="Collecting Python", total=target_chars // 100):
        code = sample['content']
        # keep files that are neither trivially short nor very large
        if 100 < len(code) < 10000:
            text_accum.append(code)
            current_len += len(code)
            if current_len >= target_chars:
                break

    return SEPARATOR.join(text_accum)


def get_math_data(target_chars):
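    """Stream linear-equation Q/A pairs (algebra__linear_1d) from DeepMind's math_dataset until ~target_chars characters."""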
    print("🔹 Downloading math (DeepMind algebra)...")
    dataset = load_dataset("math_dataset", "algebra__linear_1d", split="train", streaming=True)

    text_accum = []
    current_len = 0
    for sample in tqdm(dataset, desc="Collecting Math"):
        # format each example as a question/answer pair
        qa = f"Q: {sample['question'].strip()}\nA: {sample['answer'].strip()}"
        text_accum.append(qa)
        current_len += len(qa)
        if current_len >= target_chars:
            break

    return SEPARATOR.join(text_accum)


def get_tinystories_data(target_chars):
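    """Stream short stories from TinyStories until ~target_chars characters are collected."""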
    print("🔹 Downloading TinyStories (narrative reasoning)...")
    dataset = load_dataset("roneneldan/TinyStories", split="train", streaming=True)

    text_accum = []
    current_len = 0
    for sample in tqdm(dataset, desc="Collecting Stories"):
        story = sample['text']
        text_accum.append(story)
        current_len += len(story)
        if current_len >= target_chars:
            break

    return SEPARATOR.join(text_accum)


def get_classic_lit_data():
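    """Download the full text of War and Peace; return "" if the download fails."""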
    print("🔹 Downloading literature (War and Peace)...")
    url = 'https://raw.githubusercontent.com/mmcky/nyu-econ-370/master/notebooks/data/book-war-and-peace.txt'
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        print("Failed to download book. Skipping...")
        return ""


def prepare_super_dataset():
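    """Download all four domains, build the character vocabulary, and write
    meta.pkl, train.bin, and val.bin to DATA_CACHE_DIR.
    """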
| print(f"--- 🧠 PREPARING MULTI-DOMAIN DATASET ({TARGET_SIZE_PER_DOMAIN/1e6 * 4} MB target) ---") |
| |
| |
| |
| parts = [] |
| |
| try: parts.append(get_python_data(TARGET_SIZE_PER_DOMAIN)) |
| except Exception as e: print(f"Erro no Python: {e}") |
| |
| try: parts.append(get_math_data(TARGET_SIZE_PER_DOMAIN)) |
| except Exception as e: print(f"Erro no Math: {e}") |
| |
| try: parts.append(get_tinystories_data(TARGET_SIZE_PER_DOMAIN)) |
| except Exception as e: print(f"Erro no TinyStories: {e}") |
| |
| try: parts.append(get_classic_lit_data()) |
| except Exception as e: print(f"Erro no Livro: {e}") |
|

    print("\nCombining everything...")
    full_text = SEPARATOR.join(parts)

    print("\n📊 FINAL STATISTICS:")
    print(f"Total characters: {len(full_text):,}")
    print(f"Raw text size: {len(full_text) / 1024 / 1024:.2f} MiB")

    # character-level vocabulary: every distinct character becomes a token
    print("Building vocabulary...")
    chars = sorted(set(full_text))
    vocab_size = len(chars)
    print(f"Vocab size: {vocab_size}")
    print(f"Chars (sample): {''.join(chars[30:80])}...")

    # lookup tables (char -> id, id -> char), saved so train/sample scripts can decode
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    meta = {
        'vocab_size': vocab_size,
        'stoi': stoi,
        'itos': itos,
    }
    with open(os.path.join(DATA_CACHE_DIR, 'meta.pkl'), 'wb') as f:
        pickle.dump(meta, f)

    def encode(s):
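        """Map a string to a list of integer token ids."""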
        return [stoi[c] for c in s]

    print("Encoding and saving (this may take a minute)...")
    n = len(full_text)
    split_idx = int(n * 0.9)  # 90/10 train/val split

    train_data = full_text[:split_idx]
    val_data = full_text[split_idx:]

    # uint16 is sufficient: a character-level vocab is far below 65,536 ids
    train_ids = np.array(encode(train_data), dtype=np.uint16)
    val_ids = np.array(encode(val_data), dtype=np.uint16)

    train_ids.tofile(os.path.join(DATA_CACHE_DIR, 'train.bin'))
    val_ids.tofile(os.path.join(DATA_CACHE_DIR, 'val.bin'))

    print(f"✅ DONE! Files saved to {DATA_CACHE_DIR}")
    print("Now run: python train.py")


if __name__ == '__main__':
    prepare_super_dataset()