File size: 4,426 Bytes
41302cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dd4c49
41302cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dd4c49
41302cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# treinamento.py — V25 — FINE-TUNE AUTOMÁTICO (NA RAIZ)
import json
import os
import threading
import time
import requests
from loguru import logger
from database import Database
from sentence_transformers import SentenceTransformer
import config

# === CONFIGURAÇÃO ===
MODEL_BASE = "qwen2.5:1.5b-instruct-q4_0"
MODEL_FINE = "akira-luanda-v25"
DATASET_PATH = "/app/dataset.jsonl"
MODelfile_PATH = "/app/Modelfile"
EMBEDDING_MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Lock + dataset
_lock = threading.Lock()
_dataset = []

def gerar_embedding(text: str):
    """Encode *text* with the module-level sentence-transformer and return the vector as a plain Python list."""
    vector = EMBEDDING_MODEL.encode(text, convert_to_numpy=True)
    return vector.tolist()

def salvar_dataset():
    """Persist the in-memory dataset to DATASET_PATH as JSON Lines.

    Takes a snapshot of ``_dataset`` under ``_lock`` first: ``registrar_interacao``
    appends to the list from other threads, and iterating the live list while it
    is mutated could skip or duplicate entries.
    """
    with _lock:
        snapshot = list(_dataset)
    with open(DATASET_PATH, "w", encoding="utf-8") as f:
        for entry in snapshot:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

def criar_modelfile():
    """Build an Ollama Modelfile string from the base model, persona and dataset.

    Training pairs are emitted as ``MESSAGE user`` / ``MESSAGE assistant``
    instructions — the syntax ``ollama create`` actually parses. The previous
    free-form ``USER:``/``ASSISTANT:`` lines are not valid Modelfile
    instructions and were not ingested as conversation history.

    Returns the complete Modelfile text, newline-terminated.
    """
    # Snapshot under the lock; other threads append to _dataset concurrently.
    with _lock:
        data = list(_dataset)

    lines = [
        f"FROM {MODEL_BASE}",
        f'SYSTEM """{config.PERSONA}"""',
        "PARAMETER temperature 0.9",
        "PARAMETER num_ctx 4096",
    ]
    for d in data:
        # Triple quotes let multi-line user/assistant texts survive parsing.
        lines.append(f'MESSAGE user """{d["user"]}"""')
        lines.append(f'MESSAGE assistant """{d["assistant"]}"""')
    return "\n".join(lines) + "\n"

class Treinamento:
    """Background fine-tune manager.

    Records user/assistant interactions (database row, embedding, JSONL dataset
    entry) and rebuilds the fine-tuned Ollama model once ``min_interactions``
    pairs have accumulated, plus on a periodic timer.
    """

    def __init__(self, db: Database, min_interactions: int = 25, interval_hours: int = 4):
        """Store settings, load any saved dataset and start the timer loop.

        :param db: project database used for messages and embeddings.
        :param min_interactions: dataset size that triggers a fine-tune.
        :param interval_hours: period of the background retrain loop.
        """
        self.db = db
        self.min_interactions = min_interactions
        self.interval = interval_hours * 3600
        self.thread = None
        # Guards against overlapping trainings: without it, every message past
        # the threshold spawned yet another _treinar thread.
        self._treinando = threading.Event()
        self.carregar_dataset()
        self.iniciar_loop()

    def carregar_dataset(self):
        """Load previously saved pairs from DATASET_PATH into the module dataset."""
        global _dataset
        if os.path.exists(DATASET_PATH):
            try:
                with open(DATASET_PATH, "r", encoding="utf-8") as f:
                    _dataset = [json.loads(l) for l in f if l.strip()]
                logger.info(f"{len(_dataset)} kandandos carregados do dataset!")
            except Exception as e:
                # Corrupt file: start fresh rather than crash at startup.
                logger.error(f"Erro ao carregar dataset: {e}")
                _dataset = []

    def iniciar_loop(self):
        """Start the periodic fine-tune loop thread (idempotent)."""
        if not self.thread or not self.thread.is_alive():
            self.thread = threading.Thread(target=self._loop, daemon=True)
            self.thread.start()
            logger.info("Loop de fine-tune iniciado!")

    def registrar_interacao(self, usuario, mensagem, resposta, numero, is_reply=False, mensagem_original=""):
        """Record one interaction and kick off training when the threshold is met.

        Saves the message to the database, stores its embedding, appends the
        pair to the in-memory dataset and the JSONL file. ``is_reply`` and
        ``mensagem_original`` are accepted for caller compatibility but unused.
        """
        try:
            # === SAVE TO DATABASE ===
            self.db.salvar_mensagem(usuario, mensagem, resposta, numero)

            # === EMBEDDING ===
            texto = f"{mensagem} {resposta}".lower()
            embedding = gerar_embedding(texto)
            self.db.salvar_embedding(numero, mensagem, resposta, embedding, texto=texto)

            # === DATASET ===
            entry = {"user": mensagem.strip(), "assistant": resposta.strip()}
            # List append AND file append under the same lock, so concurrent
            # calls cannot interleave partial lines in the JSONL file.
            with _lock:
                _dataset.append(entry)
                with open(DATASET_PATH, "a", encoding="utf-8") as f:
                    f.write(json.dumps(entry, ensure_ascii=False) + "\n")
                total = len(_dataset)

            logger.info(f"Kandando salvo: {total} total")

            # === TRAIN ONCE THE THRESHOLD IS REACHED ===
            if total >= self.min_interactions and not self._treinando.is_set():
                threading.Thread(target=self._treinar, daemon=True).start()

        except Exception as e:
            logger.error(f"Erro ao registrar: {e}")

    def _treinar(self):
        """Rebuild the fine-tuned model via Ollama's ``/api/create`` endpoint."""
        if len(_dataset) < self.min_interactions:
            return
        # Skip if another training is already running.
        if self._treinando.is_set():
            return
        self._treinando.set()
        logger.info(f"INICIANDO FINE-TUNE → {MODEL_FINE} com {len(_dataset)} kandandos")

        try:
            salvar_dataset()
            modelfile = criar_modelfile()
            # Kept on disk only for inspection/debugging; removed in finally.
            with open(MODelfile_PATH, "w", encoding="utf-8") as f:
                f.write(modelfile)

            # /api/create takes a JSON body — the previous multipart file
            # upload (with an fd that was never closed) is not part of the API.
            # NOTE(review): recent Ollama versions deprecate the `modelfile`
            # field in favor of `from`/`system`/... — confirm against the
            # server version actually deployed.
            resp = requests.post(
                "http://localhost:11434/api/create",
                json={"name": MODEL_FINE, "modelfile": modelfile},
                timeout=600,
            )

            if resp.status_code == 200:
                config.OLLAMA_MODEL = MODEL_FINE
                logger.success(f"MODELO {MODEL_FINE} CRIADO COM SUCESSO!")
            else:
                logger.error(f"Erro Ollama: {resp.status_code} {resp.text}")
        except Exception as e:
            logger.error(f"Erro no fine-tune: {e}")
        finally:
            # Always clean up, even when the request raised mid-way.
            try:
                os.remove(MODelfile_PATH)
            except OSError:
                pass
            self._treinando.clear()

    def _loop(self):
        """Sleep ``interval`` seconds, then retrain whenever enough data exists."""
        while True:
            time.sleep(self.interval)
            if len(_dataset) >= self.min_interactions:
                self._treinar()