# eeat-scorer / train.py
# Alireza Aminzadeh
# Upload folder using huggingface_hub
# 499e0b9 verified
"""
Train E-E-A-T signal regressors from content embeddings and optional link features.
"""
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import joblib
from config import (
DATA_PATH,
MODEL_DIR,
RANDOM_STATE,
CONTENT_COLUMN,
TARGET_COLUMNS,
OPTIONAL_FEATURES,
)
def main():
    """Train one XGBoost regressor per available E-E-A-T target and save the artifacts.

    Reads the CSV at ``DATA_PATH``, embeds ``CONTENT_COLUMN`` with the
    ``all-MiniLM-L6-v2`` sentence encoder, appends any ``OPTIONAL_FEATURES``
    columns present in the data, and fits a separate regressor for every
    column of ``TARGET_COLUMNS`` found in the CSV. Each model is evaluated on
    a 20% hold-out split (MAE and R^2).

    Files written to ``MODEL_DIR`` (created if it does not exist):
        * ``eeat_<target>.joblib`` - one fitted regressor per target
        * ``encoder.joblib`` - the sentence encoder
        * ``extra_features.joblib`` - names of the optional feature columns used
        * ``metrics.json`` - validation metrics per target

    If the data file is missing, a hint is printed and the function returns
    without training.

    Raises:
        ValueError: if the content column is missing, none of the target
            columns is present, or a target column has no labelled rows.
    """
    data_path = Path(DATA_PATH)
    if not data_path.exists():
        print(f"Data not found at {DATA_PATH}. Create data/eeat_labels.csv with: {CONTENT_COLUMN}, {TARGET_COLUMNS}")
        return

    df = pd.read_csv(data_path)
    if CONTENT_COLUMN not in df.columns:
        raise ValueError(f"Missing column: {CONTENT_COLUMN}")
    targets = [c for c in TARGET_COLUMNS if c in df.columns]
    if not targets:
        raise ValueError(f"Need at least one of {TARGET_COLUMNS}")

    # Accept MODEL_DIR as either str or Path, and make sure it exists before
    # the (slow) embedding step so a bad output location fails early.
    model_dir = Path(MODEL_DIR)
    model_dir.mkdir(parents=True, exist_ok=True)

    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    texts = df[CONTENT_COLUMN].fillna("").astype(str).tolist()
    content_emb = np.asarray(encoder.encode(texts))

    extra = [c for c in OPTIONAL_FEATURES if c in df.columns]
    if extra:
        # Coerce to float so a non-numeric optional column fails loudly here
        # instead of silently producing an object-dtype feature matrix.
        extra_values = df[extra].fillna(0).to_numpy(dtype=float)
        X = np.hstack([content_emb, extra_values])
    else:
        X = content_emb

    metrics = {}
    for t in targets:
        # Train only on rows that carry a label for this target; NaN labels
        # are rejected by XGBoost.
        labelled = df[t].notna().to_numpy()
        if not labelled.any():
            raise ValueError(f"Target column {t!r} has no labelled rows")
        X_t = X[labelled]
        y = df.loc[labelled, t].to_numpy()

        X_train, X_val, y_train, y_val = train_test_split(
            X_t, y, test_size=0.2, random_state=RANDOM_STATE
        )
        model = xgb.XGBRegressor(random_state=RANDOM_STATE, n_estimators=100)
        model.fit(X_train, y_train)
        pred = model.predict(X_val)
        metrics[t] = {
            "mae": float(mean_absolute_error(y_val, pred)),
            "r2": float(r2_score(y_val, pred)),
        }
        joblib.dump(model, model_dir / f"eeat_{t}.joblib")

    # NOTE(review): the encoder is pickled via joblib to keep the artifact name
    # unchanged; presumably the inference code loads "encoder.joblib" - confirm
    # before switching to the encoder's own save format.
    joblib.dump(encoder, model_dir / "encoder.joblib")
    joblib.dump(extra, model_dir / "extra_features.joblib")
    with open(model_dir / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    print("Metrics:", metrics)
# Run training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()