""" Assign cluster labels to keywords (CSV or list). """ import argparse from pathlib import Path import pandas as pd import joblib from config import MODEL_DIR, KEYWORD_COLUMN def main(): parser = argparse.ArgumentParser() parser.add_argument("--input", default="data/keywords.csv", help="CSV with keyword column") parser.add_argument("--output", default="data/clustered.csv", help="Output CSV with cluster column") args = parser.parse_args() if not Path(args.input).exists(): raise FileNotFoundError(f"Input file not found: {args.input}. Run from project root or use an absolute path.") if not (MODEL_DIR / "encoder.joblib").exists(): raise FileNotFoundError(f"Run train.py first. No model in {MODEL_DIR}") encoder = joblib.load(MODEL_DIR / "encoder.joblib") clusterer = joblib.load(MODEL_DIR / "clusterer.joblib") df = pd.read_csv(args.input) if KEYWORD_COLUMN not in df.columns: raise ValueError(f"CSV must have column: {KEYWORD_COLUMN}") X = encoder.encode(df[KEYWORD_COLUMN].astype(str).tolist()) df["cluster"] = clusterer.predict(X) Path(args.output).parent.mkdir(parents=True, exist_ok=True) df.to_csv(args.output, index=False) print(f"Saved to {args.output}") if __name__ == "__main__": main()