"""
Assign cluster labels to keywords (CSV or list).
"""
| import argparse | |
| from pathlib import Path | |
| import pandas as pd | |
| import joblib | |
| from config import MODEL_DIR, KEYWORD_COLUMN | |
def main(argv=None):
    """Assign a cluster label to every keyword in a CSV file.

    Parses ``--input`` and ``--output``, loads the ``encoder.joblib`` and
    ``clusterer.joblib`` artifacts from ``MODEL_DIR``, encodes the values of
    the ``KEYWORD_COLUMN`` column, and writes the input rows plus a new
    ``cluster`` column to the output CSV (parent directories are created).

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default), ``argparse`` reads ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If the input CSV or either model artifact is
            missing.
        ValueError: If the CSV has no ``KEYWORD_COLUMN`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="data/keywords.csv", help="CSV with keyword column")
    parser.add_argument("--output", default="data/clustered.csv", help="Output CSV with cluster column")
    args = parser.parse_args(argv)

    # is_file() (not exists()) so a directory path gets the same clear error.
    input_path = Path(args.input)
    if not input_path.is_file():
        raise FileNotFoundError(f"Input file not found: {args.input}. Run from project root or use an absolute path.")

    # Both artifacts are loaded below, so verify both up front; otherwise a
    # missing clusterer surfaces as a raw joblib error instead of this hint.
    encoder_path = MODEL_DIR / "encoder.joblib"
    clusterer_path = MODEL_DIR / "clusterer.joblib"
    missing = [path.name for path in (encoder_path, clusterer_path) if not path.exists()]
    if missing:
        raise FileNotFoundError(
            f"Run train.py first. No model in {MODEL_DIR} (missing: {', '.join(missing)})"
        )

    # Validate the CSV before loading the models so a bad input fails fast.
    df = pd.read_csv(input_path)
    if KEYWORD_COLUMN not in df.columns:
        raise ValueError(f"CSV must have column: {KEYWORD_COLUMN}")

    # NOTE: astype(str) turns missing values into the literal string "nan";
    # kept as-is so existing outputs do not change.
    keywords = df[KEYWORD_COLUMN].astype(str).tolist()

    if keywords:
        # NOTE: joblib.load unpickles its input; only load trusted artifacts.
        encoder = joblib.load(encoder_path)
        clusterer = joblib.load(clusterer_path)
        df["cluster"] = clusterer.predict(encoder.encode(keywords))
    else:
        # Nothing to label: skip the models (an empty batch may be rejected
        # by encode()/predict()) and still emit the header with the column.
        df["cluster"] = []

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Saved to {args.output}")
# Script entry point: run the CLI only when this file is executed directly,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()