"""
Score backlinks: add predicted quality and/or risk columns to a backlinks CSV.
"""
import argparse
import sys
from pathlib import Path

import joblib
import pandas as pd

from config import MODEL_DIR, FEATURE_COLUMNS, TARGET_QUALITY, TARGET_RISK


def main(argv=None):
    """Score a backlinks CSV with whichever trained models are available.

    Reads the input CSV, derives ``anchor_length`` from ``anchor_text`` when
    only the latter is present, builds the feature matrix and appends one
    prediction column per model file found in ``MODEL_DIR``:
    ``pred_<TARGET_QUALITY>`` for ``quality_model.joblib`` and
    ``pred_risk_label`` for ``risk_model.joblib``. The result is written to
    the output CSV, creating its parent directory if needed.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Raises:
        FileNotFoundError: If the input CSV does not exist.
        ValueError: If the input contains none of the expected feature columns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="data/backlinks.csv")
    parser.add_argument("--output", default="scored_links.csv")
    args = parser.parse_args(argv)

    if not Path(args.input).exists():
        raise FileNotFoundError(f"Input file not found: {args.input}. Run from project root or use an absolute path.")
    df = pd.read_csv(args.input)

    # Derive the length feature when only the raw anchor text was exported.
    if "anchor_text" in df.columns and "anchor_length" not in df.columns:
        df["anchor_length"] = df["anchor_text"].fillna("").str.len()

    feature_list_path = MODEL_DIR / "feature_columns.joblib"
    if feature_list_path.exists():
        # list(): a saved ndarray/Index would make the truthiness test below
        # ambiguous, and reindex wants a plain sequence anyway.
        features = list(joblib.load(feature_list_path))
    else:
        features = [c for c in FEATURE_COLUMNS if c in df.columns]

    # Check the columns actually present, not just the list: a saved list
    # whose columns are all absent would otherwise reach predict() empty.
    present = [c for c in features if c in df.columns]
    if not present:
        raise ValueError(f"Input must contain at least one of {features or FEATURE_COLUMNS}")

    missing = [c for c in features if c not in df.columns]
    if missing:
        print(
            f"Warning: input is missing feature columns {missing}; filling them with 0.",
            file=sys.stderr,
        )
    # Keep every expected feature, in list order, instead of dropping absent
    # ones. Absent columns are zero-filled, the same treatment NaNs get.
    # NOTE(review): assumes the saved list is exactly what the models were
    # fitted on -- confirm against the training script.
    X = df.reindex(columns=features, fill_value=0).fillna(0)

    out = df.copy()
    # NOTE(review): the risk column name is hard-coded while the quality one
    # is derived from TARGET_QUALITY; TARGET_RISK is not used here. Left
    # as-is to keep the output schema stable -- confirm this is intended.
    prediction_targets = (
        ("quality_model.joblib", f"pred_{TARGET_QUALITY}"),
        ("risk_model.joblib", "pred_risk_label"),
    )
    scored = False
    for model_file, column in prediction_targets:
        model_path = MODEL_DIR / model_file
        if not model_path.exists():
            continue
        out[column] = joblib.load(model_path).predict(X)
        scored = True
    if not scored:
        # Still write the file (best effort), but say it is unscored.
        print(
            f"Warning: no model files found in {MODEL_DIR}; output has no prediction columns.",
            file=sys.stderr,
        )

    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(args.output, index=False)
    print(f"Saved to {args.output}")
# Run the scorer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()