""" Check NAP consistency: compare each row to reference NAP and set match flags. """ import argparse from pathlib import Path import pandas as pd from config import DATA_PATH, REFERENCE_NAP_PATH, OUTPUT_DIR, NAP_COLUMNS, SOURCE_COLUMN from normalize import normalize_nap def load_reference(path: str) -> dict: df = pd.read_csv(path) row = df.iloc[0] return normalize_nap( row.get("name"), row.get("address"), row.get("phone"), ) def main(): parser = argparse.ArgumentParser() parser.add_argument("--input", default=DATA_PATH) parser.add_argument("--reference", default=REFERENCE_NAP_PATH) parser.add_argument("--output", default=None) args = parser.parse_args() args.output = args.output or str(OUTPUT_DIR / "consistency_report.csv") if not Path(args.reference).exists(): raise FileNotFoundError(f"Reference NAP not found: {args.reference}. Create data/reference_nap.csv with name, address, phone.") if not Path(args.input).exists(): raise FileNotFoundError(f"Input not found: {args.input}. Create data/citations.csv with name, address, phone, source.") ref = load_reference(args.reference) df = pd.read_csv(args.input) for c in NAP_COLUMNS: if c not in df.columns: raise ValueError(f"CSV must have columns: {NAP_COLUMNS}") rows = [] for _, row in df.iterrows(): nap = normalize_nap(row.get("name"), row.get("address"), row.get("phone")) name_ok = nap["name"] == ref["name"] address_ok = nap["address"] == ref["address"] phone_ok = nap["phone"] == ref["phone"] rows.append({ SOURCE_COLUMN: row.get(SOURCE_COLUMN, ""), "name_match": name_ok, "address_match": address_ok, "phone_match": phone_ok, "nap_consistent": name_ok and address_ok and phone_ok, }) out = pd.DataFrame(rows) Path(args.output).parent.mkdir(parents=True, exist_ok=True) out.to_csv(args.output, index=False) print(f"Saved to {args.output}") if __name__ == "__main__": main()