| """ | |
| Check NAP (name, address, phone) consistency: compare each row to the reference NAP and set per-field match flags. | |
| """ | |
| import argparse | |
| from pathlib import Path | |
| import pandas as pd | |
| from config import DATA_PATH, REFERENCE_NAP_PATH, OUTPUT_DIR, NAP_COLUMNS, SOURCE_COLUMN | |
| from normalize import normalize_nap | |
def load_reference(path: str) -> dict:
    """Load the reference NAP record and return it in normalized form.

    Reads the CSV at ``path`` and uses only its first data row: that row's
    ``name``, ``address`` and ``phone`` values are passed to
    ``normalize_nap`` and the result is returned as-is.

    Args:
        path: Location of the reference CSV.

    Returns:
        The value produced by ``normalize_nap`` for the first row.

    Raises:
        ValueError: If the CSV has a header but no data rows.
    """
    df = pd.read_csv(path)
    # Without this guard ``df.iloc[0]`` fails with a bare IndexError that
    # gives no hint about which file is at fault.
    if df.empty:
        raise ValueError(
            f"Reference NAP file has no data rows: {path}. "
            "Add one row with name, address, phone."
        )
    row = df.iloc[0]
    # ``Series.get`` yields None for a column that is absent from the CSV,
    # so a missing field is forwarded to normalize_nap rather than raising.
    return normalize_nap(
        row.get("name"),
        row.get("address"),
        row.get("phone"),
    )
def main():
    """Command-line entry point: write a per-row NAP consistency report.

    Parses ``--input``, ``--reference`` and ``--output``, normalizes every
    input row with ``normalize_nap``, compares it field by field with the
    normalized reference record, and saves the resulting flags as a CSV.

    Raises:
        FileNotFoundError: If the reference file or the input file is missing.
        ValueError: If the input CSV lacks a column listed in NAP_COLUMNS.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default=DATA_PATH)
    parser.add_argument("--reference", default=REFERENCE_NAP_PATH)
    parser.add_argument("--output", default=None)
    args = parser.parse_args()
    # Fall back to the default report location when --output is not given.
    args.output = args.output or str(OUTPUT_DIR / "consistency_report.csv")

    if not Path(args.reference).exists():
        raise FileNotFoundError(f"Reference NAP not found: {args.reference}. Create data/reference_nap.csv with name, address, phone.")
    if not Path(args.input).exists():
        raise FileNotFoundError(f"Input not found: {args.input}. Create data/citations.csv with name, address, phone, source.")

    ref = load_reference(args.reference)
    df = pd.read_csv(args.input)
    if any(c not in df.columns for c in NAP_COLUMNS):
        raise ValueError(f"CSV must have columns: {NAP_COLUMNS}")

    # Fixing the column list up front keeps the CSV header present even when
    # the input has no data rows (an empty list of dicts yields no columns).
    report_columns = [
        SOURCE_COLUMN,
        "name_match",
        "address_match",
        "phone_match",
        "nap_consistent",
    ]

    rows = []
    for _, row in df.iterrows():
        # NOTE(review): the field names are hard-coded here while validation
        # above uses NAP_COLUMNS -- confirm the two always agree.
        nap = normalize_nap(row.get("name"), row.get("address"), row.get("phone"))
        name_ok = nap["name"] == ref["name"]
        address_ok = nap["address"] == ref["address"]
        phone_ok = nap["phone"] == ref["phone"]
        rows.append({
            SOURCE_COLUMN: row.get(SOURCE_COLUMN, ""),
            "name_match": name_ok,
            "address_match": address_ok,
            "phone_match": phone_ok,
            "nap_consistent": name_ok and address_ok and phone_ok,
        })

    out = pd.DataFrame(rows, columns=report_columns)
    # Create the destination directory on demand so a fresh checkout works.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(args.output, index=False)
    print(f"Saved to {args.output}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()