# nap-consistency-checker / inference.py
# Author: Alireza Aminzadeh
# Commit message: Upload folder using huggingface_hub
# Commit: 7c2379e (verified)
"""
Check NAP consistency: compare each row to reference NAP and set match flags.
"""
import argparse
from pathlib import Path
import pandas as pd
from config import DATA_PATH, REFERENCE_NAP_PATH, OUTPUT_DIR, NAP_COLUMNS, SOURCE_COLUMN
from normalize import normalize_nap
def load_reference(path: str) -> dict:
    """Load the reference NAP (name, address, phone) from a CSV file.

    Only the first data row of the file is used; any further rows are
    ignored. The ``name``, ``address`` and ``phone`` cells of that row are
    passed through ``normalize_nap`` so they can be compared with
    normalized citation rows.

    Args:
        path: Path to the reference CSV.

    Returns:
        Whatever ``normalize_nap`` returns for the first row (the caller
        indexes it by ``"name"``, ``"address"`` and ``"phone"``).

    Raises:
        ValueError: If the file contains no data rows.
    """
    df = pd.read_csv(path)
    # A header-only file would otherwise fail on ``iloc[0]`` with an opaque
    # "single positional indexer is out-of-bounds" IndexError.
    if df.empty:
        raise ValueError(f"Reference NAP file has no data rows: {path}")
    first_row = df.iloc[0]
    # ``Series.get`` yields None for a column that is absent from the file.
    return normalize_nap(
        first_row.get("name"),
        first_row.get("address"),
        first_row.get("phone"),
    )
def main():
    """Compare every citation row with the reference NAP and write a report.

    Command-line options:
        --input      CSV of citations to check (default: ``DATA_PATH``).
        --reference  CSV whose first row is the reference NAP
                     (default: ``REFERENCE_NAP_PATH``).
        --output     Destination CSV (default:
                     ``OUTPUT_DIR / "consistency_report.csv"``).

    The report holds one row per input row: the row's source value plus the
    boolean columns ``name_match``, ``address_match``, ``phone_match`` and
    ``nap_consistent`` (true only when all three fields match).

    Raises:
        FileNotFoundError: If the reference or the input file does not exist.
        ValueError: If the input CSV lacks any column listed in
            ``NAP_COLUMNS``.
    """
    parser = argparse.ArgumentParser(
        description="Check NAP consistency of citations against a reference NAP.",
    )
    parser.add_argument("--input", default=DATA_PATH, help="CSV of citations to check")
    parser.add_argument("--reference", default=REFERENCE_NAP_PATH, help="CSV holding the reference NAP")
    parser.add_argument("--output", default=None, help="where to write the consistency report CSV")
    args = parser.parse_args()
    args.output = args.output or str(OUTPUT_DIR / "consistency_report.csv")

    # Fail early with actionable messages before any parsing is attempted.
    if not Path(args.reference).exists():
        raise FileNotFoundError(f"Reference NAP not found: {args.reference}. Create data/reference_nap.csv with name, address, phone.")
    if not Path(args.input).exists():
        raise FileNotFoundError(f"Input not found: {args.input}. Create data/citations.csv with name, address, phone, source.")

    ref = load_reference(args.reference)
    # NOTE(review): read_csv infers dtypes, so numeric-looking phone values may
    # arrive as int/float rather than str -- confirm normalize_nap copes.
    df = pd.read_csv(args.input)
    if any(column not in df.columns for column in NAP_COLUMNS):
        raise ValueError(f"CSV must have columns: {NAP_COLUMNS}")

    # Pin the report schema so an input with zero rows still yields a CSV
    # with a header line instead of an empty, column-less file.
    report_columns = [
        SOURCE_COLUMN,
        "name_match",
        "address_match",
        "phone_match",
        "nap_consistent",
    ]

    rows = []
    for _, row in df.iterrows():
        nap = normalize_nap(row.get("name"), row.get("address"), row.get("phone"))
        name_ok = nap["name"] == ref["name"]
        address_ok = nap["address"] == ref["address"]
        phone_ok = nap["phone"] == ref["phone"]
        rows.append({
            # The source column is optional; rows without it report "".
            SOURCE_COLUMN: row.get(SOURCE_COLUMN, ""),
            "name_match": name_ok,
            "address_match": address_ok,
            "phone_match": phone_ok,
            "nap_consistent": name_ok and address_ok and phone_ok,
        })

    out = pd.DataFrame(rows, columns=report_columns)
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(args.output, index=False)
    print(f"Saved to {args.output}")
# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()