Alireza Aminzadeh committed on
Upload folder using huggingface_hub
Browse files- .env.example +4 -0
- README.md +67 -0
- config.py +21 -0
- data/.gitkeep +0 -0
- data/crawl_export.csv +22 -0
- inference.py +38 -0
- models/.gitkeep +0 -0
- requirements.txt +8 -0
- train.py +55 -0
.env.example
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector
|
| 2 |
+
DATA_PATH=data/crawl_export.csv
|
| 3 |
+
MODEL_DIR=models
|
| 4 |
+
RANDOM_STATE=42
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector: Technical SEO Issue Detection
|
| 2 |
+
|
| 3 |
+
**Type:** Commercial | **Domain:** SEO, Technical SEO
|
| 4 |
+
**Hugging Face:** [syeedalireza/technical-seo-detector](https://huggingface.co/syeedalireza/technical-seo-detector)
|
| 5 |
+
|
| 6 |
+
Classify or detect technical SEO issues: redirect chains, thin content, duplicate signals, etc., from URL and page data.
|
| 7 |
+
|
| 8 |
+
## Author
|
| 9 |
+
|
| 10 |
+
**Alireza Aminzadeh**
|
| 11 |
+
- Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
|
| 12 |
+
- LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
|
| 13 |
+
- Email: alireza.aminzadeh@hotmail.com
|
| 14 |
+
|
| 15 |
+
## Problem
|
| 16 |
+
|
| 17 |
+
Automated detection of technical issues (redirect chains, thin content, canonical/duplicate problems) speeds up audits.
|
| 18 |
+
|
| 19 |
+
## Approach
|
| 20 |
+
|
| 21 |
+
- **Input:** URL, redirect_chain_length, content_length, status_code, canonical_match, etc.
|
| 22 |
+
- **Output:** Issue type (redirect_chain, thin_content, duplicate, ok) or multi-label.
|
| 23 |
+
- **Models:** XGBoost/LightGBM on tabular features; optional text classifier for “thin” from content snippet.
|
| 24 |
+
|
| 25 |
+
## Tech Stack
|
| 26 |
+
|
| 27 |
+
| Category | Tools |
|
| 28 |
+
|----------|------|
|
| 29 |
+
| ML | scikit-learn, XGBoost |
|
| 30 |
+
| Data | pandas, NumPy |
|
| 31 |
+
| Evaluation | sklearn metrics |
|
| 32 |
+
|
| 33 |
+
## Setup
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Usage
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
python train.py
|
| 43 |
+
python inference.py --input data/crawl_export.csv --output issues.csv
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Project structure
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
07_technical-seo-detector/
|
| 50 |
+
├── config.py
|
| 51 |
+
├── train.py # XGBoost classifier for issue_type
|
| 52 |
+
├── inference.py # Predict issue type for crawl export
|
| 53 |
+
├── requirements.txt
|
| 54 |
+
├── .env.example
|
| 55 |
+
├── data/
|
| 56 |
+
│ └── crawl_export.csv # Sample: features + issue_type
|
| 57 |
+
└── models/
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Data
|
| 61 |
+
|
| 62 |
+
- **Sample data (included):** `data/crawl_export.csv` — columns: `redirect_chain_length`, `content_length`, `status_code`, `word_count`, `internal_links`, `issue_type` (e.g. `ok`, `redirect_chain`, `thin_content`).
|
| 63 |
+
- Set `DATA_PATH` in `.env` if using another file.
|
| 64 |
+
|
| 65 |
+
## License
|
| 66 |
+
|
| 67 |
+
MIT.
|
config.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Configuration for Technical-SEO-Detector.

Shared paths, random seed, and feature/label schema used by train.py and
inference.py. All values can be overridden via environment variables
(see .env.example).
"""
import os
from pathlib import Path

# Project root: the directory containing this file.
BASE_DIR = Path(__file__).resolve().parent

# Crawl export CSV with feature columns plus the issue_type target.
DATA_PATH = os.getenv("DATA_PATH", str(BASE_DIR / "data" / "crawl_export.csv"))

# Directory where trained artifacts (model, labels, metrics) are stored.
MODEL_DIR = Path(os.getenv("MODEL_DIR", str(BASE_DIR / "models")))

# Seed for the train/validation split and the XGBoost model.
RANDOM_STATE = int(os.getenv("RANDOM_STATE", "42"))

# Tabular feature columns expected in the crawl export.
FEATURE_COLUMNS = [
    "redirect_chain_length",
    "content_length",
    "status_code",
    "word_count",
    "internal_links",
]
# Name of the target column in the crawl export.
TARGET_COLUMN = "issue_type"
# Known issue classes; actual training labels come from the data itself.
ISSUE_LABELS = ["ok", "redirect_chain", "thin_content", "duplicate", "other"]

# Import-time side effect: make sure the artifact directory exists so
# train.py/inference.py can write to it without extra checks.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
data/.gitkeep
ADDED
|
File without changes
|
data/crawl_export.csv
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
redirect_chain_length,content_length,status_code,word_count,internal_links,issue_type
|
| 2 |
+
0,3200,200,580,45,ok
|
| 3 |
+
2,800,301,120,12,redirect_chain
|
| 4 |
+
0,450,200,85,8,thin_content
|
| 5 |
+
0,2800,200,520,38,ok
|
| 6 |
+
1,1200,302,200,15,redirect_chain
|
| 7 |
+
0,350,200,60,5,thin_content
|
| 8 |
+
0,4100,200,780,62,ok
|
| 9 |
+
0,2900,200,540,42,ok
|
| 10 |
+
3,600,301,95,10,redirect_chain
|
| 11 |
+
0,400,200,70,6,thin_content
|
| 12 |
+
0,3800,200,720,55,ok
|
| 13 |
+
0,2600,200,490,35,ok
|
| 14 |
+
0,500,200,90,7,thin_content
|
| 15 |
+
1,1500,302,250,18,redirect_chain
|
| 16 |
+
0,3400,200,650,48,ok
|
| 17 |
+
0,380,200,65,4,thin_content
|
| 18 |
+
0,3000,200,560,40,ok
|
| 19 |
+
2,900,301,140,11,redirect_chain
|
| 20 |
+
0,4200,200,800,58,ok
|
| 21 |
+
0,2700,200,510,36,ok
|
| 22 |
+
0,550,200,100,9,thin_content
|
inference.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Predict technical SEO issue type for crawl export rows.
|
| 3 |
+
"""
|
| 4 |
+
import argparse
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import joblib
|
| 9 |
+
|
| 10 |
+
from config import MODEL_DIR, FEATURE_COLUMNS
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main():
    """Label each row of a crawl export with a predicted technical-SEO issue type.

    Reads the input CSV, loads the trained classifier from MODEL_DIR, adds a
    ``pred_issue_type`` column, and writes the result to the output CSV.

    Raises:
        FileNotFoundError: if the input CSV or the trained model is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="data/crawl_export.csv",
                        help="Crawl export CSV to score.")
    parser.add_argument("--output", default="issues.csv",
                        help="Destination CSV with the pred_issue_type column added.")
    args = parser.parse_args()

    if not Path(args.input).exists():
        raise FileNotFoundError(
            f"Input file not found: {args.input}. Run from project root or use an absolute path."
        )
    model_path = MODEL_DIR / "issue_classifier.joblib"
    if not model_path.exists():
        raise FileNotFoundError(f"Run train.py first. No model in {MODEL_DIR}")

    model = joblib.load(model_path)
    labels = joblib.load(MODEL_DIR / "issue_labels.joblib")

    # Read the input exactly once (the original re-read it inside the feature
    # fallback below, doubling the I/O for large exports).
    df = pd.read_csv(args.input)

    # Prefer the exact feature list the model was trained with; fall back to
    # the configured columns that are present in this export.
    features_path = MODEL_DIR / "feature_columns.joblib"
    if features_path.exists():
        features = joblib.load(features_path)
    else:
        features = [c for c in FEATURE_COLUMNS if c in df.columns]

    # fillna(0) mirrors train.py's preprocessing so the model sees the same encoding.
    X = df[[c for c in features if c in df.columns]].fillna(0)
    # model.predict returns integer class codes; map them back to label names.
    df["pred_issue_type"] = [labels[i] for i in model.predict(X)]

    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Saved to {args.output}")


if __name__ == "__main__":
    main()
|
models/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector
|
| 2 |
+
# Python 3.9+
|
| 3 |
+
|
| 4 |
+
pandas>=1.3.0
|
| 5 |
+
numpy>=1.21.0
|
| 6 |
+
scikit-learn>=1.0.0
|
| 7 |
+
xgboost>=1.5.0
|
| 8 |
+
joblib>=1.1.0
|
train.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train technical SEO issue classifier.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.metrics import classification_report, f1_score
|
| 10 |
+
import xgboost as xgb
|
| 11 |
+
import joblib
|
| 12 |
+
|
| 13 |
+
from config import (
|
| 14 |
+
DATA_PATH,
|
| 15 |
+
MODEL_DIR,
|
| 16 |
+
RANDOM_STATE,
|
| 17 |
+
FEATURE_COLUMNS,
|
| 18 |
+
TARGET_COLUMN,
|
| 19 |
+
ISSUE_LABELS,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def main():
    """Train the technical-SEO issue classifier and persist all artifacts.

    Loads DATA_PATH, fits an XGBoost classifier on FEATURE_COLUMNS vs.
    TARGET_COLUMN, and writes the model, label list, feature list, and
    metrics.json into MODEL_DIR.

    Raises:
        ValueError: if no feature columns or the target column are present.
    """
    if not Path(DATA_PATH).exists():
        # Deliberately a soft exit rather than a crash: this is the expected
        # first-run state before the user supplies data.
        print(f"Data not found at {DATA_PATH}. Create data/crawl_export.csv with features and {TARGET_COLUMN}.")
        return

    df = pd.read_csv(DATA_PATH)
    features = [c for c in FEATURE_COLUMNS if c in df.columns]
    if not features or TARGET_COLUMN not in df.columns:
        raise ValueError(f"Need features {FEATURE_COLUMNS} and {TARGET_COLUMN}")

    X = df[features].fillna(0)
    # Encode the target ONCE and derive both codes and label names from the
    # same Categorical, so the saved labels always match the model's codes
    # (the original converted twice, which invites divergence).
    target = df[TARGET_COLUMN].astype("category")
    y = target.cat.codes
    labels = list(target.cat.categories)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    model = xgb.XGBClassifier(random_state=RANDOM_STATE, n_estimators=100)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    # Pass explicit label codes so the report stays aligned with `labels`
    # even when a class is absent from the small validation split (otherwise
    # target_names would mismatch and classification_report raises).
    report = classification_report(
        y_val,
        pred,
        labels=list(range(len(labels))),
        target_names=labels,
        output_dict=True,
        zero_division=0,
    )
    metrics = {"macro_f1": float(f1_score(y_val, pred, average="macro"))}

    joblib.dump(model, MODEL_DIR / "issue_classifier.joblib")
    joblib.dump(labels, MODEL_DIR / "issue_labels.joblib")
    joblib.dump(features, MODEL_DIR / "feature_columns.joblib")
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump({"metrics": metrics, "report": report}, f, indent=2)
    print("Metrics:", metrics)


if __name__ == "__main__":
    main()
|