Alireza Aminzadeh committed on
Upload folder using huggingface_hub
Browse files- .env.example +4 -0
- README.md +67 -0
- config.py +21 -0
- data/.gitkeep +0 -0
- data/crawl_export.csv +22 -0
- inference.py +38 -0
- models/.gitkeep +0 -0
- requirements.txt +8 -0
- train.py +55 -0
.env.example
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector
|
| 2 |
+
DATA_PATH=data/crawl_export.csv
|
| 3 |
+
MODEL_DIR=models
|
| 4 |
+
RANDOM_STATE=42
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector: Technical SEO Issue Detection
|
| 2 |
+
|
| 3 |
+
**Type:** Commercial | **Domain:** SEO, Technical SEO
|
| 4 |
+
**Hugging Face:** [syeedalireza/technical-seo-detector](https://huggingface.co/syeedalireza/technical-seo-detector)
|
| 5 |
+
|
| 6 |
+
Classify or detect technical SEO issues: redirect chains, thin content, duplicate signals, etc., from URL and page data.
|
| 7 |
+
|
| 8 |
+
## Author
|
| 9 |
+
|
| 10 |
+
**Alireza Aminzadeh**
|
| 11 |
+
- Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
|
| 12 |
+
- LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
|
| 13 |
+
- Email: alireza.aminzadeh@hotmail.com
|
| 14 |
+
|
| 15 |
+
## Problem
|
| 16 |
+
|
| 17 |
+
Automated detection of technical issues (redirect chains, thin content, canonical/duplicate problems) speeds up audits.
|
| 18 |
+
|
| 19 |
+
## Approach
|
| 20 |
+
|
| 21 |
+
- **Input:** URL, redirect_chain_length, content_length, status_code, canonical_match, etc.
|
| 22 |
+
- **Output:** Issue type (redirect_chain, thin_content, duplicate, ok) or multi-label.
|
| 23 |
+
- **Models:** XGBoost/LightGBM on tabular features; optional text classifier for “thin” from content snippet.
|
| 24 |
+
|
| 25 |
+
## Tech Stack
|
| 26 |
+
|
| 27 |
+
| Category | Tools |
|
| 28 |
+
|----------|------|
|
| 29 |
+
| ML | scikit-learn, XGBoost |
|
| 30 |
+
| Data | pandas, NumPy |
|
| 31 |
+
| Evaluation | sklearn metrics |
|
| 32 |
+
|
| 33 |
+
## Setup
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Usage
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
python train.py
|
| 43 |
+
python inference.py --input data/crawl_export.csv --output issues.csv
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Project structure
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
07_technical-seo-detector/
|
| 50 |
+
├── config.py
|
| 51 |
+
├── train.py # XGBoost classifier for issue_type
|
| 52 |
+
├── inference.py # Predict issue type for crawl export
|
| 53 |
+
├── requirements.txt
|
| 54 |
+
├── .env.example
|
| 55 |
+
├── data/
|
| 56 |
+
│ └── crawl_export.csv # Sample: features + issue_type
|
| 57 |
+
└── models/
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Data
|
| 61 |
+
|
| 62 |
+
- **Sample data (included):** `data/crawl_export.csv` — columns: `redirect_chain_length`, `content_length`, `status_code`, `word_count`, `internal_links`, `issue_type` (e.g. `ok`, `redirect_chain`, `thin_content`).
|
| 63 |
+
- Set `DATA_PATH` in `.env` if using another file.
|
| 64 |
+
|
| 65 |
+
## License
|
| 66 |
+
|
| 67 |
+
MIT.
|
config.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Configuration for Technical-SEO-Detector.

Shared paths, random seed, and feature/label schema used by train.py and
inference.py. All values can be overridden via environment variables
(see .env.example).
"""
import os
from pathlib import Path

# Project root: the directory containing this file.
BASE_DIR = Path(__file__).resolve().parent

# Crawl export CSV with feature columns plus the issue_type target.
DATA_PATH = os.getenv("DATA_PATH", str(BASE_DIR / "data" / "crawl_export.csv"))

# Directory where trained artifacts (model, labels, metrics) are stored.
MODEL_DIR = Path(os.getenv("MODEL_DIR", str(BASE_DIR / "models")))

# Seed for the train/validation split and the XGBoost model.
RANDOM_STATE = int(os.getenv("RANDOM_STATE", "42"))

# Tabular feature columns expected in the crawl export.
FEATURE_COLUMNS = [
    "redirect_chain_length",
    "content_length",
    "status_code",
    "word_count",
    "internal_links",
]
# Name of the target column in the crawl export.
TARGET_COLUMN = "issue_type"
# Known issue classes; actual training labels come from the data itself.
ISSUE_LABELS = ["ok", "redirect_chain", "thin_content", "duplicate", "other"]

# Import-time side effect: make sure the artifact directory exists so
# train.py/inference.py can write to it without extra checks.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
data/.gitkeep
ADDED
|
File without changes
|
data/crawl_export.csv
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
redirect_chain_length,content_length,status_code,word_count,internal_links,issue_type
|
| 2 |
+
0,3200,200,580,45,ok
|
| 3 |
+
2,800,301,120,12,redirect_chain
|
| 4 |
+
0,450,200,85,8,thin_content
|
| 5 |
+
0,2800,200,520,38,ok
|
| 6 |
+
1,1200,302,200,15,redirect_chain
|
| 7 |
+
0,350,200,60,5,thin_content
|
| 8 |
+
0,4100,200,780,62,ok
|
| 9 |
+
0,2900,200,540,42,ok
|
| 10 |
+
3,600,301,95,10,redirect_chain
|
| 11 |
+
0,400,200,70,6,thin_content
|
| 12 |
+
0,3800,200,720,55,ok
|
| 13 |
+
0,2600,200,490,35,ok
|
| 14 |
+
0,500,200,90,7,thin_content
|
| 15 |
+
1,1500,302,250,18,redirect_chain
|
| 16 |
+
0,3400,200,650,48,ok
|
| 17 |
+
0,380,200,65,4,thin_content
|
| 18 |
+
0,3000,200,560,40,ok
|
| 19 |
+
2,900,301,140,11,redirect_chain
|
| 20 |
+
0,4200,200,800,58,ok
|
| 21 |
+
0,2700,200,510,36,ok
|
| 22 |
+
0,550,200,100,9,thin_content
|
inference.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Predict technical SEO issue type for crawl export rows.
|
| 3 |
+
"""
|
| 4 |
+
import argparse
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import joblib
|
| 9 |
+
|
| 10 |
+
from config import MODEL_DIR, FEATURE_COLUMNS
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main():
    """Label each row of a crawl export with a predicted technical-SEO issue type.

    Reads the input CSV, loads the trained classifier from MODEL_DIR, adds a
    ``pred_issue_type`` column, and writes the result to the output CSV.

    Raises:
        FileNotFoundError: if the input CSV or the trained model is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="data/crawl_export.csv",
                        help="Crawl export CSV to score.")
    parser.add_argument("--output", default="issues.csv",
                        help="Destination CSV with the pred_issue_type column added.")
    args = parser.parse_args()

    if not Path(args.input).exists():
        raise FileNotFoundError(
            f"Input file not found: {args.input}. Run from project root or use an absolute path."
        )
    model_path = MODEL_DIR / "issue_classifier.joblib"
    if not model_path.exists():
        raise FileNotFoundError(f"Run train.py first. No model in {MODEL_DIR}")

    model = joblib.load(model_path)
    labels = joblib.load(MODEL_DIR / "issue_labels.joblib")

    # Read the input exactly once (the original re-read it inside the feature
    # fallback below, doubling the I/O for large exports).
    df = pd.read_csv(args.input)

    # Prefer the exact feature list the model was trained with; fall back to
    # the configured columns that are present in this export.
    features_path = MODEL_DIR / "feature_columns.joblib"
    if features_path.exists():
        features = joblib.load(features_path)
    else:
        features = [c for c in FEATURE_COLUMNS if c in df.columns]

    # fillna(0) mirrors train.py's preprocessing so the model sees the same encoding.
    X = df[[c for c in features if c in df.columns]].fillna(0)
    # model.predict returns integer class codes; map them back to label names.
    df["pred_issue_type"] = [labels[i] for i in model.predict(X)]

    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Saved to {args.output}")


if __name__ == "__main__":
    main()
|
models/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Technical-SEO-Detector
|
| 2 |
+
# Python 3.9+
|
| 3 |
+
|
| 4 |
+
pandas>=1.3.0
|
| 5 |
+
numpy>=1.21.0
|
| 6 |
+
scikit-learn>=1.0.0
|
| 7 |
+
xgboost>=1.5.0
|
| 8 |
+
joblib>=1.1.0
|
train.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train technical SEO issue classifier.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.metrics import classification_report, f1_score
|
| 10 |
+
import xgboost as xgb
|
| 11 |
+
import joblib
|
| 12 |
+
|
| 13 |
+
from config import (
|
| 14 |
+
DATA_PATH,
|
| 15 |
+
MODEL_DIR,
|
| 16 |
+
RANDOM_STATE,
|
| 17 |
+
FEATURE_COLUMNS,
|
| 18 |
+
TARGET_COLUMN,
|
| 19 |
+
ISSUE_LABELS,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def main():
    """Train the technical-SEO issue classifier and persist all artifacts.

    Loads DATA_PATH, fits an XGBoost classifier on FEATURE_COLUMNS vs.
    TARGET_COLUMN, and writes the model, label list, feature list, and
    metrics.json into MODEL_DIR.

    Raises:
        ValueError: if no feature columns or the target column are present.
    """
    if not Path(DATA_PATH).exists():
        # Deliberately a soft exit rather than a crash: this is the expected
        # first-run state before the user supplies data.
        print(f"Data not found at {DATA_PATH}. Create data/crawl_export.csv with features and {TARGET_COLUMN}.")
        return

    df = pd.read_csv(DATA_PATH)
    features = [c for c in FEATURE_COLUMNS if c in df.columns]
    if not features or TARGET_COLUMN not in df.columns:
        raise ValueError(f"Need features {FEATURE_COLUMNS} and {TARGET_COLUMN}")

    X = df[features].fillna(0)
    # Encode the target ONCE and derive both codes and label names from the
    # same Categorical, so the saved labels always match the model's codes
    # (the original converted twice, which invites divergence).
    target = df[TARGET_COLUMN].astype("category")
    y = target.cat.codes
    labels = list(target.cat.categories)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    model = xgb.XGBClassifier(random_state=RANDOM_STATE, n_estimators=100)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    # Pass explicit label codes so the report stays aligned with `labels`
    # even when a class is absent from the small validation split (otherwise
    # target_names would mismatch and classification_report raises).
    report = classification_report(
        y_val,
        pred,
        labels=list(range(len(labels))),
        target_names=labels,
        output_dict=True,
        zero_division=0,
    )
    metrics = {"macro_f1": float(f1_score(y_val, pred, average="macro"))}

    joblib.dump(model, MODEL_DIR / "issue_classifier.joblib")
    joblib.dump(labels, MODEL_DIR / "issue_labels.joblib")
    joblib.dump(features, MODEL_DIR / "feature_columns.joblib")
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump({"metrics": metrics, "report": report}, f, indent=2)
    print("Metrics:", metrics)


if __name__ == "__main__":
    main()
|