Alireza Aminzadeh committed on
Commit
8f4bcf1
·
verified ·
1 Parent(s): 35bcd7b

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. .env.example +4 -0
  2. README.md +67 -0
  3. config.py +21 -0
  4. data/.gitkeep +0 -0
  5. data/crawl_export.csv +22 -0
  6. inference.py +38 -0
  7. models/.gitkeep +0 -0
  8. requirements.txt +8 -0
  9. train.py +55 -0
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Technical-SEO-Detector
2
+ DATA_PATH=data/crawl_export.csv
3
+ MODEL_DIR=models
4
+ RANDOM_STATE=42
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Technical-SEO-Detector: Technical SEO Issue Detection
2
+
3
+ **Type:** Commercial | **Domain:** SEO, Technical SEO
4
+ **Hugging Face:** [syeedalireza/technical-seo-detector](https://huggingface.co/syeedalireza/technical-seo-detector)
5
+
6
+ Classify or detect technical SEO issues: redirect chains, thin content, duplicate signals, etc., from URL and page data.
7
+
8
+ ## Author
9
+
10
+ **Alireza Aminzadeh**
11
+ - Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
12
+ - LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
13
+ - Email: alireza.aminzadeh@hotmail.com
14
+
15
+ ## Problem
16
+
17
+ Automated detection of technical issues (redirect chains, thin content, canonical/duplicate problems) speeds up audits.
18
+
19
+ ## Approach
20
+
21
+ - **Input:** URL, redirect_chain_length, content_length, status_code, canonical_match, etc.
22
+ - **Output:** Issue type (redirect_chain, thin_content, duplicate, ok) or multi-label.
23
+ - **Models:** XGBoost/LightGBM on tabular features; optional text classifier for “thin” from content snippet.
24
+
25
+ ## Tech Stack
26
+
27
+ | Category | Tools |
28
+ |----------|------|
29
+ | ML | scikit-learn, XGBoost |
30
+ | Data | pandas, NumPy |
31
+ | Evaluation | sklearn metrics |
32
+
33
+ ## Setup
34
+
35
+ ```bash
36
+ pip install -r requirements.txt
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```bash
42
+ python train.py
43
+ python inference.py --input data/crawl_export.csv --output issues.csv
44
+ ```
45
+
46
+ ## Project structure
47
+
48
+ ```
49
+ 07_technical-seo-detector/
50
+ ├── config.py
51
+ ├── train.py # XGBoost classifier for issue_type
52
+ ├── inference.py # Predict issue type for crawl export
53
+ ├── requirements.txt
54
+ ├── .env.example
55
+ ├── data/
56
+ │ └── crawl_export.csv # Sample: features + issue_type
57
+ └── models/
58
+ ```
59
+
60
+ ## Data
61
+
62
+ - **Sample data (included):** `data/crawl_export.csv` — columns: `redirect_chain_length`, `content_length`, `status_code`, `word_count`, `internal_links`, `issue_type` (e.g. `ok`, `redirect_chain`, `thin_content`).
63
+ - Set `DATA_PATH` in `.env` if using another file.
64
+
65
+ ## License
66
+
67
+ MIT.
config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for Technical-SEO-Detector.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+
7
+ BASE_DIR = Path(__file__).resolve().parent
8
+ DATA_PATH = os.getenv("DATA_PATH", str(BASE_DIR / "data" / "crawl_export.csv"))
9
+ MODEL_DIR = Path(os.getenv("MODEL_DIR", str(BASE_DIR / "models")))
10
+ RANDOM_STATE = int(os.getenv("RANDOM_STATE", "42"))
11
+
12
+ FEATURE_COLUMNS = [
13
+ "redirect_chain_length",
14
+ "content_length",
15
+ "status_code",
16
+ "word_count",
17
+ "internal_links",
18
+ ]
19
+ TARGET_COLUMN = "issue_type"
20
+ ISSUE_LABELS = ["ok", "redirect_chain", "thin_content", "duplicate", "other"]
21
+ MODEL_DIR.mkdir(parents=True, exist_ok=True)
data/.gitkeep ADDED
File without changes
data/crawl_export.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ redirect_chain_length,content_length,status_code,word_count,internal_links,issue_type
2
+ 0,3200,200,580,45,ok
3
+ 2,800,301,120,12,redirect_chain
4
+ 0,450,200,85,8,thin_content
5
+ 0,2800,200,520,38,ok
6
+ 1,1200,302,200,15,redirect_chain
7
+ 0,350,200,60,5,thin_content
8
+ 0,4100,200,780,62,ok
9
+ 0,2900,200,540,42,ok
10
+ 3,600,301,95,10,redirect_chain
11
+ 0,400,200,70,6,thin_content
12
+ 0,3800,200,720,55,ok
13
+ 0,2600,200,490,35,ok
14
+ 0,500,200,90,7,thin_content
15
+ 1,1500,302,250,18,redirect_chain
16
+ 0,3400,200,650,48,ok
17
+ 0,380,200,65,4,thin_content
18
+ 0,3000,200,560,40,ok
19
+ 2,900,301,140,11,redirect_chain
20
+ 0,4200,200,800,58,ok
21
+ 0,2700,200,510,36,ok
22
+ 0,550,200,100,9,thin_content
inference.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Predict technical SEO issue type for crawl export rows.
3
+ """
4
+ import argparse
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ import joblib
9
+
10
+ from config import MODEL_DIR, FEATURE_COLUMNS
11
+
12
+
13
+ def main():
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument("--input", default="data/crawl_export.csv")
16
+ parser.add_argument("--output", default="issues.csv")
17
+ args = parser.parse_args()
18
+
19
+ if not Path(args.input).exists():
20
+ raise FileNotFoundError(f"Input file not found: {args.input}. Run from project root or use an absolute path.")
21
+ if not (MODEL_DIR / "issue_classifier.joblib").exists():
22
+ raise FileNotFoundError(f"Run train.py first. No model in {MODEL_DIR}")
23
+
24
+ model = joblib.load(MODEL_DIR / "issue_classifier.joblib")
25
+ labels = joblib.load(MODEL_DIR / "issue_labels.joblib")
26
+ features = joblib.load(MODEL_DIR / "feature_columns.joblib") if (MODEL_DIR / "feature_columns.joblib").exists() else [c for c in FEATURE_COLUMNS if c in pd.read_csv(args.input).columns]
27
+
28
+ df = pd.read_csv(args.input)
29
+ X = df[[c for c in features if c in df.columns]].fillna(0)
30
+ df["pred_issue_type"] = [labels[i] for i in model.predict(X)]
31
+
32
+ Path(args.output).parent.mkdir(parents=True, exist_ok=True)
33
+ df.to_csv(args.output, index=False)
34
+ print(f"Saved to {args.output}")
35
+
36
+
37
+ if __name__ == "__main__":
38
+ main()
models/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Technical-SEO-Detector
2
+ # Python 3.9+
3
+
4
+ pandas>=1.3.0
5
+ numpy>=1.21.0
6
+ scikit-learn>=1.0.0
7
+ xgboost>=1.5.0
8
+ joblib>=1.1.0
train.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train technical SEO issue classifier.
3
+ """
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import classification_report, f1_score
10
+ import xgboost as xgb
11
+ import joblib
12
+
13
+ from config import (
14
+ DATA_PATH,
15
+ MODEL_DIR,
16
+ RANDOM_STATE,
17
+ FEATURE_COLUMNS,
18
+ TARGET_COLUMN,
19
+ ISSUE_LABELS,
20
+ )
21
+
22
+
23
+ def main():
24
+ if not Path(DATA_PATH).exists():
25
+ print(f"Data not found at {DATA_PATH}. Create data/crawl_export.csv with features and {TARGET_COLUMN}.")
26
+ return
27
+
28
+ df = pd.read_csv(DATA_PATH)
29
+ features = [c for c in FEATURE_COLUMNS if c in df.columns]
30
+ if not features or TARGET_COLUMN not in df.columns:
31
+ raise ValueError(f"Need features {FEATURE_COLUMNS} and {TARGET_COLUMN}")
32
+
33
+ X = df[features].fillna(0)
34
+ y = df[TARGET_COLUMN].astype("category").cat.codes
35
+
36
+ X_train, X_val, y_train, y_val = train_test_split(
37
+ X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
38
+ )
39
+ model = xgb.XGBClassifier(random_state=RANDOM_STATE, n_estimators=100)
40
+ model.fit(X_train, y_train)
41
+ pred = model.predict(X_val)
42
+
43
+ labels = list(df[TARGET_COLUMN].astype("category").cat.categories)
44
+ report = classification_report(y_val, pred, target_names=labels, output_dict=True)
45
+ metrics = {"macro_f1": float(f1_score(y_val, pred, average="macro"))}
46
+ joblib.dump(model, MODEL_DIR / "issue_classifier.joblib")
47
+ joblib.dump(labels, MODEL_DIR / "issue_labels.joblib")
48
+ joblib.dump(features, MODEL_DIR / "feature_columns.joblib")
49
+ with open(MODEL_DIR / "metrics.json", "w") as f:
50
+ json.dump({"metrics": metrics, "report": report}, f, indent=2)
51
+ print("Metrics:", metrics)
52
+
53
+
54
+ if __name__ == "__main__":
55
+ main()