Alireza Aminzadeh committed on
Commit
fe67d4b
·
verified ·
1 Parent(s): bc11697

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. .env.example +5 -0
  2. README.md +69 -0
  3. config.py +16 -0
  4. data/.gitkeep +0 -0
  5. data/query_intent.csv +20 -0
  6. inference.py +55 -0
  7. models/.gitkeep +0 -0
  8. requirements.txt +10 -0
  9. train.py +101 -0
.env.example ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # QueryIntent-Entity-NER
2
+ DATA_PATH=data/query_intent.csv
3
+ MODEL_DIR=models
4
+ HF_MODEL=bert-base-uncased
5
+ RANDOM_STATE=42
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QueryIntent-Entity-NER: Query Intent and Entity Extraction for SEO
2
+
3
+ **Type:** Academic | **Domain:** SEO, NLP
4
+ **Hugging Face:** [syeedalireza/query-intent-entity-ner](https://huggingface.co/syeedalireza/query-intent-entity-ner)
5
+
6
+ Multi-task: query intent classification and named-entity extraction for search content planning.
7
+
8
+ ## Author
9
+
10
+ **Alireza Aminzadeh**
11
+ - Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
12
+ - LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
13
+ - Email: alireza.aminzadeh@hotmail.com
14
+
15
+ ## Problem
16
+
17
+ Understanding intent (informational, navigational, transactional) and key entities in queries improves content and keyword strategy.
18
+
19
+ ## Approach
20
+
21
+ - **Intent:** Multi-class (e.g. informational / navigational / transactional / commercial).
22
+ - **Entities:** NER (ORG, PRODUCT, LOC, etc.) from query text.
23
+ - **Models:** Hugging Face transformer (e.g. BERT) for sequence classification + token classification, or pipeline with spaCy/transformers.
24
+
25
+ ## Tech Stack
26
+
27
+ | Category | Tools |
28
+ |----------|------|
29
+ | NLP | Hugging Face Transformers, tokenizers |
30
+ | NER | spaCy (optional), transformers NER head |
31
+ | ML | PyTorch, scikit-learn |
32
+ | Data | pandas, NumPy |
33
+
34
+ ## Setup
35
+
36
+ ```bash
37
+ pip install -r requirements.txt
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ ```bash
43
+ python train.py
44
+ python inference.py --query "best running shoes for flat feet"
45
+ ```
46
+
47
+ ## Project structure
48
+
49
+ ```
50
+ 03_query-intent-entity-ner/
51
+ ├── config.py
52
+ ├── train.py # BERT (or HF) sequence classification
53
+ ├── inference.py # Single query or batch CSV; CPU/GPU auto
54
+ ├── requirements.txt
55
+ ├── .env.example
56
+ ├── data/
57
+ │ └── query_intent.csv # Sample: query, intent
58
+ └── models/
59
+ ```
60
+
61
+ ## Data
62
+
63
+ - **Sample data (included):** `data/query_intent.csv` — columns: `query`, `intent`.
64
+ - **Intent labels:** `informational`, `navigational`, `transactional`, `commercial`.
65
+ - Set `DATA_PATH` in `.env` if using another file.
66
+
67
+ ## License
68
+
69
+ MIT.
config.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration for QueryIntent-Entity-NER.

Values may be overridden through environment variables or a `.env` file
next to this module (see `.env.example`). Real environment variables take
precedence over `.env` entries.
"""
import os
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent


def _load_dotenv(path: Path) -> None:
    """Best-effort parse of KEY=VALUE lines from *path* into os.environ.

    Existing environment variables are never overwritten; blank lines,
    `#` comments, and lines without `=` are skipped. This avoids a
    third-party python-dotenv dependency.
    """
    if not path.exists():
        return
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        os.environ.setdefault(key.strip(), value.strip())


# README tells users to configure via .env — actually load it before reading env.
_load_dotenv(BASE_DIR / ".env")

DATA_PATH = os.getenv("DATA_PATH", str(BASE_DIR / "data" / "query_intent.csv"))
MODEL_DIR = Path(os.getenv("MODEL_DIR", str(BASE_DIR / "models")))
HF_MODEL = os.getenv("HF_MODEL", "bert-base-uncased")
RANDOM_STATE = int(os.getenv("RANDOM_STATE", "42"))

# Label order defines the integer class ids shared by train.py and inference.py.
INTENT_LABELS = ["informational", "navigational", "transactional", "commercial"]
QUERY_COLUMN = "query"
INTENT_COLUMN = "intent"

# Created eagerly so train.py can write checkpoints without extra checks.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
data/.gitkeep ADDED
File without changes
data/query_intent.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ query,intent
2
+ what is seo,informational
3
+ how to rank on google,informational
4
+ nike official store,navigational
5
+ buy wireless headphones,transactional
6
+ best CRM software comparison,commercial
7
+ python documentation,navigational
8
+ why is the sky blue,informational
9
+ cheap flight tickets,transactional
10
+ difference between http and https,informational
11
+ amazon login,navigational
12
+ best project management tools,commercial
13
+ how to install docker,informational
14
+ shop running shoes,transactional
15
+ linkedin homepage,navigational
16
+ what is tensorflow,informational
17
+ best antivirus 2024,commercial
18
+ how to make pasta,informational
19
+ netflix sign in,navigational
20
+ buy domain name,transactional
inference.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Predict query intent for one or more queries.
3
+ """
4
+ import argparse
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+
11
+ from config import MODEL_DIR, INTENT_LABELS
12
+
13
+
14
def main():
    """CLI entry point: classify one query (--query) or a CSV of queries (--input).

    Loads the fine-tuned model from MODEL_DIR; raises FileNotFoundError if
    training has not produced a model yet. Batch output is written to --output.
    """
    parser = argparse.ArgumentParser(description="Predict query intent.")
    parser.add_argument("--query", type=str, help="Single query")
    parser.add_argument("--input", type=str, help="CSV with 'query' column")
    parser.add_argument("--output", type=str, default="predictions.csv")
    args = parser.parse_args()

    if not (MODEL_DIR / "config.json").exists():
        raise FileNotFoundError(f"Train first. No model in {MODEL_DIR}")

    tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR))
    model = AutoModelForSequenceClassification.from_pretrained(str(MODEL_DIR))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()  # from_pretrained already returns eval mode; explicit for clarity

    def predict(queries: list[str], batch_size: int = 64) -> list[int]:
        """Return predicted label indices for *queries*.

        Tokenizes in fixed-size batches so a large input CSV cannot exhaust
        memory (the previous single-batch call also failed on empty input).
        """
        indices: list[int] = []
        for start in range(0, len(queries), batch_size):
            batch = queries[start:start + batch_size]
            enc = tokenizer(batch, truncation=True, max_length=128, padding=True, return_tensors="pt")
            enc = {k: v.to(device) for k, v in enc.items()}
            with torch.no_grad():
                logits = model(**enc).logits
            indices.extend(logits.argmax(dim=1).tolist())
        return indices

    if args.query:
        idx = predict([args.query])[0]
        print({"query": args.query, "intent": INTENT_LABELS[idx]})
        return

    if args.input and Path(args.input).exists():
        df = pd.read_csv(args.input)
        if "query" not in df.columns:
            raise ValueError("CSV must have 'query' column")
        indices = predict(df["query"].astype(str).tolist())
        df["intent"] = [INTENT_LABELS[i] for i in indices]
        df.to_csv(args.output, index=False)
        print(f"Saved to {args.output}")
        return

    print("Use --query 'text' or --input file.csv")


if __name__ == "__main__":
    main()
models/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # QueryIntent-Entity-NER
2
+ # Python 3.9+
3
+
4
+ torch>=1.12.0
5
+ transformers>=4.20.0
6
+ tokenizers>=0.12.0
7
+ datasets>=2.0.0
8
+ scikit-learn>=1.0.0
9
+ pandas>=1.3.0
10
+ numpy>=1.21.0
train.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train query intent classifier.
3
+ Uses Hugging Face transformer for sequence classification.
4
+ """
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.metrics import classification_report, f1_score
11
+ from transformers import (
12
+ AutoTokenizer,
13
+ AutoModelForSequenceClassification,
14
+ TrainingArguments,
15
+ Trainer,
16
+ )
17
+ from datasets import Dataset
18
+
19
+ from config import (
20
+ DATA_PATH,
21
+ MODEL_DIR,
22
+ RANDOM_STATE,
23
+ HF_MODEL,
24
+ QUERY_COLUMN,
25
+ INTENT_COLUMN,
26
+ INTENT_LABELS,
27
+ )
28
+
29
+
30
def load_data(path: str) -> pd.DataFrame:
    """Load the training CSV and verify the query/intent columns are present.

    Raises ValueError when either required column is missing.
    """
    frame = pd.read_csv(path)
    has_required = QUERY_COLUMN in frame.columns and INTENT_COLUMN in frame.columns
    if not has_required:
        raise ValueError(f"Need columns: {QUERY_COLUMN}, {INTENT_COLUMN}")
    return frame
35
+
36
+
37
def main():
    """Fine-tune HF_MODEL for intent classification; save model, tokenizer, metrics.

    Reads DATA_PATH (columns: query, intent), trains with a stratified 80/20
    split, writes the best model to MODEL_DIR and a classification report to
    MODEL_DIR/metrics.json.
    """
    if not Path(DATA_PATH).exists():
        print(f"Data not found at {DATA_PATH}. Create data/query_intent.csv with query, intent.")
        return

    df = load_data(DATA_PATH)

    # Fail fast on intents outside INTENT_LABELS: pd.Categorical silently maps
    # them to code -1, which later crashes training with an opaque index error.
    unknown = sorted(set(df[INTENT_COLUMN].astype(str)) - set(INTENT_LABELS))
    if unknown:
        raise ValueError(f"Unknown intent values {unknown}; allowed: {INTENT_LABELS}")

    df[INTENT_COLUMN] = pd.Categorical(df[INTENT_COLUMN], categories=INTENT_LABELS)
    df["label"] = df[INTENT_COLUMN].cat.codes

    # Stratify so every intent appears in both splits (the sample data is tiny).
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE, stratify=df["label"]
    )

    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        HF_MODEL, num_labels=len(INTENT_LABELS)
    )

    def tokenize(examples):
        # padding="max_length" keeps tensors rectangular without a data collator.
        return tokenizer(examples[QUERY_COLUMN], truncation=True, max_length=128, padding="max_length")

    # preserve_index=False stops from_pandas from adding a spurious
    # __index_level_0__ column derived from the split frames' indices.
    train_ds = Dataset.from_pandas(train_df[[QUERY_COLUMN, "label"]], preserve_index=False)
    val_ds = Dataset.from_pandas(val_df[[QUERY_COLUMN, "label"]], preserve_index=False)
    train_ds = train_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    val_ds = val_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    train_ds.set_format("torch")
    val_ds.set_format("torch")

    args = TrainingArguments(
        output_dir=str(MODEL_DIR),
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    def compute_metrics(eval_pred):
        # Macro F1 weights all intents equally despite class imbalance.
        preds = eval_pred.predictions.argmax(axis=1)
        return {"f1": float(f1_score(eval_pred.label_ids, preds, average="macro"))}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(str(MODEL_DIR))
    tokenizer.save_pretrained(str(MODEL_DIR))

    # Final held-out report. Pass labels= so rows stay aligned with
    # INTENT_LABELS even when a class is absent from the tiny validation
    # split; zero_division=0 avoids warnings for classes never predicted.
    pred = trainer.predict(val_ds)
    y_true = val_df["label"].values
    y_pred = pred.predictions.argmax(axis=1)
    label_ids = list(range(len(INTENT_LABELS)))
    report = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=INTENT_LABELS,
        output_dict=True,
        zero_division=0,
    )
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump({"classification_report": report}, f, indent=2)
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=INTENT_LABELS, zero_division=0))


if __name__ == "__main__":
    main()