ZeroDay-Sentinel / src /data_loader.py
Alireza Aminzadeh
Upload folder using huggingface_hub
6c247ff verified
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.datasets import make_classification
class MalwareDataset(Dataset):
    """Reproducible synthetic dataset for malware detection.

    Every sample consists of:

    * ``static``  - a float32 feature vector generated with
      ``sklearn.datasets.make_classification`` (stand-in for PE features);
    * ``dynamic`` - an int64 sequence of random IDs (stand-in for an API
      call trace) whose value range depends on the sample's label;
    * ``label``   - the int64 class label (0 or 1) from
      ``make_classification``.

    Args:
        num_samples: Number of samples to generate.
        static_dim: Length of each static feature vector. One tenth of the
            features (integer division) are marked informative.
        seq_len: Length of each dynamic ID sequence.
        random_state: Seed for both the static features and the dynamic
            sequences.
    """

    # Half-open [low, high) ranges the dynamic IDs are drawn from. The two
    # ranges overlap on purpose, so the classes are only "slightly different".
    MALWARE_ID_RANGE = (2000, 5000)
    BENIGN_ID_RANGE = (0, 3000)

    def __init__(self, num_samples=1000, static_dim=1024, seq_len=512, random_state=42):
        """Generate all static features, labels and dynamic sequences up front."""
        super().__init__()
        self.num_samples = num_samples

        # Correlated static features (simulating PE features).
        X, y = make_classification(
            n_samples=num_samples,
            n_features=static_dim,
            n_informative=static_dim // 10,
            n_classes=2,
            random_state=random_state,
        )
        self.static_features = X.astype(np.float32)
        self.labels = y.astype(np.int64)

        # Use a private generator instead of np.random.seed(): reseeding the
        # global NumPy RNG would be a hidden side effect on every other user
        # of np.random in the process. RandomState(seed) yields the same
        # stream the seeded global generator would, so the data is unchanged.
        rng = np.random.RandomState(random_state)

        # Dynamic features (simulating API call sequences), drawn row by row
        # so the random stream is consumed in sample order.
        self.dynamic_sequences = np.zeros((num_samples, seq_len), dtype=np.int64)
        for i in range(num_samples):
            if self.labels[i] == 1:
                low, high = self.MALWARE_ID_RANGE
            else:
                low, high = self.BENIGN_ID_RANGE
            self.dynamic_sequences[i] = rng.randint(low, high, seq_len)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Keys are ``'static'``, ``'dynamic'`` and ``'label'``; each value is a
        fresh tensor copied from the underlying NumPy storage.
        """
        return {
            'static': torch.tensor(self.static_features[idx]),
            'dynamic': torch.tensor(self.dynamic_sequences[idx]),
            'label': torch.tensor(self.labels[idx]),
        }
def get_dataloaders(batch_size=32, *, train_samples=64, val_samples=32):
    """Build the training and validation loaders over synthetic data.

    The two datasets use different fixed seeds (42 for training, 123 for
    validation) so they do not contain the same samples.

    Args:
        batch_size: Batch size used by both loaders.
        train_samples: Number of synthetic samples in the training set.
        val_samples: Number of synthetic samples in the validation set.

    Returns:
        A ``(train_loader, val_loader)`` tuple. The training loader shuffles
        every epoch; the validation loader keeps a fixed order.
    """
    train_dataset = MalwareDataset(num_samples=train_samples, random_state=42)
    val_dataset = MalwareDataset(num_samples=val_samples, random_state=123)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader