from typing import Dict, Optional, Tuple

import numpy as np
from sklearn.datasets import make_classification
import torch
from torch.utils.data import Dataset, DataLoader


class MalwareDataset(Dataset):
    """Reproducible synthetic dataset for malware detection.

    Every sample has three parts:

    * ``static``  -- a float32 feature vector of length ``static_dim`` produced
      by ``sklearn.datasets.make_classification`` (stand-in for PE features).
    * ``dynamic`` -- an int64 sequence of length ``seq_len`` of random integer
      IDs (stand-in for an API-call trace) whose value range depends on the
      label.
    * ``label``   -- an int64 class label, 0 or 1, from ``make_classification``.
    """

    def __init__(
        self,
        num_samples: int = 1000,
        static_dim: int = 1024,
        seq_len: int = 512,
        random_state: Optional[int] = 42,
    ) -> None:
        """Generate all samples eagerly and keep them in memory.

        Args:
            num_samples: Number of samples to generate.
            static_dim: Length of each static feature vector. One tenth of it
                (integer division) is passed as ``n_informative``.
            seq_len: Length of each dynamic ID sequence.
            random_state: Seed used for both the static features and the
                dynamic sequences.
        """
        super().__init__()
        self.num_samples = num_samples

        # Correlated static features with a binary label.
        features, targets = make_classification(
            n_samples=num_samples,
            n_features=static_dim,
            n_informative=static_dim // 10,
            n_classes=2,
            random_state=random_state,
        )
        self.static_features = features.astype(np.float32)
        self.labels = targets.astype(np.int64)

        # Use a private generator rather than np.random.seed(): seeding the
        # global generator would reset NumPy's process-wide random state for
        # every other caller. RandomState(seed) yields the same stream as the
        # seeded global generator, so the generated sequences are unchanged.
        rng = np.random.RandomState(random_state)

        self.dynamic_sequences = np.zeros((num_samples, seq_len), dtype=np.int64)
        # Rows are drawn one at a time, in order, so the random stream is
        # consumed deterministically for a given seed and label vector.
        for row, label in enumerate(self.labels):
            if label == 1:
                # Malware-like sequence: IDs in [2000, 5000).
                low, high = 2000, 5000
            else:
                # Benign-like sequence: IDs in [0, 3000). The two ranges
                # overlap on [2000, 3000).
                low, high = 0, 3000
            self.dynamic_sequences[row] = rng.randint(low, high, seq_len)

    def __len__(self) -> int:
        """Return the number of samples requested at construction time."""
        return self.num_samples

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        """Return sample ``idx`` as a dict with 'static', 'dynamic' and 'label' tensors."""
        return {
            'static': torch.tensor(self.static_features[idx]),
            'dynamic': torch.tensor(self.dynamic_sequences[idx]),
            'label': torch.tensor(self.labels[idx]),
        }


def get_dataloaders(
    batch_size: int = 32,
    *,
    train_samples: int = 64,
    val_samples: int = 32,
) -> Tuple[DataLoader, DataLoader]:
    """Build the training and validation loaders over synthetic data.

    The two datasets use different fixed seeds (42 for training, 123 for
    validation) so they do not contain the same samples.

    Args:
        batch_size: Batch size used by both loaders.
        train_samples: Number of samples in the training dataset.
        val_samples: Number of samples in the validation dataset.

    Returns:
        ``(train_loader, val_loader)``. Only the training loader shuffles.
    """
    train_dataset = MalwareDataset(num_samples=train_samples, random_state=42)
    val_dataset = MalwareDataset(num_samples=val_samples, random_state=123)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader