Spaces:
Runtime error
Runtime error
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| import numpy as np | |
| from sklearn.datasets import make_classification | |
class MalwareDataset(Dataset):
    """Reproducible synthetic dataset for malware detection.

    Each sample combines three pieces:

    * ``static``  - a float32 feature vector produced by scikit-learn's
      ``make_classification`` (a stand-in for PE features),
    * ``dynamic`` - an int64 sequence of random IDs whose range depends on
      the label (a stand-in for API-call traces),
    * ``label``   - the int64 class (0 or 1) from ``make_classification``.

    The data is stored as NumPy arrays and converted to tensors per item.
    """

    def __init__(self, num_samples=1000, static_dim=1024, seq_len=512, random_state=42):
        """Generate the static features, labels and dynamic sequences.

        Args:
            num_samples: Number of samples to generate.
            static_dim: Width of each static feature vector;
                ``static_dim // 10`` of the features are informative.
            seq_len: Length of each dynamic ID sequence.
            random_state: Seed given to ``make_classification`` and used for
                the dynamic sequences, so equal seeds give equal data.
        """
        self.num_samples = num_samples

        # Correlated static features (simulating PE features).
        X, y = make_classification(
            n_samples=num_samples,
            n_features=static_dim,
            n_informative=static_dim // 10,
            n_classes=2,
            random_state=random_state,
        )
        self.static_features = X.astype(np.float32)
        self.labels = y.astype(np.int64)

        # Dynamic features (simulating API call sequences) that differ
        # slightly depending on the label.
        self.dynamic_sequences = self._generate_dynamic_sequences(
            self.labels, seq_len, random_state
        )

    @staticmethod
    def _generate_dynamic_sequences(labels, seq_len, random_state):
        """Return an int64 array with one random ID sequence per label.

        Label 1 rows are drawn from [2000, 5000) and all other rows from
        [0, 3000); the two ranges overlap in [2000, 3000).
        """
        # A private RandomState yields the same stream as calling
        # np.random.seed(random_state) followed by np.random.randint, but it
        # leaves the process-wide NumPy RNG untouched for any other code.
        rng = np.random.RandomState(random_state)
        sequences = np.zeros((len(labels), seq_len), dtype=np.int64)
        # Rows are drawn one at a time, in order, so a given seed keeps
        # producing exactly the same values as the per-row loop always has.
        for row, label in enumerate(labels):
            if label == 1:
                # Malware-like sequence (e.g., higher API IDs)
                low, high = 2000, 5000
            else:
                # Benign-like sequence (e.g., lower API IDs)
                low, high = 0, 3000
            sequences[row] = rng.randint(low, high, seq_len)
        return sequences

    def __len__(self) -> int:
        """Return the number of samples requested at construction time."""
        return self.num_samples

    def __getitem__(self, idx) -> dict:
        """Return sample ``idx`` as a dict of tensors.

        Keys are ``'static'`` (float32 features), ``'dynamic'`` (int64 ID
        sequence) and ``'label'`` (int64 class).
        """
        return {
            'static': torch.tensor(self.static_features[idx]),
            'dynamic': torch.tensor(self.dynamic_sequences[idx]),
            'label': torch.tensor(self.labels[idx]),
        }
def get_dataloaders(batch_size=32, *, train_samples=64, val_samples=32,
                    train_seed=42, val_seed=123):
    """Build the training and validation loaders over synthetic data.

    Args:
        batch_size: Batch size used by both loaders.
        train_samples: Number of samples in the training dataset.
        val_samples: Number of samples in the validation dataset.
        train_seed: ``random_state`` for the training dataset.
        val_seed: ``random_state`` for the validation dataset; it differs
            from ``train_seed`` by default so the two splits are generated
            from different seeds.

    Returns:
        A ``(train_loader, val_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    train_dataset = MalwareDataset(num_samples=train_samples, random_state=train_seed)
    val_dataset = MalwareDataset(num_samples=val_samples, random_state=val_seed)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader