File size: 2,286 Bytes
6c247ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.datasets import make_classification

class MalwareDataset(Dataset):
    """Reproducible synthetic dataset for malware detection.

    Each sample combines two views plus a binary label:

    * ``static``  -- a float32 feature vector produced by
      ``sklearn.datasets.make_classification`` (a stand-in for PE features).
    * ``dynamic`` -- an int64 sequence of random IDs (a stand-in for API call
      sequences); the range the IDs are drawn from depends on the label.
    * ``label``   -- int64 class label, 0 or 1.

    Args:
        num_samples: Number of samples to generate.
        static_dim: Length of each static feature vector.
        seq_len: Length of each dynamic ID sequence.
        random_state: Seed used for both the static features and the dynamic
            sequences.

    Attributes:
        static_features: float32 array of shape ``(num_samples, static_dim)``.
        dynamic_sequences: int64 array of shape ``(num_samples, seq_len)``.
        labels: int64 array of shape ``(num_samples,)``.
    """

    def __init__(self, num_samples=1000, static_dim=1024, seq_len=512, random_state=42):
        super().__init__()
        self.num_samples = num_samples

        # Correlated static features. About a tenth of the features are
        # informative; the count is clamped to 2 because make_classification
        # rejects fewer informative features than log2(classes * clusters),
        # which made small static_dim values (< 20) fail before.
        features, targets = make_classification(
            n_samples=num_samples,
            n_features=static_dim,
            n_informative=max(2, static_dim // 10),
            n_classes=2,
            random_state=random_state,
        )
        self.static_features = features.astype(np.float32)
        self.labels = targets.astype(np.int64)

        # Use a private generator instead of np.random.seed(): seeding the
        # global RNG here would silently reset randomness for the rest of the
        # caller's program. RandomState(seed) yields the same stream that
        # np.random.seed(seed) + np.random.randint produced, so the generated
        # sequences are unchanged.
        rng = np.random.RandomState(random_state)
        self.dynamic_sequences = np.zeros((num_samples, seq_len), dtype=np.int64)

        # Draw row by row (not vectorised) so the order of random draws, and
        # therefore the data for a given seed, stays stable.
        for row, label in enumerate(self.labels):
            if label == 1:
                # Malware-like sequence: higher ID range.
                low, high = 2000, 5000
            else:
                # Benign-like sequence: lower ID range (overlaps 2000-2999).
                low, high = 0, 3000
            self.dynamic_sequences[row] = rng.randint(low, high, seq_len)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Returns:
            A dict with keys ``'static'``, ``'dynamic'`` and ``'label'``
            holding the corresponding entries converted with ``torch.tensor``.
        """
        return {
            'static': torch.tensor(self.static_features[idx]),
            'dynamic': torch.tensor(self.dynamic_sequences[idx]),
            'label': torch.tensor(self.labels[idx]),
        }

def get_dataloaders(batch_size=32, *, num_train=64, num_val=32):
    """Build training and validation loaders over synthetic malware data.

    The two datasets are generated with different fixed seeds (42 for
    training, 123 for validation) so they do not contain the same samples.

    Args:
        batch_size: Batch size used by both loaders.
        num_train: Number of samples in the training dataset.
        num_val: Number of samples in the validation dataset.

    Returns:
        A ``(train_loader, val_loader)`` tuple. The training loader shuffles
        its samples; the validation loader keeps dataset order.
    """
    train_dataset = MalwareDataset(num_samples=num_train, random_state=42)
    val_dataset = MalwareDataset(num_samples=num_val, random_state=123)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader