MalwareClassify/LoadData.py at main · Spajed/MalwareClassify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import os
import ember
import numpy as np
import sklearn
import tensorflow as tf
import pickle as pk

from sklearn.decomposition import IncrementalPCA


def init_vectorized_features(dataset_dir: str):
    """
    Required for the generation of '.dat' data files

    :param dataset_dir: directory to the base location of the dataset
    :return:
    """
    try:
        assert(os.path.exists(dataset_dir))

        ember.create_vectorized_features(dataset_dir, 1)
    except AssertionError:
        raise Exception(
            "[ASSERTION ERROR] The path to base directory of dataset provided does not exist"
        )


def dat_to_train_test(dat_dir: str):
    """
    Loading training & testing data from respective generated '.dat' files

    :param dat_dir: directory to the base location where generated '.dat' files are found
    :return:
    """
    try:
        assert('X_train.dat' in os.listdir(dat_dir))
        assert('y_train.dat' in os.listdir(dat_dir))
        assert ('X_test.dat' in os.listdir(dat_dir))
        assert ('y_test.dat' in os.listdir(dat_dir))

        x_train, y_train = ember.read_vectorized_features(dat_dir, subset="train")
        x_test, y_test = ember.read_vectorized_features(dat_dir, subset="test")

        return x_train, y_train, x_test, y_test
    except AssertionError:
        raise Exception(
            "[ASSERTION ERROR] Ensure that the required '.dat' files are found within the specified directory"
        )


def __dataset_generator(data: np.memmap, labels: np.memmap):
    """
    Helper function for conversion from numpy.memmap to tf.data.Dataset
    Create callable generator for tf.data.Dataset.from_generator()

    :param data: numpy.memmap of training data
    :param labels: numpy.memmap of labels corresponding to the training data
    :return:
    """
    # requires nothing to be passed to generator to avoid "TypeError: 'generator' must be callable." error
    def generator():
        for instance, label in zip(data, labels):
            yield instance, label
    return generator


def to_tf_dataset(x_memmap_data: np.memmap, y_memmap_data: np.memmap):
    """
    Convert numpy.memmap to tf.data.Dataset via the creation of generator with helper function '__dataset_generator()'

    :return:
    """
    return tf.data.Dataset.from_generator(__dataset_generator(x_memmap_data, y_memmap_data),
                                          output_types=(x_memmap_data.dtype, y_memmap_data.dtype),
                                          output_shapes=([x_memmap_data.shape[1], ], []))


def __unlabelled(data: tf.Tensor, label: tf.Tensor):
    """
    Helper function to act as a callable conditional statement for tf.data.Dataset.filter()

    Note: data is an unused parameter but is necessary for the proper functionality of this function
    (i.e. do not remove)

    :param data: tensor representation of data within the tf.data.Dataset
    :param label: tensor representation of label for the respective data within tf.data.Dataset
    :return:
    """
    if label == -1.0:
        return False
    return True


def rm_unlabelled_samples(dataset: tf.data.Dataset):
    """
    Filter all unlabelled data instances (label = -1.0) from the tf.data.Dataset passed in as parameter

    :param dataset: dataset in which the unlabelled instances (label = -1.0) are to be filtered out.
    :return:
    """
    return dataset.filter(__unlabelled)


def to_batch_dataset(dataset: tf.data.Dataset, batchsize: int = 100, drop_remainder: bool = False):
    """
    Function for converting from tf.data.Dataset type output by the `from_generator` function to a `BatchDataset`

    :param dataset: Tensorflow dataset generated from the use of `from_generator` Tensorflow function
    :param batchsize: The number of data records to be included in the batches for training
    :param drop_remainder: Boolean for determining whether or not data samples that dont fit in the specified batches
    should be dropped or not
    :return:
    """
    return dataset.batch(batchsize, drop_remainder)

def quantile_normalize_data(dataset: np.memmap):
    """
    Function normalize a dataset using Robust scaling on each sample

    :param dataset: The input dataset in numpy memmap format (Before conversion to Tensor)
    :return: Return the Robust scaled dataset
    """
    scaler = sklearn.preprocessing.QuantileTransformer()
    scaler.fit(dataset)
    pk.dump(scaler, open("scaler.pkl","wb"))
    norm = scaler.transform(dataset)
    return norm

def quantile_normal_normalize_data(dataset: np.memmap):
    """
    Function normalize a dataset using Robust scaling on each sample

    :param dataset: The input dataset in numpy memmap format (Before conversion to Tensor)
    :return: Return the Robust scaled dataset
    """
    scaler = sklearn.preprocessing.QuantileTransformer(output_distribution='normal')
    norm = scaler.fit_transform(dataset)
    return norm

def standardscaler_normalize_data(dataset: np.memmap):
    """
    Function normalize a dataset using Robust scaling on each sample

    :param dataset: The input dataset in numpy memmap format (Before conversion to Tensor)
    :return: Return the Robust scaled dataset
    """
    scaler = sklearn.preprocessing.StandardScaler()
    norm = scaler.fit_transform(dataset)
    return norm

def normalizer_normalize_data(dataset: np.memmap):
    """
    Function normalize a dataset using Robust scaling on each sample

    :param dataset: The input dataset in numpy memmap format (Before conversion to Tensor)
    :return: Return the Robust scaled dataset
    """
    scaler = sklearn.preprocessing.Normalizer()
    norm = scaler.fit_transform(dataset)
    return norm


def dataset_pca_reduction(train_dataset: np.memmap, test_dataset: np.memmap, num_rows_train: int = 800000,
                          num_rows_test: int = 200000, num_components: int = 500, chunk_size: int = 100000):
    """
    Function to perform PCA dimensionality reduction on a numpy array (or memmap)

    :param train_dataset: Numpy memmap containing feature vectors for training set
    :param test_dataset: Numpy memmap containing feature vectors for testing set
    :param num_rows_train: Total number of samples within the training dataset
    :param num_rows_test: Total number of samples within the testing dataset
    :param num_components: Number of dimensions output by PCA
    :param chunk_size: Even number of chunks by which the training dataset is split
    :return: PCA reduction of the original training and testing datasets
    """

    ipca = IncrementalPCA(n_components=num_components, batch_size=chunk_size)

    for i in range(0, len(train_dataset)//chunk_size):
        print(f"chunk {i} out of {len(train_dataset)//chunk_size}")
        ipca.partial_fit(train_dataset[i*chunk_size: (i+1)*chunk_size])
    print("Number of components kept by PCA: " + str(ipca.n_components_))
    pk.dump(ipca, open("pca.pkl","wb"))

    train_pca = np.memmap('train.mmap', dtype='float32', mode='w+', shape=(len(train_dataset), ipca.n_components_))
    for i in range(0, len(train_dataset)//chunk_size):
        train_pca[i*chunk_size: (i+1)*chunk_size] = ipca.transform(train_dataset[i*chunk_size: (i+1)*chunk_size])

    test_pca = np.memmap('test.mmap', dtype='float32', mode='w+', shape=(len(test_dataset), ipca.n_components_))
    for i in range(0, len(test_dataset)//chunk_size):
        test_pca[i*chunk_size: (i+1)*chunk_size] = ipca.transform(test_dataset[i*chunk_size: (i+1)*chunk_size])

    return train_pca, test_pca


def split_data_labels(dataset: tf.data.Dataset, num_samples: int):
    """
    Extract a given number of data samples and their respective labels from a tf.data.Dataset and convert to np.array

    :param dataset: Dataset that the samples are being extracted from
    :param num_samples: Total number of samples desired from dataset
    :return:
    """
    data = list()
    labels = list()

    count = 0
    for instance in dataset.as_numpy_iterator():
        if count == num_samples:
            break
        data.append(instance[0])
        labels.append(instance[1])
        count += 1

    data_array = np.asarray(data)
    labels_array = np.asarray(labels)

    data_nsamples, data_nx, data_ny = data_array.shape
    data_array = data_array.reshape((data_nsamples, data_nx*data_ny))

    return data_array, labels_array