Preprocess

We have three kinds of datasets:

train: for training
dev: for validation
test: for testing (w/o target value)

Dataset

The COVID19Dataset below does:

read .csv files
extract features
split covid.train.csv into train/dev sets
normalize features

Finishing TODO below might make you pass medium baseline.

图片1.png

0.设置seed

myseed = 42069  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

1.载入数据

tr_path = 'covid.train.csv'  # path to training data
tt_path = 'covid.test.csv'   # path to testing data

2.处理数据

输入参数：

init(self,path,mode =’train’,target_only = ‘False’):

处理流程：

read .csv files
extract features
split covid.train.csv into train/dev sets
normalize features
Generates a dataset, then is put into a dataloader.
要点：

注意数据规范化，减均值除方差即可；
训练时划分训练和测试集；
train的时候需要shuffle，测试的时候不需要（保证测试结果可复现）；
dataloader相当于一个“字典”，包括数据集合一些参数； ```python
PyTorch
import torch from torch.utils.data import Dataset, DataLoader

For data preprocess

import numpy as np import csv

class COVID19Dataset(Dataset): ‘’’ read .csv files extract features split covid.train.csv into train/dev sets normalize features ‘’’ ‘’’ Dataset for loading and preprocessing the COVID19 dataset ‘’’ def init(self,path,mode =’train’,target_only = ‘False’): self.mode =mode

    with open(path,'r') as fp:
        data = list(csv.reader(fp))
        data = np.array(data[1:])[:,1:].astype(float)
        '''data.shape = 2700*94'''
    if not target_only:
        feats = list(range(93))
    else:
        # TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
        pass
    if mode =='test':
        # Testing data
        # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
        data = data[:,feats]
        self.data = torch.FloatTensor(data)
    else:
        # Training data (train/dev sets)
        # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
        target = data[:,-1]
        data = data[:,feats]
        # Splitting training data into train & dev sets
        if mode == 'train':
            indices = [i for i in range(len(data)) if i % 10 != 0]
        elif mode == 'dev':
            indices = [i for i in range(len(data)) if i % 10 == 0]
        # Convert data into PyTorch tensors
        self.data = torch.FloatTensor(data[indices])
        self.target = torch.FloatTensor(target[indices])
    # Normalize features (you may remove this part to see what will happen)
    self.data[:,40:] = \
        (self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
        / self.data[:, 40:].std(dim=0, keepdim=True)
    self.dim = self.data.shape[1]
    print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'.format(mode, len(self.data), self.dim))
def __getitem__(self, index):
    # Returns one sample at a time
    if self.mode in ['train', 'dev']:
        # For training
        return self.data[index], self.target[index]
    else:
        # For testing (no target)
        return self.data[index]
def __len__(self):
    # Returns the size of the dataset
    return len(self.data)

def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False): ‘’’ Generates a dataset, then is put into a dataloader. ‘’’ dataset = COVID19Dataset(path, mode=mode, target_only=target_only) # Construct dataset dataloader = DataLoader( dataset, batch_size, shuffle=(mode == ‘train’), drop_last=False, num_workers=n_jobs, pin_memory=True) # Construct dataloader return dataloader

<a name="OXhl4"></a>
## 3.建立模型
<a name="T9gb9"></a>
### 输入参数：
input_dim
<a name="WPeII"></a>
### 网络层：
在init中定义网络层；
<a name="LJ8Kb"></a>
### Loss
在init中定义loss；
<a name="xhwAn"></a>
### forward
矩阵相乘
<a name="hIyy5"></a>
### 注意事项：
- BatchNorm1d可以省略正则化和dropout
```python
# PyTorch
import torch.nn as nn
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )
        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')
    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        # print(np.shape(self.net(x)))  #[270*1]
        # print(self.net(x))
        # print(np.shape(self.net(x).squeeze(1)))
        # print(self.net(x).squeeze(1))#[270]
        return self.net(x).squeeze(1)
    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        # TODO: you may implement L2 regularization here
        return self.criterion(pred, target)

4.训练并保存参数

训练的一些参数，可以用args

config = {
    'n_epochs': 3000,                # maximum number of epochs
    'batch_size': 270,               # mini-batch size for dataloader
    'optimizer': 'SGD',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.001,                 # learning rate of SGD
        'momentum': 0.9              # momentum for SGD
    },
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

设置优化器
开始训练

def train(tr_set, dv_set, model, config, device):
    ''' DNN training '''
    n_epochs = config['n_epochs']  # Maximum number of epochs
    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])
    min_mse = 1000.
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())
        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                  .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1
        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break
    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record

验证并保存模型
covid-19.zip

210 - DL | 深度学习

2151 - Pytorch 一个DNN的例子