Numeric Dataset

Generic dataset for numbers
Pytorch
ML
Author

Johannes Tomasoni

Published

February 2, 2023

About

A basic Dataset for numeric features from a Pandas dataframe. If there is enough memory, the dataframe can be converted into a FloatTensor to speed up data access.

The dataframe consists of feature columns, target columns, and a kfold column.

class NumDataset(torch.utils.data.Dataset):
    """Dataset of numeric features (and optional targets) from a pandas DataFrame.

    The selected rows are converted to float tensors once, in ``__init__``,
    so that ``__getitem__`` is a plain tensor index instead of a DataFrame
    lookup.
    """

    def __init__(self, df, features=None, targets: list = None, kfolds=None):
        """Select the rows of the requested folds and store them as tensors.

        Args:
            df: DataFrame holding the feature columns, the target columns
                and a ``kfold`` column.
            features: Names of the feature columns.
            targets: Names of the target columns. When empty or ``None``,
                every sample gets a zero placeholder target.
            kfolds: Values of the ``kfold`` column to keep. Defaults to
                ``[0]``.

        Raises:
            KeyError: If a requested column is missing from ``df``; this
                includes ``kfold`` whenever ``targets`` are given.
        """
        features = [] if features is None else features
        kfolds = [0] if kfolds is None else kfolds

        # Features and targets must come from the SAME rows, otherwise
        # sample i would pair the features of one row with the targets of
        # another and __len__ would not match the number of targets.
        # A frame without targets and without a kfold column (e.g. a test
        # set) is used in full.
        if targets or 'kfold' in df.columns:
            rows = df[df['kfold'].isin(kfolds)]
        else:
            rows = df

        if targets:
            self.targets = torch.FloatTensor(
                rows[targets].values.reshape(-1, len(targets))
            )
        else:
            # One zero placeholder per sample so indexing still works.
            self.targets = torch.zeros(len(rows))

        # Convert Dataframe into FloatTensor for speed up
        self.df = torch.FloatTensor(
            rows[features].values.reshape(-1, len(features))
        )

    def __len__(self):
        """Return the number of selected rows."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the ``(features, targets)`` pair stored at index ``item``."""
        return self.df[item], self.targets[item]
        

To use a dict in Module.forward, define the dataset's __getitem__ like this:

....    
    def __getitem__(self, item):
        
        return {'data': self.df[item], 'targets': self.targets[item]}
....

class MyModel(nn.Module):
  ....
  def forward(self, data, targets):
    ....

DataLoader in Fastai:

# Training set: folds 1-4. Validation set: the held-out fold 0.
train_ds = NumDataset(
    df=train,
    features=FEATURES,
    targets=TARGETS,
    kfolds=[1, 2, 3, 4],
)
valid_ds = NumDataset(
    df=train,
    features=FEATURES,
    targets=TARGETS,
    kfolds=[0],
)

# Only the training loader shuffles its samples.
train_dl = DataLoader(
    train_ds,
    batch_size=BS_TRAIN,
    shuffle=True,
    num_workers=WORKERS,
)
valid_dl = DataLoader(
    valid_ds,
    batch_size=BS_TEST,
    shuffle=False,
    num_workers=WORKERS,
)

# Bundle both loaders and call .cuda() on the bundle.
data = DataLoaders(train_dl, valid_dl).cuda()

Training in Fastai:

# Wrap the data, model, loss function and metrics in a Learner.
learn = Learner(
    data,
    MYMODEL,
    loss_func=MYLOSS,
    metrics=MYMETRICS,
)

# Train for one epoch with a maximum learning rate of 1e-3.
learn.fit_one_cycle(1, 1e-3)