diff --git a/Group4Presentation.pdf b/Group4Presentation.pdf
new file mode 100644
index 0000000..fa8c338
Binary files /dev/null and b/Group4Presentation.pdf differ
diff --git a/HomeworkGroup4.pdf b/HomeworkGroup4.pdf
new file mode 100644
index 0000000..c9d3586
Binary files /dev/null and b/HomeworkGroup4.pdf differ
diff --git a/MLOspiderboost.py b/MLOspiderboost.py
new file mode 100644
index 0000000..814fd0f
--- /dev/null
+++ b/MLOspiderboost.py
@@ -0,0 +1,218 @@
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+class LeNet5(nn.Module):
+    """LeNet-5-style CNN for 28x28 single-channel inputs."""
+
+    def __init__(self):
+        super(LeNet5, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5, 1)
+        self.conv2 = nn.Conv2d(6, 16, 5, 1)
+        self.fc1 = nn.Linear(256, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 256)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return F.log_softmax(x, dim=1)
+
+
+class Net_FC(nn.Module):
+    """Two-layer fully connected baseline network."""
+
+    def __init__(self):
+        super(Net_FC, self).__init__()
+        self.fc1 = nn.Linear(784, 500)
+        self.fc2 = nn.Linear(500, 10)
+
+    def forward(self, x):
+        x = x.view(-1, 784)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train_momentum_spiderboost(model, device, train_loader_xi, optimizer, epoch, b_sz):
+    """One epoch of proximal SpiderBoost with an l1 soft-thresholding step.
+
+    Every q iterations the variance-reduced estimate v is reset to the plain
+    mini-batch gradient; in between it follows the SPIDER recursion
+    v_k = grad(x_k) - grad(x_{k-1}) + v_{k-1}. On non-restart iterations the
+    parameters are updated by the proximal gradient step
+    x_{k+1} = soft_threshold(x_k - eta * v_k, lmbda). b_sz is the mini-batch
+    size (informational here; batching is done by the DataLoader).
+    """
+    model.train()
+
+    q = 64           # restart period of the SPIDER estimator
+    eta = 0.05       # step size
+    lmbda = 0.00001  # l1 regularization weight
+    v = {}           # variance-reduced gradient estimate
+    v2 = {}          # previous iterate x_{k-1}
+    step = 0
+    for batch_idx, (data, target) in enumerate(train_loader_xi):
+
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+
+        if step % q == 0:
+            # Restart: take the mini-batch gradient as the estimate. The
+            # .clone() calls matter: without them v and v2 would alias the
+            # gradient/parameter buffers and be overwritten in place below.
+            for name, p in model.named_parameters():
+                v[name] = p.grad.data.clone()
+                v2[name] = p.data.clone()
+        else:
+            # v1 stores gradients, v2 stores the old iterate x_{k-1},
+            # and v3 stores the current iterate x_k.
+            v1 = {}
+            v3 = {}
+            for name, p in model.named_parameters():
+                v1[name] = p.grad.data.clone()
+                v3[name] = p.data.clone()
+                p.data.copy_(v2[name])  # roll back to x_{k-1}
+
+            # Gradient of the same mini-batch at the previous iterate.
+            optimizer.zero_grad()
+            output = model(data)
+            loss = F.nll_loss(output, target)
+            loss.backward()
+
+            # SPIDER recursion: v_k = grad(x_k) - grad(x_{k-1}) + v_{k-1}.
+            # F.nll_loss already averages over the mini-batch, so no extra
+            # 1/B scaling is needed.
+            for name, p in model.named_parameters():
+                v1[name].add_(-p.grad.data)
+                v1[name].add_(v[name])
+            v = v1
+
+            # Restore the current iterate x_k and re-evaluate there so the
+            # printed loss and the gradients used by stationarityViolation()
+            # refer to the current point.
+            for name, p in model.named_parameters():
+                p.data.copy_(v3[name])
+
+            optimizer.zero_grad()
+            output = model(data)
+            loss = F.nll_loss(output, target)
+            loss.backward()
+
+            # Proximal gradient step: shrink toward zero by lmbda
+            # (soft thresholding), which keeps the iterates sparse.
+            for name, p in model.named_parameters():
+                p.data.add_(-eta * v[name])
+                p.data.copy_(torch.sign(p.data) *
+                             torch.clamp(torch.abs(p.data) - lmbda, min=0))
+            v2 = v3
+
+        step += 1
+
+        if batch_idx % 200 == 0:
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader_xi.dataset),
+                100. * batch_idx / len(train_loader_xi), loss.item()))
+
+
+def test(model, device, test_loader, y, epoch):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    test_loss /= len(test_loader.dataset)
+
+    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset),
+        100. * correct / len(test_loader.dataset)))
+    y[epoch] = 100. * correct / len(test_loader.dataset)
+
+
+def modelSparsity(model):
+    """Fraction of model parameters that are exactly zero."""
+    nonzeros = 0
+    parameterCount = 0
+
+    for p in model.parameters():
+        nonzeros += (p.data != 0).sum().item()
+        parameterCount += p.numel()
+
+    return 1 - (nonzeros / parameterCount)
+
+
+def stationarityViolation(model):
+    """Norm of the proximal gradient mapping, used as a stationarity measure."""
+    lmbda = 0.00001
+    pieces = []
+    for p in model.parameters():
+        diff = torch.flatten(p.data - p.grad.data)
+        pieces.append(torch.sign(diff) * torch.clamp(torch.abs(diff) - lmbda, min=0))
+    return float(torch.linalg.norm(torch.cat(pieces)))
+
+
+def main():
+    # Training settings
+    use_cuda = False  # if your machine has a CUDA-compatible GPU, you can change this to True
+
+    torch.manual_seed(20200930)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+
+    train_loader_xi = torch.utils.data.DataLoader(
+        datasets.FashionMNIST('data', train=True, download=True,
+                              transform=transforms.Compose([
+                                  transforms.ToTensor(),
+                                  transforms.Normalize((0.1307,), (0.3081,))
+                              ])),
+        batch_size=64, shuffle=True, **kwargs)
+
+    test_loader = torch.utils.data.DataLoader(
+        datasets.FashionMNIST('data', train=False, transform=transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.1307,), (0.3081,))
+        ])),
+        batch_size=1000, shuffle=True, **kwargs)
+
+    #model = Net_FC().to(device)
+
+    model = LeNet5().to(device)
+
+    # The optimizer is only used to zero gradients; updates are applied manually.
+    optimizer = optim.SGD(model.parameters(), lr=0.01)
+
+    iter = 0  # step counter for the (commented-out) baseline train() loop
+
+    b_sz = 64
+
+    totalEpochs = 15
+    x = np.zeros(totalEpochs)  # epoch index
+    y = np.zeros(totalEpochs)  # test accuracy
+    z = np.zeros(totalEpochs)  # sparsity
+    w = np.zeros(totalEpochs)  # stationarity violation
+    for epoch in range(totalEpochs):
+        #train(model, device, train_loader, optimizer, epoch, iter)
+        train_momentum_spiderboost(model, device,
+                                   train_loader_xi, optimizer, epoch, b_sz)
+        test(model, device, test_loader, y, epoch)
+        iter += 60000//64
+
+        x[epoch] = epoch
+        z[epoch] = modelSparsity(model)
+        w[epoch] = stationarityViolation(model)
+
+    fig, axs = plt.subplots(2, 1)
+    fig.suptitle("Accuracy, sparsity and violation")
+    axs[0].plot(x, y, '+')
+    axs[0].set_ylim(75, 90)  # plt.ylim() would act on the last-used axes
+    axs[1].plot(x, z, '*')
+    #axs[2].plot(x, w, '+')
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
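
For reviewers who want the update rule in isolation: below is a minimal standalone sketch of the two ingredients the patch combines, the SPIDER variance-reduction recursion and the l1 soft-thresholding proximal step, applied to a toy least-squares problem. The helper name soft_threshold and the toy data are illustrative only and do not appear in the patch; note that with step size eta the textbook prox threshold is eta*lmbda, whereas the patch applies lmbda directly.

import torch

def soft_threshold(x, lmbda):
    # illustrative helper (not part of the patch)
    # prox of lmbda*||.||_1: sign(x) * max(|x| - lmbda, 0)
    return torch.sign(x) * torch.clamp(torch.abs(x) - lmbda, min=0)

# toy problem: minimize 0.5/n * ||A w - b||^2 + lmbda * ||w||_1
torch.manual_seed(0)
A, b = torch.randn(256, 10), torch.randn(256)
w = torch.zeros(10)
eta, lmbda, q = 0.05, 0.001, 8

def minibatch_grad(w, idx):
    Ai, bi = A[idx], b[idx]
    return Ai.t() @ (Ai @ w - bi) / len(idx)

w_prev, v = w.clone(), None
for k in range(200):
    idx = torch.randint(0, 256, (32,))
    if k % q == 0:
        v = minibatch_grad(w, idx)                                    # restart
    else:
        v = minibatch_grad(w, idx) - minibatch_grad(w_prev, idx) + v  # SPIDER
    w_prev = w.clone()
    w = soft_threshold(w - eta * v, eta * lmbda)  # proximal gradient step

print("nonzero coords:", int((w != 0).sum().item()))

Because consecutive iterates are close, the difference of the two mini-batch gradients is small, so the recursion keeps most of the accuracy of the restart gradient while paying only one extra gradient evaluation per step.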