PPCA-AIPacMan-2024/machinelearning/backend.py

import collections
import os
import time

import matplotlib.pyplot as plt
import numpy as np

from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader

use_graphics = True

def maybe_sleep_and_close(seconds):
    if use_graphics and plt.get_fignums():
        time.sleep(seconds)
        for fignum in plt.get_fignums():
            fig = plt.figure(fignum)
            plt.close(fig)
            try:
                # This raises a TclError on some Windows machines
                fig.canvas.start_event_loop(1e-3)
            except:
                pass

def get_data_path(filename):
    path = os.path.join(
        os.path.dirname(__file__), os.pardir, "data", filename)
    if not os.path.exists(path):
        path = os.path.join(
            os.path.dirname(__file__), "data", filename)
    if not os.path.exists(path):
        path = os.path.join(
            os.path.dirname(__file__), filename)
    if not os.path.exists(path):
        raise Exception("Could not find data file: {}".format(filename))
    return path

class CustomDataset(Dataset):
    def __init__(self, x, y, transform=None):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        sample = {'x': x, 'label': y}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def get_validation_accuracy(self):
        raise NotImplementedError(
            "No validation data is available for this dataset. "
            "In this assignment, only the Digit Classification and Language "
            "Identification datasets have validation data.")

class PerceptronDataset(CustomDataset):
    def __init__(self, model):
        points = 500
        x = np.hstack([np.random.randn(points, 2), np.ones((points, 1))])
        y = np.where(x[:, 0] + 2 * x[:, 1] - 1 >= 0, 1.0, -1.0)
        super().__init__(x, np.expand_dims(y, axis=1))

        self.model = model
        self.epoch = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            limits = np.array([-3.0, 3.0])
            ax.set_xlim(limits)
            ax.set_ylim(limits)
            positive = ax.scatter(*x[y == 1, :-1].T, color="red", marker="+")
            negative = ax.scatter(*x[y == -1, :-1].T, color="blue", marker="_")
            line, = ax.plot([], [], color="black")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([positive, negative], [1, -1])
            plt.show(block=False)

            self.fig = fig
            self.limits = limits
            self.line = line
            self.text = text
            self.last_update = time.time()

    def __getitem__(self, idx):
        self.epoch += 1

        if torch.is_tensor(idx):
            idx = idx.tolist()

        x = self.x[idx]
        y = self.y[idx]

        if use_graphics and time.time() - self.last_update > 0.01:
            w = self.model.get_weights().data.flatten()
            limits = self.limits
            if w[1] != 0:
                self.line.set_data(limits, (-w[0] * limits - w[2]) / w[1])
            elif w[0] != 0:
                self.line.set_data(np.full(2, -w[2] / w[0]), limits)
            else:
                self.line.set_data([], [])
            self.text.set_text(
                "epoch: {:,}\npoint: {:,}/{:,}\nweights: {}".format(
                    self.epoch, idx * 1 + 1, len(self.x), w))
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()

        return {'x': x, 'label': y}

class RegressionDataset(CustomDataset):
    def __init__(self, model):
        x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        np.random.RandomState(0).shuffle(x)
        self.argsort_x = np.argsort(x.flatten())
        y = np.sin(x)
        super().__init__(x, y)

        self.model = model
        self.processed = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(-2 * np.pi, 2 * np.pi)
            ax.set_ylim(-1.4, 1.4)
            real, = ax.plot(x[self.argsort_x], y[self.argsort_x], color="blue")
            learned, = ax.plot([], [], color="red")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([real, learned], ["real", "learned"])
            plt.show(block=False)

            self.fig = fig
            self.learned = learned
            self.text = text
            self.last_update = time.time()

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        x = data['x']
        y = data['label']

        self.processed += 1

        if use_graphics and time.time() - self.last_update > 0.1:
            predicted = self.model(torch.tensor(self.x, dtype=torch.float32)).data
            loss = self.model.get_loss(x, y).data
            self.learned.set_data(self.x[self.argsort_x], predicted[self.argsort_x])
            self.text.set_text("processed: {:,}\nloss: {:.6f}".format(
                self.processed, loss))
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()

        return {'x': x, 'label': y}

class DigitClassificationDataset(CustomDataset):
    def __init__(self, model):
        mnist_path = get_data_path("mnist.npz")

        with np.load(mnist_path) as data:
            train_images = data["train_images"]
            train_labels = data["train_labels"]
            test_images = data["test_images"]
            test_labels = data["test_labels"]
            assert len(train_images) == len(train_labels) == 60000
            assert len(test_images) == len(test_labels) == 10000
            self.dev_images = test_images[0::2]
            self.dev_labels = test_labels[0::2]
            self.test_images = test_images[1::2]
            self.test_labels = test_labels[1::2]

        train_labels_one_hot = np.zeros((len(train_images), 10))
        train_labels_one_hot[range(len(train_images)), train_labels] = 1

        super().__init__(train_images, train_labels_one_hot)

        self.model = model
        self.epoch = 0
        self.num_items = 0

        if use_graphics:
            self.current_accuracy = None
            width = 20  # Width of each row expressed as a multiple of image width
            samples = 100  # Number of images to display per label
            fig = plt.figure()
            ax = {}
            images = collections.defaultdict(list)
            texts = collections.defaultdict(list)
            for i in reversed(range(10)):
                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
                                         sharex=ax.get(9))
                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
                ax[i].set_yticks([])
                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
                           va="center")
                ax[i].set_xlim(0, 28 * width)
                ax[i].set_ylim(0, 28)
                for j in range(samples):
                    images[i].append(ax[i].imshow(
                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
                        alpha=0.3))
                    texts[i].append(ax[i].text(
                        0, 0, "", ha="center", va="top", fontsize="smaller"))
            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
            ax[9].set_xticklabels(
                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
            ax[9].tick_params(axis="x", pad=16)
            ax[9].set_xlabel("Probability of Correct Label")
            status = ax[0].text(
                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
                va="bottom")
            plt.show(block=False)

            self.width = width
            self.samples = samples
            self.fig = fig
            self.images = images
            self.texts = texts
            self.status = status
            self.last_update = time.time()

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        x = data['x']
        y = data['label']

        if use_graphics and time.time() - self.last_update > 1:
            dev_logits = self.model.run(torch.tensor(self.dev_images, dtype=torch.float32)).detach().cpu()
            dev_predicted = torch.argmax(dev_logits, axis=1)
            dev_probs = torch.exp(nn.functional.log_softmax(dev_logits, dim=1))

            dev_accuracy = torch.mean(torch.eq(dev_predicted, torch.tensor(self.dev_labels)).float())
            self.status.set_text(
                "validation accuracy: {:.2%}".format(dev_accuracy))
            for i in range(10):
                predicted = dev_predicted[self.dev_labels == i]
                probs = dev_probs[self.dev_labels == i][:, i]
                linspace = np.linspace(0, len(probs) - 1, self.samples).astype(int)
                indices = probs.argsort()[linspace]
                for j, (prob, image) in enumerate(zip(probs[indices], self.dev_images[self.dev_labels == i][indices])):
                    self.images[i][j].set_data(image.reshape((28, 28)))
                    left = prob * (self.width - 1) * 28
                    if predicted[indices[j]] == i:
                        self.images[i][j].set_cmap("Greens")
                        self.texts[i][j].set_text("")
                    else:
                        self.images[i][j].set_cmap("Reds")
                        self.texts[i][j].set_text(predicted[indices[j]].detach().cpu().numpy())
                        self.texts[i][j].set_x(left + 14)
                    self.images[i][j].set_extent([left, left + 28, 0, 28])
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()

        return {'x': x, 'label': y}

    def get_validation_accuracy(self):
        dev_logits = self.model.run(torch.tensor(self.dev_images, dtype=torch.float32)).data
        dev_predicted = torch.argmax(dev_logits, axis=1).detach()
        dev_accuracy = (dev_predicted == self.dev_labels).mean()
        return dev_accuracy

class LanguageIDDataset(CustomDataset):
    def __init__(self, model):
        self.model = model

        data_path = get_data_path("lang_id.npz")

        with np.load(data_path) as data:
            self.chars = data['chars']
            self.language_codes = data['language_codes']
            self.language_names = data['language_names']
            self.train_x = data['train_x']
            self.train_y = data['train_y']
            self.train_buckets = data['train_buckets']
            self.dev_x = data['dev_x']
            self.dev_y = data['dev_y']
            self.dev_buckets = data['dev_buckets']
            self.test_x = data['test_x']
            self.test_y = data['test_y']
            self.test_buckets = data['test_buckets']

        self.epoch = 0
        self.bucket_weights = self.train_buckets[:, 1] - self.train_buckets[:, 0]
        self.bucket_weights = self.bucket_weights / float(self.bucket_weights.sum())

        self.chars_print = self.chars
        try:
            print(u"Alphabet: {}".format(u"".join(self.chars)))
        except UnicodeEncodeError:
            self.chars_print = "abcdefghijklmnopqrstuvwxyzaaeeeeiinoouuacelnszz"
            print("Alphabet: " + self.chars_print)
            self.chars_print = list(self.chars_print)
            print("""
NOTE: Your terminal does not appear to support printing Unicode characters.
For the purposes of printing to the terminal, some of the letters in the
alphabet above have been substituted with ASCII symbols.""".strip())
        print("")

        # Select some examples to spotlight in the monitoring phase (3 per language)
        spotlight_idxs = []
        for i in range(len(self.language_names)):
            idxs_lang_i = np.nonzero(self.dev_y == i)[0]
            idxs_lang_i = np.random.choice(idxs_lang_i, size=3, replace=False)
            spotlight_idxs.extend(list(idxs_lang_i))
        self.spotlight_idxs = np.array(spotlight_idxs, dtype=int)

        # Templates for printing updates as training progresses
        max_word_len = self.dev_x.shape[1]
        max_lang_len = max([len(x) for x in self.language_names])

        self.predicted_template = u"Pred: {:<NUM}".replace('NUM', str(max_lang_len))

        self.word_template = u"  "
        self.word_template += u"{:<NUM} ".replace('NUM', str(max_word_len))
        self.word_template += u"{:<NUM} ({:6.1%})".replace('NUM', str(max_lang_len))
        self.word_template += u" {:<NUM} ".replace('NUM', str(max_lang_len + len('Pred: ')))
        for i in range(len(self.language_names)):
            self.word_template += u"|{}".format(self.language_codes[i])
            self.word_template += "{probs[" + str(i) + "]:4.0%}"

        self.last_update = time.time()

    def __len__(self):
        return len(self.train_x)

    def _encode(self, inp_x, inp_y):
        xs = []
        for i in range(inp_x.shape[1]):
            if np.all(np.array(inp_x[:, i]) == -1):
                break
            assert not np.any(np.array(inp_x[:, i]) == -1), (
                "Please report this error in the project: batching by length was done incorrectly in the provided code")
            x = np.eye(len(self.chars))[np.array(inp_x[:, i], dtype=int)]
            xs.append(x)
        y = np.eye(len(self.language_names))[inp_y]
        j = [[0 for _ in range(47)]]

        if len(inp_x) == 1:
            return nn.functional.pad(torch.tensor(xs, dtype=torch.float), (0, 0, 0, 0, 0, 10 - len(xs))), torch.tensor(y, dtype=torch.float)

        return torch.tensor(xs, dtype=torch.float), torch.tensor(y, dtype=torch.float)

    def _softmax(self, x):
        exp = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp / np.sum(exp, axis=-1, keepdims=True)

    def _predict(self, split='test'):
        if split == 'dev':
            data_x = self.dev_x
            data_y = self.dev_y
            buckets = self.dev_buckets
        else:
            data_x = self.test_x
            data_y = self.test_y
            buckets = self.test_buckets

        all_predicted = []
        all_correct = []
        for bucket_id in range(buckets.shape[0]):
            start, end = buckets[bucket_id]
            xs, y = self._encode(data_x[start:end], data_y[start:end])
            predicted = self.model.run(xs)

            all_predicted.extend(list(predicted.data))
            all_correct.extend(list(data_y[start:end]))
        all_predicted_probs = [nn.functional.softmax(torch.tensor(i), dim=-1) for i in all_predicted]

        all_predicted = [i.argmax() for i in all_predicted_probs]
        all_correct = np.asarray(all_correct)

        return all_predicted_probs, all_predicted, all_correct

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        ret = self._encode(self.train_x[idx:idx+1], self.train_y[idx:idx+1])
        return {'x': torch.squeeze(ret[0]), 'label': torch.squeeze(ret[1])}

    def get_validation_accuracy(self):
        dev_predicted_probs, dev_predicted, dev_correct = self._predict('dev')
        dev_accuracy = (torch.tensor(dev_predicted) == torch.tensor(dev_correct)).float().mean().item()
        return dev_accuracy

class DigitClassificationDataset2(CustomDataset):
    def __init__(self, model):
        mnist_path = get_data_path("mnist.npz")
        training_size = 200
        test_size = 100
        with np.load(mnist_path) as data:
            train_images = data["train_images"][:training_size]
            train_labels = data["train_labels"][:training_size]
            test_images = data["train_images"][:test_size]
            test_labels = data["train_labels"][:test_size]
            assert len(train_images) == len(train_labels) == training_size
            assert len(test_images) == len(test_labels) == test_size
            self.dev_images = test_images[0::2]
            self.dev_labels = test_labels[0::2]
            self.test_images = test_images[1::2]
            self.test_labels = test_labels[1::2]

        train_labels_one_hot = np.zeros((len(train_images), 10))
        train_labels_one_hot[range(len(train_images)), train_labels] = 1

        super().__init__(train_images, train_labels_one_hot)

        self.model = model
        self.epoch = 0
        self.num_items = 0

        if use_graphics:
            self.current_accuracy = None
            width = 20  # Width of each row expressed as a multiple of image width
            samples = 100  # Number of images to display per label
            fig = plt.figure()
            ax = {}
            images = collections.defaultdict(list)
            texts = collections.defaultdict(list)
            for i in reversed(range(10)):
                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1, sharex=ax.get(9))
                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
                ax[i].set_yticks([])
                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes, va="center")
                ax[i].set_xlim(0, 28 * width)
                ax[i].set_ylim(0, 28)
                for j in range(samples):
                    images[i].append(ax[i].imshow(np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens", alpha=0.3))
                    texts[i].append(ax[i].text(0, 0, "", ha="center", va="top", fontsize="smaller"))
            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
            ax[9].set_xticklabels(["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
            ax[9].tick_params(axis="x", pad=16)
            ax[9].set_xlabel("Probability of Correct Label")
            status = ax[0].text(0.5, 1.5, "", transform=ax[0].transAxes, ha="center", va="bottom")
            plt.show(block=False)

            self.width = width
            self.samples = samples
            self.fig = fig
            self.images = images
            self.texts = texts
            self.status = status
            self.last_update = time.time()

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        x = data['x']
        y = data['label']

        if use_graphics and time.time() - self.last_update > 1:
            dev_logits = self.model.run(torch.tensor(self.dev_images, dtype=torch.float32)).detach().cpu()
            dev_predicted = torch.argmax(dev_logits, axis=1)
            dev_probs = torch.exp(nn.functional.log_softmax(dev_logits, dim=1))

            dev_accuracy = torch.mean(torch.eq(dev_predicted, torch.tensor(self.dev_labels)).float())
            self.status.set_text("validation accuracy: {:.2%}".format(dev_accuracy))
            for i in range(10):
                predicted = dev_predicted[self.dev_labels == i]
                probs = dev_probs[self.dev_labels == i][:, i]
                linspace = np.linspace(0, len(probs) - 1, self.samples).astype(int)
                indices = probs.argsort()[linspace]
                for j, (prob, image) in enumerate(zip(probs[indices], self.dev_images[self.dev_labels == i][indices])):
                    self.images[i][j].set_data(image.reshape((28, 28)))
                    left = (prob * (self.width - 1) * 28).detach().cpu().numpy()
                    if predicted[indices[j]] == i:
                        self.images[i][j].set_cmap("Greens")
                        self.texts[i][j].set_text("")
                    else:
                        self.images[i][j].set_cmap("Reds")
                        self.texts[i][j].set_text(predicted[indices[j]].detach().cpu().numpy())
                        self.texts[i][j].set_x(left + 14)
                    self.images[i][j].set_extent([left, left + 28, 0, 28])
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()

        return {'x': x, 'label': y}

    def get_validation_accuracy(self):
        dev_logits = self.model.run(torch.tensor(self.dev_images, dtype=torch.float32)).data
        dev_predicted = torch.argmax(dev_logits, axis=1).detach()
        dev_accuracy = torch.mean(torch.eq(dev_predicted, torch.tensor(self.dev_labels)).float())
        return dev_accuracy

def main():
    import models
    model = models.PerceptronModel(3)
    dataset = PerceptronDataset(model)
    model.train(dataset)

    model = models.RegressionModel()
    dataset = RegressionDataset(model)
    model.train(dataset)

    model = models.DigitClassificationModel()
    dataset = DigitClassificationDataset(model)
    model.train(dataset)

    model = models.LanguageIDModel()
    dataset = LanguageIDDataset(model)
    model.train(dataset)

if __name__ == "__main__":
    main()