{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Imports and data processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import all necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import DataLoader, TensorDataset, random_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.load(\"./data.npy\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define constants that describe the data and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed for the train/test split, weight initialisation and batch shuffling\n",
    "SEED = 42\n",
    "\n",
    "CONTEXT_SIZE = 10\n",
    "ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
    "ALPHABET_SIZE = len(ALPHABET)\n",
    "# Fraction of the samples used for training; the remainder is the test set\n",
    "TRAINING_DATA_SIZE = 0.9\n",
    "\n",
    "# +1 is for unknown characters\n",
    "VOCAB_SIZE = ALPHABET_SIZE + 1\n",
    "\n",
    "EMBEDDING_DIM = 10\n",
    "\n",
    "INPUT_SEQ_LEN = CONTEXT_SIZE\n",
    "OUTPUT_SIZE = VOCAB_SIZE\n",
    "\n",
    "BATCH_SIZE = 2048\n",
    "\n",
    "EPOCHS = 30\n",
    "LEARNING_RATE = 1e-3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input: indices of the previous CONTEXT_SIZE letters (the model embeds them itself)\n",
    "# shape: (num_samples, CONTEXT_SIZE)\n",
    "X = data[:, :CONTEXT_SIZE]\n",
    "\n",
    "# Target: current letter index\n",
    "# shape: (num_samples,)\n",
    "y = data[:, CONTEXT_SIZE]\n",
    "\n",
    "# Torch dataset (important: use long/int64 for indices)\n",
    "X_tensor = torch.tensor(X, dtype=torch.long)  # for nn.Embedding\n",
    "y_tensor = torch.tensor(y, dtype=torch.long)  # for classification target\n",
    "\n",
    "dataset = TensorDataset(X_tensor, y_tensor)\n",
    "\n",
    "train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
    "# A dedicated, seeded generator keeps the train/test partition identical on every run,\n",
    "# independent of whatever else has consumed the global RNG\n",
    "split_generator = torch.Generator().manual_seed(SEED)\n",
    "train_set, test_set = random_split(\n",
    "    dataset, [train_len, len(dataset) - train_len], generator=split_generator\n",
    ")\n",
    "\n",
    "train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)\n",
    "test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Logistic(nn.Module):\n",
    "    \"\"\"Multinomial logistic regression over a window of embedded character indices.\n",
    "\n",
    "    Every index in the context window is looked up in a learned embedding table,\n",
    "    the resulting vectors are concatenated, and a single linear layer maps them\n",
    "    to one logit per output class.\n",
    "\n",
    "    Args:\n",
    "        embedding_count: number of rows in the embedding table.\n",
    "        embedding_dimension_size: length of each embedding vector.\n",
    "        context_size: number of indices in one input sample.\n",
    "        output_shape: number of logits produced per sample.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, *, embedding_count: int, embedding_dimension_size: int, context_size: int, output_shape: int):\n",
    "        super().__init__()\n",
    "        self.embedding = nn.Embedding(num_embeddings=embedding_count, embedding_dim=embedding_dimension_size)\n",
    "        self.linear = nn.Linear(context_size * embedding_dimension_size, output_shape)\n",
    "\n",
    "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
    "        \"\"\"Return unnormalised class scores for a batch of index windows.\"\"\"\n",
    "        embedded = self.embedding(x)  # (batch, CONTEXT_SIZE, EMBEDDING_DIM)\n",
    "        # flatten() also handles an empty batch, where view(..., -1) cannot infer the size\n",
    "        flattened = embedded.flatten(start_dim=1)  # (batch, CONTEXT_SIZE * EMBEDDING_DIM)\n",
    "        return self.linear(flattened)  # (batch, OUTPUT_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Select the compute device: the GPU when CUDA is available, otherwise the CPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cpu\n"
     ]
    }
   ],
   "source": [
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f\"Using device: {device}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a fresh instance of the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reseed here so weight initialisation and the batch shuffling that follows are\n",
    "# reproducible whenever training is restarted from this cell\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "model = Logistic(\n",
    "    embedding_count=VOCAB_SIZE,              # e.g., 27 for a–z + unknown\n",
    "    embedding_dimension_size=EMBEDDING_DIM,  # e.g., 10\n",
    "    context_size=CONTEXT_SIZE,               # e.g., 10\n",
    "    output_shape=OUTPUT_SIZE                 # e.g., 27 (next character)\n",
    ").to(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the loss function and the optimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "criterion = nn.CrossEntropyLoss()\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the model, reporting the average loss and the accuracy on the training set after every epoch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Epoch 1] - Loss: 2.5968 | Accuracy: 23.06%\n",
      "[Epoch 2] - Loss: 2.3218 | Accuracy: 30.02%\n",
      "[Epoch 3] - Loss: 2.2600 | Accuracy: 31.25%\n",
      "[Epoch 4] - Loss: 2.2325 | Accuracy: 31.55%\n",
      "[Epoch 5] - Loss: 2.2171 | Accuracy: 31.75%\n",
      "[Epoch 6] - Loss: 2.2076 | Accuracy: 31.98%\n",
      "[Epoch 7] - Loss: 2.2006 | Accuracy: 32.22%\n",
      "[Epoch 8] - Loss: 2.1962 | Accuracy: 32.36%\n",
      "[Epoch 9] - Loss: 2.1925 | Accuracy: 32.42%\n",
      "[Epoch 10] - Loss: 2.1900 | Accuracy: 32.48%\n",
      "[Epoch 11] - Loss: 2.1876 | Accuracy: 32.54%\n",
      "[Epoch 12] - Loss: 2.1859 | Accuracy: 32.64%\n",
      "[Epoch 13] - Loss: 2.1847 | Accuracy: 32.65%\n",
      "[Epoch 14] - Loss: 2.1833 | Accuracy: 32.76%\n",
      "[Epoch 15] - Loss: 2.1821 | Accuracy: 32.75%\n",
      "[Epoch 16] - Loss: 2.1813 | Accuracy: 32.74%\n",
      "[Epoch 17] - Loss: 2.1806 | Accuracy: 32.84%\n",
      "[Epoch 18] - Loss: 2.1799 | Accuracy: 32.81%\n",
      "[Epoch 19] - Loss: 2.1792 | Accuracy: 32.80%\n",
      "[Epoch 20] - Loss: 2.1786 | Accuracy: 32.81%\n",
      "[Epoch 21] - Loss: 2.1780 | Accuracy: 32.77%\n",
      "[Epoch 22] - Loss: 2.1776 | Accuracy: 32.85%\n",
      "[Epoch 23] - Loss: 2.1770 | Accuracy: 32.81%\n",
      "[Epoch 24] - Loss: 2.1767 | Accuracy: 32.81%\n",
      "[Epoch 25] - Loss: 2.1764 | Accuracy: 32.81%\n",
      "[Epoch 26] - Loss: 2.1757 | Accuracy: 32.80%\n",
      "[Epoch 27] - Loss: 2.1755 | Accuracy: 32.81%\n",
      "[Epoch 28] - Loss: 2.1751 | Accuracy: 32.79%\n",
      "[Epoch 29] - Loss: 2.1748 | Accuracy: 32.82%\n",
      "[Epoch 30] - Loss: 2.1744 | Accuracy: 32.80%\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(EPOCHS):\n",
    "    model.train()\n",
    "    total_loss = 0.0\n",
    "    correct = 0\n",
    "    total = 0\n",
    "\n",
    "    for batch_X, batch_y in train_loader:\n",
    "        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        logits = model(batch_X)  # shape: (batch, OUTPUT_SIZE)\n",
    "        loss = criterion(logits, batch_y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        # criterion returns the batch mean, so weight it by the batch size;\n",
    "        # the last batch may be smaller than BATCH_SIZE\n",
    "        total_loss += loss.item() * batch_X.size(0)\n",
    "\n",
    "        # Compute accuracy\n",
    "        preds = torch.argmax(logits, dim=1)\n",
    "        correct += (preds == batch_y).sum().item()\n",
    "        total += batch_X.size(0)\n",
    "\n",
    "    avg_loss = total_loss / total\n",
    "    accuracy = correct / total * 100\n",
    "    print(f\"[Epoch {epoch+1}] - Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Measure top-1, top-3 and top-5 accuracy on the held-out test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top 1 prediction accuracy: 32.45%\n",
      "Top 3 prediction accuracy: 58.55%\n",
      "Top 5 prediction accuracy: 72.66%\n"
     ]
    }
   ],
   "source": [
    "# Values of k for which top-k accuracy is reported\n",
    "TOP_KS = (1, 3, 5)\n",
    "\n",
    "model.eval()\n",
    "correct_at_k = {k: 0 for k in TOP_KS}\n",
    "total = 0\n",
    "\n",
    "with torch.no_grad():\n",
    "    for batch_X, batch_y in test_loader:\n",
    "        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
    "        outputs = model(batch_X)\n",
    "\n",
    "        # Class indices of the highest-scoring predictions, best first: (batch, max(TOP_KS))\n",
    "        _, top_preds = outputs.topk(max(TOP_KS), dim=1)\n",
    "        # hits[i, j] is True when the j-th ranked guess for sample i equals its target;\n",
    "        # comparing whole batches at once avoids a Python-level loop over samples\n",
    "        hits = top_preds == batch_y.unsqueeze(1)\n",
    "\n",
    "        for k in TOP_KS:\n",
    "            correct_at_k[k] += hits[:, :k].any(dim=1).sum().item()\n",
    "        total += batch_y.size(0)\n",
    "\n",
    "for k in TOP_KS:\n",
    "    print(f\"Top {k} prediction accuracy: {correct_at_k[k] / total * 100:.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}