omega/model/logistic.ipynb

340 lines
9.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Import and data processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import all necessary libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"from torch.utils.data import DataLoader, TensorDataset, random_split"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the training data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the pre-built training matrix from a path relative to the notebook's\n",
"# working directory. Later cells slice it as data[:, :CONTEXT_SIZE] (inputs)\n",
"# and data[:, CONTEXT_SIZE] (target), so it must be 2-D with at least\n",
"# CONTEXT_SIZE + 1 columns.\n",
"# NOTE(review): values are presumably integer character indices in\n",
"# [0, VOCAB_SIZE) -- confirm against the script that produced data.npy.\n",
"data = np.load(\"./data.npy\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define constants that describe the data and model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# --- Data ---\n",
"# Number of preceding characters the model sees for each prediction\n",
"CONTEXT_SIZE = 10\n",
"INPUT_SEQ_LEN = CONTEXT_SIZE\n",
"\n",
"ALPHABET = [*\"abcdefghijklmnopqrstuvwxyz\"]\n",
"ALPHABET_SIZE = len(ALPHABET)\n",
"\n",
"# One extra slot on top of the alphabet is reserved for unknown characters\n",
"VOCAB_SIZE = 1 + ALPHABET_SIZE\n",
"\n",
"# Fraction of the samples used for training; the remainder is held out\n",
"TRAINING_DATA_SIZE = 0.9\n",
"\n",
"# --- Model ---\n",
"EMBEDDING_DIM = 10\n",
"OUTPUT_SIZE = VOCAB_SIZE\n",
"\n",
"# --- Optimisation ---\n",
"BATCH_SIZE = 2048\n",
"EPOCHS = 30\n",
"LEARNING_RATE = 0.001"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Process the data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Fail fast on a malformed array instead of hitting an opaque slicing or\n",
"# index error further down.\n",
"assert data.ndim == 2 and data.shape[0] > 0 and data.shape[1] > CONTEXT_SIZE, (\n",
"    f\"expected a non-empty 2-D array with more than {CONTEXT_SIZE} columns, got shape {data.shape}\"\n",
")\n",
"\n",
"# Inputs: integer indices of the previous CONTEXT_SIZE characters. These are\n",
"# indices, not embeddings; the embedding lookup happens inside the model.\n",
"# shape: (num_samples, CONTEXT_SIZE)\n",
"X = data[:, :CONTEXT_SIZE]\n",
"\n",
"# Target: index of the character that follows the context\n",
"# shape: (num_samples,)\n",
"y = data[:, CONTEXT_SIZE]\n",
"\n",
"# Every index must address a row of the embedding table / a class of the output\n",
"assert 0 <= X.min() and X.max() < VOCAB_SIZE, \"input index outside [0, VOCAB_SIZE)\"\n",
"assert 0 <= y.min() and y.max() < OUTPUT_SIZE, \"target index outside [0, OUTPUT_SIZE)\"\n",
"\n",
"# Torch dataset (important: use long/int64 for indices)\n",
"X_tensor = torch.tensor(X, dtype=torch.long)  # consumed by nn.Embedding\n",
"y_tensor = torch.tensor(y, dtype=torch.long)  # class indices for the loss\n",
"\n",
"dataset = TensorDataset(X_tensor, y_tensor)\n",
"\n",
"# Seed the split so the held-out set is identical on every run. With an\n",
"# unseeded split, re-running this cell after training would reshuffle the\n",
"# samples and move already-trained-on rows into test_set.\n",
"SPLIT_SEED = 42\n",
"split_generator = torch.Generator().manual_seed(SPLIT_SEED)\n",
"\n",
"train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
"train_set, test_set = random_split(\n",
"    dataset,\n",
"    [train_len, len(dataset) - train_len],\n",
"    generator=split_generator,\n",
")\n",
"assert len(train_set) + len(test_set) == len(dataset)\n",
"\n",
"train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)\n",
"test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class Logistic(nn.Module):\n",
"    \"\"\"Multinomial logistic regression over a fixed-length window of token indices.\n",
"\n",
"    Each index is mapped to a learned embedding, the embeddings of the window\n",
"    are concatenated, and a single linear layer produces one raw score (logit)\n",
"    per class.\n",
"\n",
"    Args:\n",
"        embedding_count: Number of rows in the embedding table.\n",
"        embedding_dimension_size: Length of each embedding vector.\n",
"        context_size: Number of indices per input sample.\n",
"        output_shape: Number of logits produced per sample.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, *, embedding_count: int, embedding_dimension_size: int, context_size: int, output_shape: int):\n",
"        super().__init__()\n",
"        self.embedding = nn.Embedding(num_embeddings=embedding_count, embedding_dim=embedding_dimension_size)\n",
"        self.linear = nn.Linear(context_size * embedding_dimension_size, output_shape)\n",
"\n",
"    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
"        \"\"\"Return raw class scores for a batch of index windows.\n",
"\n",
"        Args:\n",
"            x: Integer tensor of shape (batch, context_size).\n",
"\n",
"        Returns:\n",
"            Tensor of shape (batch, output_shape) with unnormalised logits\n",
"            (no softmax is applied here).\n",
"        \"\"\"\n",
"        embedded = self.embedding(x)  # (batch, context_size, embedding_dim)\n",
"        # flatten(start_dim=1) computes the merged size explicitly, so unlike\n",
"        # view(batch, -1) it also works when the batch is empty.\n",
"        flattened = embedded.flatten(start_dim=1)  # (batch, context_size * embedding_dim)\n",
"        return self.linear(flattened)  # (batch, output_shape)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cpu\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/thastertyn/Code/Skola/4-rocnik/programove-vybaveni/omega/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:129: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.)\n",
" return torch._C._cuda_getDeviceCount() > 0\n"
]
}
],
"source": [
"# Use the GPU when CUDA reports itself as available; otherwise stay on the CPU.\n",
"if torch.cuda.is_available():\n",
"    device = torch.device(\"cuda\")\n",
"else:\n",
"    device = torch.device(\"cpu\")\n",
"print(f\"Using device: {device}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Training"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a fresh instance of the model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Build an untrained model from the notebook constants and move its\n",
"# parameters to the selected device.\n",
"model = Logistic(\n",
" embedding_count=VOCAB_SIZE, # 27 = 26 letters (a-z) + 1 slot for unknown characters\n",
" embedding_dimension_size=EMBEDDING_DIM, # 10 values per embedding vector\n",
" context_size=CONTEXT_SIZE, # 10 preceding characters per sample\n",
" output_shape=OUTPUT_SIZE # 27 logits, one per candidate next character\n",
").to(device)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Cross-entropy over the raw logits returned by the model; the targets are\n",
"# integer class indices (y_tensor is created with dtype=torch.long).\n",
"criterion = nn.CrossEntropyLoss()\n",
"# Adam over every model parameter (the embedding table and the linear layer).\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Epoch 1] - Loss: 2.5968 | Accuracy: 23.06%\n",
"[Epoch 2] - Loss: 2.3218 | Accuracy: 30.02%\n",
"[Epoch 3] - Loss: 2.2600 | Accuracy: 31.25%\n",
"[Epoch 4] - Loss: 2.2325 | Accuracy: 31.55%\n",
"[Epoch 5] - Loss: 2.2171 | Accuracy: 31.75%\n",
"[Epoch 6] - Loss: 2.2076 | Accuracy: 31.98%\n",
"[Epoch 7] - Loss: 2.2006 | Accuracy: 32.22%\n",
"[Epoch 8] - Loss: 2.1962 | Accuracy: 32.36%\n",
"[Epoch 9] - Loss: 2.1925 | Accuracy: 32.42%\n",
"[Epoch 10] - Loss: 2.1900 | Accuracy: 32.48%\n",
"[Epoch 11] - Loss: 2.1876 | Accuracy: 32.54%\n",
"[Epoch 12] - Loss: 2.1859 | Accuracy: 32.64%\n",
"[Epoch 13] - Loss: 2.1847 | Accuracy: 32.65%\n",
"[Epoch 14] - Loss: 2.1833 | Accuracy: 32.76%\n",
"[Epoch 15] - Loss: 2.1821 | Accuracy: 32.75%\n",
"[Epoch 16] - Loss: 2.1813 | Accuracy: 32.74%\n",
"[Epoch 17] - Loss: 2.1806 | Accuracy: 32.84%\n",
"[Epoch 18] - Loss: 2.1799 | Accuracy: 32.81%\n",
"[Epoch 19] - Loss: 2.1792 | Accuracy: 32.80%\n",
"[Epoch 20] - Loss: 2.1786 | Accuracy: 32.81%\n",
"[Epoch 21] - Loss: 2.1780 | Accuracy: 32.77%\n",
"[Epoch 22] - Loss: 2.1776 | Accuracy: 32.85%\n",
"[Epoch 23] - Loss: 2.1770 | Accuracy: 32.81%\n",
"[Epoch 24] - Loss: 2.1767 | Accuracy: 32.81%\n",
"[Epoch 25] - Loss: 2.1764 | Accuracy: 32.81%\n",
"[Epoch 26] - Loss: 2.1757 | Accuracy: 32.80%\n",
"[Epoch 27] - Loss: 2.1755 | Accuracy: 32.81%\n",
"[Epoch 28] - Loss: 2.1751 | Accuracy: 32.79%\n",
"[Epoch 29] - Loss: 2.1748 | Accuracy: 32.82%\n",
"[Epoch 30] - Loss: 2.1744 | Accuracy: 32.80%\n"
]
}
],
"source": [
"# One full pass over train_loader per epoch. The reported loss is the\n",
"# sample-weighted mean over the epoch, and the accuracy is measured on the\n",
"# training batches as they are seen (i.e. while the weights are still changing).\n",
"for epoch in range(EPOCHS):\n",
"    model.train()\n",
"    total_loss, correct, total = 0, 0, 0\n",
"\n",
"    for batch_X, batch_y in train_loader:\n",
"        batch_X = batch_X.to(device)\n",
"        batch_y = batch_y.to(device)\n",
"        batch_len = batch_X.size(0)\n",
"\n",
"        # Forward pass, backward pass, parameter update\n",
"        optimizer.zero_grad()\n",
"        logits = model(batch_X)  # shape: (batch_len, OUTPUT_SIZE)\n",
"        loss = criterion(logits, batch_y)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"        # The loss is a per-batch mean, so weight it by the batch size\n",
"        total_loss += loss.item() * batch_len\n",
"\n",
"        # Running top-1 accuracy\n",
"        preds = logits.argmax(dim=1)\n",
"        correct += preds.eq(batch_y).sum().item()\n",
"        total += batch_len\n",
"\n",
"    avg_loss = total_loss / total\n",
"    accuracy = correct / total * 100\n",
"    print(f\"[Epoch {epoch+1}] - Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top 1 prediction accuracy: 32.45%\n",
"Top 3 prediction accuracy: 58.55%\n",
"Top 5 prediction accuracy: 72.66%\n"
]
}
],
"source": [
"# Held-out evaluation: top-1 / top-3 / top-5 accuracy over test_loader.\n",
"model.eval()\n",
"correct_top1 = 0\n",
"correct_top3 = 0\n",
"correct_top5 = 0\n",
"total = 0\n",
"\n",
"with torch.no_grad():\n",
"    for batch_X, batch_y in test_loader:\n",
"        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
"        outputs = model(batch_X)\n",
"\n",
"        # Never request more candidates than there are classes\n",
"        k = min(5, outputs.size(1))\n",
"        _, top_preds = outputs.topk(k, dim=1)  # (batch, k), best score first\n",
"\n",
"        # hits[i, j] is True when the j-th ranked prediction for sample i equals\n",
"        # its target. Comparing the whole batch at once replaces a per-sample\n",
"        # Python loop that issued several tiny tensor operations per sample.\n",
"        hits = top_preds.eq(batch_y.unsqueeze(1))\n",
"\n",
"        total += batch_y.size(0)\n",
"        correct_top1 += hits[:, :1].any(dim=1).sum().item()\n",
"        correct_top3 += hits[:, :3].any(dim=1).sum().item()\n",
"        correct_top5 += hits.any(dim=1).sum().item()\n",
"\n",
"# max(total, 1) keeps an empty test set from raising ZeroDivisionError\n",
"evaluated = max(total, 1)\n",
"top1_acc = correct_top1 / evaluated\n",
"top3_acc = correct_top3 / evaluated\n",
"top5_acc = correct_top5 / evaluated\n",
"\n",
"print(f\"Top 1 prediction accuracy: {(top1_acc * 100):.2f}%\")\n",
"print(f\"Top 3 prediction accuracy: {(top3_acc * 100):.2f}%\")\n",
"print(f\"Top 5 prediction accuracy: {(top5_acc * 100):.2f}%\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}