340 lines
9.9 KiB
Plaintext
340 lines
9.9 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 1. Import and data processing"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Import all necessary libraries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import torch\n",
|
||
"import torch.nn as nn\n",
|
||
"from torch.utils.data import DataLoader, TensorDataset, random_split"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Load the training data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Each row of the array is one sample; the columns are split into context and target during processing\n",
"data = np.load(file=\"./data.npy\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Define constants that describe the data and model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# --- Data ---\n",
"ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
"ALPHABET_SIZE = len(ALPHABET)\n",
"VOCAB_SIZE = ALPHABET_SIZE + 1  # one extra index for unknown characters\n",
"CONTEXT_SIZE = 10  # number of preceding letters fed to the model\n",
"TRAINING_DATA_SIZE = 0.9  # fraction of the samples used for training\n",
"\n",
"# --- Model ---\n",
"EMBEDDING_DIM = 10\n",
"INPUT_SEQ_LEN = CONTEXT_SIZE\n",
"OUTPUT_SIZE = VOCAB_SIZE  # one logit per vocabulary entry\n",
"\n",
"# --- Training ---\n",
"BATCH_SIZE = 2048\n",
"EPOCHS = 30\n",
"LEARNING_RATE = 1e-3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Process the data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Input: indices of the previous CONTEXT_SIZE letters (the model embeds them later)\n",
"# shape: (num_samples, CONTEXT_SIZE)\n",
"X = data[:, :CONTEXT_SIZE]\n",
"\n",
"# Target: index of the current letter\n",
"# shape: (num_samples,)\n",
"y = data[:, CONTEXT_SIZE]\n",
"\n",
"# Both nn.Embedding lookups and class-index targets need int64 (long) tensors\n",
"X_tensor = torch.tensor(X, dtype=torch.long)\n",
"y_tensor = torch.tensor(y, dtype=torch.long)\n",
"\n",
"dataset = TensorDataset(X_tensor, y_tensor)\n",
"\n",
"# Seed the split so the train/test partition (and therefore the reported\n",
"# test accuracy) is the same on every Restart & Run All\n",
"SPLIT_SEED = 42\n",
"train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
"train_set, test_set = random_split(\n",
"    dataset,\n",
"    [train_len, len(dataset) - train_len],\n",
"    generator=torch.Generator().manual_seed(SPLIT_SEED),\n",
")\n",
"\n",
"# Only the training batches are shuffled; evaluation order does not matter\n",
"train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)\n",
"test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 2. Model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"class Logistic(nn.Module):\n",
"    \"\"\"Multinomial logistic regression over concatenated character embeddings.\n",
"\n",
"    Every index in the context window is embedded, the embeddings are\n",
"    concatenated, and a single linear layer maps them to unnormalised logits.\n",
"\n",
"    Args:\n",
"        embedding_count: Number of rows in the embedding table.\n",
"        embedding_dimension_size: Length of each embedding vector.\n",
"        context_size: Number of indices per input sample.\n",
"        output_shape: Number of output logits.\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self,\n",
"        *,\n",
"        embedding_count: int,\n",
"        embedding_dimension_size: int,\n",
"        context_size: int,\n",
"        output_shape: int,\n",
"    ):\n",
"        super().__init__()\n",
"        self.embedding = nn.Embedding(num_embeddings=embedding_count, embedding_dim=embedding_dimension_size)\n",
"        # The linear layer sees all context embeddings laid side by side\n",
"        self.linear = nn.Linear(context_size * embedding_dimension_size, output_shape)\n",
"\n",
"    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
"        \"\"\"Return logits of shape (batch, output_shape) for indices ``x`` of shape (batch, context_size).\"\"\"\n",
"        embedded = self.embedding(x)  # (batch, context_size, embedding_dim)\n",
"        # flatten(start_dim=1) keeps the batch axis and, unlike view(batch, -1),\n",
"        # is also well defined when the batch is empty\n",
"        flattened = embedded.flatten(start_dim=1)  # (batch, context_size * embedding_dim)\n",
"        return self.linear(flattened)  # (batch, output_shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Using device: cpu\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/thastertyn/Code/Skola/4-rocnik/programove-vybaveni/omega/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:129: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.)\n",
|
||
" return torch._C._cuda_getDeviceCount() > 0\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Use the GPU when CUDA is usable, otherwise fall back to the CPU\n",
"if torch.cuda.is_available():\n",
"    device = torch.device(\"cuda\")\n",
"else:\n",
"    device = torch.device(\"cpu\")\n",
"print(f\"Using device: {device}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 3. Training"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
"Create a fresh instance of the model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"model = Logistic(\n",
"    embedding_count=VOCAB_SIZE,  # one embedding row per vocabulary index\n",
"    embedding_dimension_size=EMBEDDING_DIM,\n",
"    context_size=CONTEXT_SIZE,  # letters of context per sample\n",
"    output_shape=OUTPUT_SIZE,  # one logit per candidate next character\n",
")\n",
"# Move the parameters to the selected device\n",
"model = model.to(device)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Adam updates every model parameter; cross-entropy takes raw logits and integer class targets\n",
"optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)\n",
"criterion = nn.CrossEntropyLoss()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[Epoch 1] - Loss: 2.5968 | Accuracy: 23.06%\n",
|
||
"[Epoch 2] - Loss: 2.3218 | Accuracy: 30.02%\n",
|
||
"[Epoch 3] - Loss: 2.2600 | Accuracy: 31.25%\n",
|
||
"[Epoch 4] - Loss: 2.2325 | Accuracy: 31.55%\n",
|
||
"[Epoch 5] - Loss: 2.2171 | Accuracy: 31.75%\n",
|
||
"[Epoch 6] - Loss: 2.2076 | Accuracy: 31.98%\n",
|
||
"[Epoch 7] - Loss: 2.2006 | Accuracy: 32.22%\n",
|
||
"[Epoch 8] - Loss: 2.1962 | Accuracy: 32.36%\n",
|
||
"[Epoch 9] - Loss: 2.1925 | Accuracy: 32.42%\n",
|
||
"[Epoch 10] - Loss: 2.1900 | Accuracy: 32.48%\n",
|
||
"[Epoch 11] - Loss: 2.1876 | Accuracy: 32.54%\n",
|
||
"[Epoch 12] - Loss: 2.1859 | Accuracy: 32.64%\n",
|
||
"[Epoch 13] - Loss: 2.1847 | Accuracy: 32.65%\n",
|
||
"[Epoch 14] - Loss: 2.1833 | Accuracy: 32.76%\n",
|
||
"[Epoch 15] - Loss: 2.1821 | Accuracy: 32.75%\n",
|
||
"[Epoch 16] - Loss: 2.1813 | Accuracy: 32.74%\n",
|
||
"[Epoch 17] - Loss: 2.1806 | Accuracy: 32.84%\n",
|
||
"[Epoch 18] - Loss: 2.1799 | Accuracy: 32.81%\n",
|
||
"[Epoch 19] - Loss: 2.1792 | Accuracy: 32.80%\n",
|
||
"[Epoch 20] - Loss: 2.1786 | Accuracy: 32.81%\n",
|
||
"[Epoch 21] - Loss: 2.1780 | Accuracy: 32.77%\n",
|
||
"[Epoch 22] - Loss: 2.1776 | Accuracy: 32.85%\n",
|
||
"[Epoch 23] - Loss: 2.1770 | Accuracy: 32.81%\n",
|
||
"[Epoch 24] - Loss: 2.1767 | Accuracy: 32.81%\n",
|
||
"[Epoch 25] - Loss: 2.1764 | Accuracy: 32.81%\n",
|
||
"[Epoch 26] - Loss: 2.1757 | Accuracy: 32.80%\n",
|
||
"[Epoch 27] - Loss: 2.1755 | Accuracy: 32.81%\n",
|
||
"[Epoch 28] - Loss: 2.1751 | Accuracy: 32.79%\n",
|
||
"[Epoch 29] - Loss: 2.1748 | Accuracy: 32.82%\n",
|
||
"[Epoch 30] - Loss: 2.1744 | Accuracy: 32.80%\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"for epoch in range(EPOCHS):\n",
"    model.train()\n",
"    # Running sums for this epoch, reset at the start of every pass\n",
"    loss_sum, hits, seen = 0, 0, 0\n",
"\n",
"    for batch_X, batch_y in train_loader:\n",
"        batch_X = batch_X.to(device)\n",
"        batch_y = batch_y.to(device)\n",
"        n_samples = batch_X.size(0)\n",
"\n",
"        optimizer.zero_grad()\n",
"        logits = model(batch_X)  # (batch, OUTPUT_SIZE)\n",
"        loss = criterion(logits, batch_y)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"        # Weight the batch loss by the batch size so the epoch average is per sample\n",
"        loss_sum += loss.item() * n_samples\n",
"\n",
"        # Count samples whose highest logit is the target class\n",
"        hits += (logits.argmax(dim=1) == batch_y).sum().item()\n",
"        seen += n_samples\n",
"\n",
"    avg_loss = loss_sum / seen\n",
"    accuracy = hits / seen * 100\n",
"    print(f\"[Epoch {epoch+1}] - Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Top 1 prediction accuracy: 32.45%\n",
|
||
"Top 3 prediction accuracy: 58.55%\n",
|
||
"Top 5 prediction accuracy: 72.66%\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"model.eval()\n",
"correct_top1 = 0\n",
"correct_top3 = 0\n",
"correct_top5 = 0\n",
"total = 0\n",
"\n",
"with torch.no_grad():\n",
"    for batch_X, batch_y in test_loader:\n",
"        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
"        outputs = model(batch_X)\n",
"\n",
"        # Class indices of the 5 largest logits per sample, best first: (batch, 5)\n",
"        _, top_preds = outputs.topk(5, dim=1)\n",
"\n",
"        # hits[i, k] is True when the (k+1)-th ranked prediction of sample i is the target.\n",
"        # Comparing whole batches at once avoids a Python-level loop over every sample.\n",
"        hits = top_preds == batch_y.unsqueeze(1)\n",
"\n",
"        total += batch_y.size(0)\n",
"        correct_top1 += hits[:, :1].any(dim=1).sum().item()\n",
"        correct_top3 += hits[:, :3].any(dim=1).sum().item()\n",
"        correct_top5 += hits.any(dim=1).sum().item()\n",
"\n",
"top1_acc = correct_top1 / total\n",
"top3_acc = correct_top3 / total\n",
"top5_acc = correct_top5 / total\n",
"\n",
"print(f\"Top 1 prediction accuracy: {(top1_acc * 100):.2f}%\")\n",
"print(f\"Top 3 prediction accuracy: {(top3_acc * 100):.2f}%\")\n",
"print(f\"Top 5 prediction accuracy: {(top5_acc * 100):.2f}%\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|