385 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			385 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | ||
|  "cells": [
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "# Import data"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 1,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "import numpy as np\n",
 | ||
|     "import torch\n",
 | ||
|     "import torch.nn as nn\n",
 | ||
|     "from torch.utils.data import DataLoader, TensorDataset, random_split"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 2,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Load the sample matrix; later cells read columns [0, CONTEXT_SIZE) as inputs and column CONTEXT_SIZE as the label.\n",
 | ||
|     "data = np.load(\"./data.npy\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 3,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# --- Tunable settings ---\n",
 | ||
|     "CONTEXT_SIZE = 10          # number of input columns taken from each data row\n",
 | ||
|     "EMBEDDING_DIM = 16         # width of each learned embedding vector\n",
 | ||
|     "TRAINING_DATA_SIZE = 0.9   # fraction of samples assigned to the training split\n",
 | ||
|     "\n",
 | ||
|     "# --- Vocabulary ---\n",
 | ||
|     "ALPHABET = [chr(code) for code in range(ord(\"a\"), ord(\"z\") + 1)]\n",
 | ||
|     "ALPHABET_SIZE = len(ALPHABET)\n",
 | ||
|     "VOCAB_SIZE = 1 + ALPHABET_SIZE  # 26 letters + 1 for unknown\n",
 | ||
|     "\n",
 | ||
|     "# --- Derived model dimensions ---\n",
 | ||
|     "INPUT_SEQ_LEN, OUTPUT_SIZE = CONTEXT_SIZE, VOCAB_SIZE"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "# Define and split data"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "## Define input and output columns"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 4,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# The first CONTEXT_SIZE columns of every row are the inputs; column CONTEXT_SIZE is the label.\n",
 | ||
|     "X, y = data[:, :CONTEXT_SIZE], data[:, CONTEXT_SIZE]  # shapes: (num_samples, CONTEXT_SIZE), (num_samples,)\n",
 | ||
|     "\n",
 | ||
|     "# Both tensors are int64: X is looked up by nn.Embedding, y is the classification target.\n",
 | ||
|     "X_tensor, y_tensor = (torch.tensor(column, dtype=torch.long) for column in (X, y))\n",
 | ||
|     "\n",
 | ||
|     "dataset = TensorDataset(X_tensor, y_tensor)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 5,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Seed the split so the same samples land in the test set on every run.\n",
 | ||
|     "SPLIT_SEED = 42\n",
 | ||
|     "\n",
 | ||
|     "train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
 | ||
|     "test_len = len(dataset) - train_len  # remainder, so the two parts always cover the whole dataset\n",
 | ||
|     "train_set, test_set = random_split(\n",
 | ||
|     "    dataset,\n",
 | ||
|     "    [train_len, test_len],\n",
 | ||
|     "    generator=torch.Generator().manual_seed(SPLIT_SEED),\n",
 | ||
|     ")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 6,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "BATCH_SIZE = 128  # single source of truth so the two loaders cannot drift apart\n",
 | ||
|     "\n",
 | ||
|     "# Only the training loader shuffles; the evaluation cell just counts hits, so its order is irrelevant.\n",
 | ||
|     "train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)\n",
 | ||
|     "test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "# Train on data"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": null,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "class MLP(nn.Module):\n",
 | ||
|     "    \"\"\"Embedding followed by a two-hidden-layer perceptron.\n",
 | ||
|     "\n",
 | ||
|     "    The input is a tensor of integer indices that nn.Embedding can look up. After\n",
 | ||
|     "    flattening, the first Linear layer expects CONTEXT_SIZE * EMBEDDING_DIM\n",
 | ||
|     "    features per sample. The output is OUTPUT_SIZE unnormalised scores per sample\n",
 | ||
|     "    (no softmax is applied here).\n",
 | ||
|     "    \"\"\"\n",
 | ||
|     "\n",
 | ||
|     "    def __init__(self):\n",
 | ||
|     "        super().__init__()\n",
 | ||
|     "        flat_features = CONTEXT_SIZE * EMBEDDING_DIM\n",
 | ||
|     "        # Layer order is kept fixed so the state_dict keys (net.0, net.2, ...) stay stable.\n",
 | ||
|     "        layers = [\n",
 | ||
|     "            nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM),\n",
 | ||
|     "            nn.Flatten(),\n",
 | ||
|     "            nn.Linear(flat_features, 256),\n",
 | ||
|     "            nn.ReLU(),\n",
 | ||
|     "            nn.Linear(256, 128),\n",
 | ||
|     "            nn.ReLU(),\n",
 | ||
|     "            nn.Linear(128, OUTPUT_SIZE),\n",
 | ||
|     "        ]\n",
 | ||
|     "        self.net = nn.Sequential(*layers)\n",
 | ||
|     "\n",
 | ||
|     "    def forward(self, x):\n",
 | ||
|     "        \"\"\"Run ``x`` through the layer stack and return the resulting scores.\"\"\"\n",
 | ||
|     "        logits = self.net(x)\n",
 | ||
|     "        return logits"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 8,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Using device: cpu\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "/home/thastertyn/Code/Skola/4-rocnik/programove-vybaveni/omega/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:129: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.)\n",
 | ||
|       "  return torch._C._cuda_getDeviceCount() > 0\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.\n",
 | ||
|     "device_name = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
 | ||
|     "device = torch.device(device_name)\n",
 | ||
|     "print(f\"Using device: {device}\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 9,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Build the network on the selected device, then the loss and the optimizer that updates its weights.\n",
 | ||
|     "model = MLP()\n",
 | ||
|     "model = model.to(device)\n",
 | ||
|     "\n",
 | ||
|     "criterion = nn.CrossEntropyLoss()\n",
 | ||
|     "optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 10,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Epoch 1, Loss: 4068.5562\n",
 | ||
|       "Epoch 2, Loss: 3446.1109\n",
 | ||
|       "Epoch 3, Loss: 3260.1651\n",
 | ||
|       "Epoch 4, Loss: 3165.0248\n",
 | ||
|       "Epoch 5, Loss: 3101.6501\n",
 | ||
|       "Epoch 6, Loss: 3054.4113\n",
 | ||
|       "Epoch 7, Loss: 3021.7103\n",
 | ||
|       "Epoch 8, Loss: 2994.6145\n",
 | ||
|       "Epoch 9, Loss: 2973.1683\n",
 | ||
|       "Epoch 10, Loss: 2955.0090\n",
 | ||
|       "Epoch 11, Loss: 2940.0807\n",
 | ||
|       "Epoch 12, Loss: 2928.2814\n",
 | ||
|       "Epoch 13, Loss: 2916.9362\n",
 | ||
|       "Epoch 14, Loss: 2905.9567\n",
 | ||
|       "Epoch 15, Loss: 2897.3687\n",
 | ||
|       "Epoch 16, Loss: 2890.6869\n",
 | ||
|       "Epoch 17, Loss: 2882.7104\n",
 | ||
|       "Epoch 18, Loss: 2876.6815\n",
 | ||
|       "Epoch 19, Loss: 2870.7298\n",
 | ||
|       "Epoch 20, Loss: 2865.6343\n",
 | ||
|       "Epoch 21, Loss: 2860.5506\n",
 | ||
|       "Epoch 22, Loss: 2856.7977\n",
 | ||
|       "Epoch 23, Loss: 2852.8814\n",
 | ||
|       "Epoch 24, Loss: 2847.7687\n",
 | ||
|       "Epoch 25, Loss: 2846.0855\n",
 | ||
|       "Epoch 26, Loss: 2842.2640\n",
 | ||
|       "Epoch 27, Loss: 2838.4780\n",
 | ||
|       "Epoch 28, Loss: 2836.9773\n",
 | ||
|       "Epoch 29, Loss: 2833.8416\n",
 | ||
|       "Epoch 30, Loss: 2830.5508\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "NUM_EPOCHS = 30\n",
 | ||
|     "\n",
 | ||
|     "for epoch in range(NUM_EPOCHS):\n",
 | ||
|     "    model.train()\n",
 | ||
|     "    total_loss = 0.0   # sum of per-sample losses over the epoch\n",
 | ||
|     "    seen_samples = 0\n",
 | ||
|     "    for batch_X, batch_y in train_loader:\n",
 | ||
|     "        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
 | ||
|     "        optimizer.zero_grad()\n",
 | ||
|     "        output = model(batch_X)\n",
 | ||
|     "        loss = criterion(output, batch_y)\n",
 | ||
|     "        loss.backward()\n",
 | ||
|     "        optimizer.step()\n",
 | ||
|     "        # The criterion (default reduction) returns a batch mean; weight it by the batch\n",
 | ||
|     "        # size so the smaller final batch does not skew the epoch average.\n",
 | ||
|     "        batch_count = batch_y.size(0)\n",
 | ||
|     "        total_loss += loss.item() * batch_count\n",
 | ||
|     "        seen_samples += batch_count\n",
 | ||
|     "    # Report a per-sample mean: unlike the raw sum it does not depend on dataset or batch size.\n",
 | ||
|     "    mean_loss = total_loss / max(seen_samples, 1)\n",
 | ||
|     "    print(f\"Epoch {epoch+1}, Mean loss: {mean_loss:.4f}\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "# Testing model"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 12,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Top-1 Accuracy: 52.77%\n",
 | ||
|       "Top-3 Accuracy: 74.39%\n",
 | ||
|       "Top-5 Accuracy: 83.37%\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "model.eval()\n",
 | ||
|     "correct_top1 = 0\n",
 | ||
|     "correct_top3 = 0\n",
 | ||
|     "correct_top5 = 0\n",
 | ||
|     "total = 0\n",
 | ||
|     "\n",
 | ||
|     "with torch.no_grad():\n",
 | ||
|     "    for batch_X, batch_y in test_loader:\n",
 | ||
|     "        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
 | ||
|     "        outputs = model(batch_X)  # shape: [batch_size, OUTPUT_SIZE]\n",
 | ||
|     "\n",
 | ||
|     "        # Indices of the 5 highest-scoring classes per sample, best first\n",
 | ||
|     "        _, top_preds = outputs.topk(5, dim=1)  # shape: [batch_size, 5]\n",
 | ||
|     "\n",
 | ||
|     "        # hits[i, k] is True when the k-th ranked prediction for sample i equals its label.\n",
 | ||
|     "        # One vectorised comparison replaces the per-sample Python loop; the counts are identical.\n",
 | ||
|     "        hits = top_preds.eq(batch_y.unsqueeze(1))  # shape: [batch_size, 5]\n",
 | ||
|     "\n",
 | ||
|     "        total += batch_y.size(0)\n",
 | ||
|     "        correct_top1 += hits[:, :1].any(dim=1).sum().item()\n",
 | ||
|     "        correct_top3 += hits[:, :3].any(dim=1).sum().item()\n",
 | ||
|     "        correct_top5 += hits.any(dim=1).sum().item()\n",
 | ||
|     "\n",
 | ||
|     "top1_acc = correct_top1 / total\n",
 | ||
|     "top3_acc = correct_top3 / total\n",
 | ||
|     "top5_acc = correct_top5 / total\n",
 | ||
|     "\n",
 | ||
|     "print(f\"Top-1 Accuracy: {top1_acc * 100:.2f}%\")\n",
 | ||
|     "print(f\"Top-3 Accuracy: {top3_acc * 100:.2f}%\")\n",
 | ||
|     "print(f\"Top-5 Accuracy: {top5_acc * 100:.2f}%\")\n"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 38,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Persist only the learned parameters (the state_dict), not the module object itself.\n",
 | ||
|     "torch.save(model.state_dict(), \"mlp_weights.pth\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": null,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Serialise the whole module object rather than just its weights.\n",
 | ||
|     "# NOTE(review): this goes through pickle, so loading presumably needs the MLP class importable under the same name - confirm before relying on this file.\n",
 | ||
|     "torch.save(model, \"mlp_full_model.pth\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 18,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# Reuse the alphabet and sizes from the configuration cell instead of redefining them,\n",
 | ||
|     "# so inference-time encoding cannot drift from the values the model was built with.\n",
 | ||
|     "alphabet = ALPHABET\n",
 | ||
|     "char_to_idx = {ch: idx for idx, ch in enumerate(alphabet)}\n",
 | ||
|     "PAD_IDX = ALPHABET_SIZE  # first index after the letters; shared by padding and out-of-alphabet characters\n",
 | ||
|     "assert PAD_IDX < VOCAB_SIZE, \"PAD_IDX must be a valid embedding row\"\n",
 | ||
|     "\n",
 | ||
|     "idx_to_char = dict(enumerate(alphabet))\n",
 | ||
|     "idx_to_char[PAD_IDX] = \"_\"  # for readability\n",
 | ||
|     "\n",
 | ||
|     "def preprocess_input(context: str) -> torch.Tensor:\n",
 | ||
|     "    \"\"\"Encode ``context`` as a single-row long tensor placed on ``device``.\n",
 | ||
|     "\n",
 | ||
|     "    The text is lower-cased, left-padded with ``_`` up to CONTEXT_SIZE characters\n",
 | ||
|     "    and cut to its last CONTEXT_SIZE characters. Any character missing from\n",
 | ||
|     "    ``char_to_idx`` (the ``_`` padding included) is mapped to PAD_IDX.\n",
 | ||
|     "    \"\"\"\n",
 | ||
|     "    window = context.lower().rjust(CONTEXT_SIZE, \"_\")[-CONTEXT_SIZE:]\n",
 | ||
|     "    encoded = [char_to_idx.get(symbol, PAD_IDX) for symbol in window]\n",
 | ||
|     "    # Wrapping the list once more yields the leading batch dimension of size 1.\n",
 | ||
|     "    batch = torch.tensor([encoded], dtype=torch.long)\n",
 | ||
|     "    return batch.to(device)\n",
 | ||
|     "\n",
 | ||
|     "\n",
 | ||
|     "def predict_next_chars(model, context: str, top_k=5):\n",
 | ||
|     "    \"\"\"Return the ``top_k`` most probable next characters for ``context``.\n",
 | ||
|     "\n",
 | ||
|     "    Args:\n",
 | ||
|     "        model: callable network that maps the tensor from ``preprocess_input`` to one row of logits.\n",
 | ||
|     "        context: text to condition on; it is encoded by ``preprocess_input``.\n",
 | ||
|     "        top_k: number of candidates wanted. It is capped at the number of classes the\n",
 | ||
|     "            model outputs, because asking ``topk`` for more than that raises.\n",
 | ||
|     "\n",
 | ||
|     "    Returns:\n",
 | ||
|     "        A list of ``(character, probability)`` tuples, most probable first.\n",
 | ||
|     "\n",
 | ||
|     "    Side effect: ``model`` is switched to eval mode and left there.\n",
 | ||
|     "    \"\"\"\n",
 | ||
|     "    model.eval()\n",
 | ||
|     "    input_tensor = preprocess_input(context)\n",
 | ||
|     "    with torch.no_grad():\n",
 | ||
|     "        logits = model(input_tensor)\n",
 | ||
|     "        probs = torch.softmax(logits, dim=-1)\n",
 | ||
|     "        k = min(top_k, probs.size(-1))\n",
 | ||
|     "        top_probs, top_indices = probs.topk(k, dim=-1)\n",
 | ||
|     "\n",
 | ||
|     "    # Row 0 is the only row: preprocess_input always builds a batch of one.\n",
 | ||
|     "    predictions = [\n",
 | ||
|     "        (idx_to_char[idx], prob)\n",
 | ||
|     "        for idx, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())\n",
 | ||
|     "    ]\n",
 | ||
|     "    return predictions\n"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 37,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "A: 0.4302\n",
 | ||
|       "T: 0.2897\n",
 | ||
|       "E: 0.1538\n",
 | ||
|       "I: 0.0905\n",
 | ||
|       "C: 0.0159\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# Show the candidate next letters for a sample context, one per line with its probability.\n",
 | ||
|     "preds = predict_next_chars(model, \"doors\")\n",
 | ||
|     "for char, prob in preds:\n",
 | ||
|     "    line = f\"{char.upper()}: {prob:.4f}\"\n",
 | ||
|     "    print(line)\n"
 | ||
|    ]
 | ||
|   }
 | ||
|  ],
 | ||
|  "metadata": {
 | ||
|   "kernelspec": {
 | ||
|    "display_name": ".venv",
 | ||
|    "language": "python",
 | ||
|    "name": "python3"
 | ||
|   },
 | ||
|   "language_info": {
 | ||
|    "codemirror_mode": {
 | ||
|     "name": "ipython",
 | ||
|     "version": 3
 | ||
|    },
 | ||
|    "file_extension": ".py",
 | ||
|    "mimetype": "text/x-python",
 | ||
|    "name": "python",
 | ||
|    "nbconvert_exporter": "python",
 | ||
|    "pygments_lexer": "ipython3",
 | ||
|    "version": "3.12.3"
 | ||
|   }
 | ||
|  },
 | ||
|  "nbformat": 4,
 | ||
|  "nbformat_minor": 2
 | ||
| }
 |