omega/model/notebook.ipynb
2025-03-31 23:58:19 +02:00

385 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"from torch.utils.data import DataLoader, TensorDataset, random_split"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data = np.load(\"./data.npy\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"CONTEXT_SIZE = 10\n",
"ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
"ALPHABET_SIZE = len(ALPHABET)\n",
"TRAINING_DATA_SIZE = 0.9\n",
"\n",
"VOCAB_SIZE = ALPHABET_SIZE + 1 # 26 letters + 1 for unknown\n",
"EMBEDDING_DIM = 16\n",
"\n",
"INPUT_SEQ_LEN = CONTEXT_SIZE\n",
"OUTPUT_SIZE = VOCAB_SIZE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define and split data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define input and output columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"X = data[:, :CONTEXT_SIZE] # shape: (num_samples, CONTEXT_SIZE)\n",
"\n",
"# Target: current letter index\n",
"y = data[:, CONTEXT_SIZE] # shape: (num_samples,)\n",
"\n",
"# Torch dataset (important: use long/int64 for indices)\n",
"X_tensor = torch.tensor(X, dtype=torch.long) # for nn.Embedding\n",
"y_tensor = torch.tensor(y, dtype=torch.long) # for classification target\n",
"\n",
"dataset = TensorDataset(X_tensor, y_tensor)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
"train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train_loader = DataLoader(train_set, batch_size=128, shuffle=True)\n",
"test_loader = DataLoader(test_set, batch_size=128)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train on data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class MLP(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.net = nn.Sequential(\n",
" nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM),\n",
" nn.Flatten(),\n",
" nn.Linear(CONTEXT_SIZE * EMBEDDING_DIM, 256),\n",
" nn.ReLU(),\n",
" nn.Linear(256, 128),\n",
" nn.ReLU(),\n",
" nn.Linear(128, OUTPUT_SIZE)\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.net(x)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cpu\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/thastertyn/Code/Skola/4-rocnik/programove-vybaveni/omega/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:129: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.)\n",
" return torch._C._cuda_getDeviceCount() > 0\n"
]
}
],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(f\"Using device: {device}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"model = MLP().to(device)\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
"\n",
"criterion = nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 4068.5562\n",
"Epoch 2, Loss: 3446.1109\n",
"Epoch 3, Loss: 3260.1651\n",
"Epoch 4, Loss: 3165.0248\n",
"Epoch 5, Loss: 3101.6501\n",
"Epoch 6, Loss: 3054.4113\n",
"Epoch 7, Loss: 3021.7103\n",
"Epoch 8, Loss: 2994.6145\n",
"Epoch 9, Loss: 2973.1683\n",
"Epoch 10, Loss: 2955.0090\n",
"Epoch 11, Loss: 2940.0807\n",
"Epoch 12, Loss: 2928.2814\n",
"Epoch 13, Loss: 2916.9362\n",
"Epoch 14, Loss: 2905.9567\n",
"Epoch 15, Loss: 2897.3687\n",
"Epoch 16, Loss: 2890.6869\n",
"Epoch 17, Loss: 2882.7104\n",
"Epoch 18, Loss: 2876.6815\n",
"Epoch 19, Loss: 2870.7298\n",
"Epoch 20, Loss: 2865.6343\n",
"Epoch 21, Loss: 2860.5506\n",
"Epoch 22, Loss: 2856.7977\n",
"Epoch 23, Loss: 2852.8814\n",
"Epoch 24, Loss: 2847.7687\n",
"Epoch 25, Loss: 2846.0855\n",
"Epoch 26, Loss: 2842.2640\n",
"Epoch 27, Loss: 2838.4780\n",
"Epoch 28, Loss: 2836.9773\n",
"Epoch 29, Loss: 2833.8416\n",
"Epoch 30, Loss: 2830.5508\n"
]
}
],
"source": [
"for epoch in range(30):\n",
" model.train()\n",
" total_loss = 0\n",
" for batch_X, batch_y in train_loader:\n",
" batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
" optimizer.zero_grad()\n",
" output = model(batch_X)\n",
" loss = criterion(output, batch_y)\n",
" loss.backward()\n",
" optimizer.step()\n",
" total_loss += loss.item()\n",
" print(f\"Epoch {epoch+1}, Loss: {total_loss:.4f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Testing model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top-1 Accuracy: 52.77%\n",
"Top-3 Accuracy: 74.39%\n",
"Top-5 Accuracy: 83.37%\n"
]
}
],
"source": [
"model.eval()\n",
"correct_top1 = 0\n",
"correct_top3 = 0\n",
"correct_top5 = 0\n",
"total = 0\n",
"\n",
"with torch.no_grad():\n",
" for batch_X, batch_y in test_loader:\n",
" batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
" outputs = model(batch_X) # shape: [batch_size, 26]\n",
"\n",
" # Get top-5 predictions\n",
" _, top_preds = outputs.topk(5, dim=1) # shape: [batch_size, 5]\n",
"\n",
" for true, top5 in zip(batch_y, top_preds):\n",
" total += 1\n",
" if true == top5[0]:\n",
" correct_top1 += 1\n",
" if true in top5[:3]:\n",
" correct_top3 += 1\n",
" if true in top5:\n",
" correct_top5 += 1\n",
"\n",
"top1_acc = correct_top1 / total\n",
"top3_acc = correct_top3 / total\n",
"top5_acc = correct_top5 / total\n",
"\n",
"print(f\"Top-1 Accuracy: {top1_acc * 100:.2f}%\")\n",
"print(f\"Top-3 Accuracy: {top3_acc * 100:.2f}%\")\n",
"print(f\"Top-5 Accuracy: {top5_acc * 100:.2f}%\")\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), \"mlp_weights.pth\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model, \"mlp_full_model.pth\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Reuse same alphabet + mapping\n",
"alphabet = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
"char_to_idx = {ch: idx for idx, ch in enumerate(alphabet)}\n",
"PAD_IDX = len(alphabet) # index 26 for OOV/padding\n",
"VOCAB_SIZE = len(alphabet) + 1 # 27 total (az + padding)\n",
"CONTEXT_SIZE = 10\n",
"\n",
"idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}\n",
"idx_to_char[PAD_IDX] = \"_\" # for readability\n",
"\n",
"def preprocess_input(context: str) -> torch.Tensor:\n",
" context = context.lower()\n",
" padded = context.rjust(CONTEXT_SIZE, \"_\") # pad with underscores (or any 1-char symbol)\n",
"\n",
" indices = []\n",
" for ch in padded[-CONTEXT_SIZE:]:\n",
" idx = char_to_idx.get(ch, PAD_IDX) # if '_' or unknown → PAD_IDX (26)\n",
" indices.append(idx)\n",
"\n",
" return torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)\n",
"\n",
"\n",
"def predict_next_chars(model, context: str, top_k=5):\n",
" model.eval()\n",
" input_tensor = preprocess_input(context)\n",
" with torch.no_grad():\n",
" logits = model(input_tensor)\n",
" probs = torch.softmax(logits, dim=-1)\n",
" top_probs, top_indices = probs.topk(top_k, dim=-1)\n",
"\n",
" predictions = [(idx_to_char[idx.item()], top_probs[0, i].item()) for i, idx in enumerate(top_indices[0])]\n",
" return predictions\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A: 0.4302\n",
"T: 0.2897\n",
"E: 0.1538\n",
"I: 0.0905\n",
"C: 0.0159\n"
]
}
],
"source": [
"preds = predict_next_chars(model, \"doors\")\n",
"for char, prob in preds:\n",
" print(f\"{char.upper()}: {prob:.4f}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}