302 lines
7.6 KiB
Plaintext
302 lines
7.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Import data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import torch\n",
|
|
"import torch.nn as nn\n",
|
|
"from torch.utils.data import DataLoader, TensorDataset, random_split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Load the pre-built sample matrix; later cells slice it column-wise into inputs and targets.\n",
"data = np.load(\"./data.npy\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 54,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Tunable settings\n",
"CONTEXT_SIZE = 10\n",
"ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
"ALPHABET_SIZE = len(ALPHABET)\n",
"TRAINING_DATA_SIZE = 0.9  # fraction of the samples used for the training split\n",
"SEED = 42\n",
"\n",
"# Seed the global RNGs so the random split, batch shuffling and weight\n",
"# initialisation repeat under Restart Kernel -> Run All.\n",
"np.random.seed(SEED)\n",
"torch.manual_seed(SEED)\n",
"\n",
"\n",
"# Derived values (column widths used to slice each row of `data`)\n",
"PREV_LETTER_FEATURES = CONTEXT_SIZE * ALPHABET_SIZE\n",
"CURR_LETTER_FEATURES = ALPHABET_SIZE\n",
"OTHER_FEATURES = 3 # is_start, prev_type, word_length\n",
"\n",
"TOTAL_FEATURES = PREV_LETTER_FEATURES + CURR_LETTER_FEATURES + OTHER_FEATURES\n",
"\n",
"# The current-letter block is the prediction target, so it is excluded from the input width.\n",
"INPUT_SIZE = PREV_LETTER_FEATURES + OTHER_FEATURES\n",
"OUTPUT_SIZE = ALPHABET_SIZE"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Define and split data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Define input and output columns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Fail fast if the loaded array does not have the column layout assumed below;\n",
"# otherwise the slices silently come out with the wrong width.\n",
"assert data.ndim == 2 and data.shape[1] >= TOTAL_FEATURES, (\n",
"    f\"expected a 2-D array with at least {TOTAL_FEATURES} columns, got shape {data.shape}\"\n",
")\n",
"\n",
"# Model input: the previous-letter block plus the trailing extra features.\n",
"# The current-letter block in between is skipped because it is the target.\n",
"X = np.hstack([\n",
"    data[:, :PREV_LETTER_FEATURES],\n",
"    data[:, PREV_LETTER_FEATURES + CURR_LETTER_FEATURES:TOTAL_FEATURES]\n",
"])\n",
"assert X.shape[1] == INPUT_SIZE, f\"input width {X.shape[1]} != INPUT_SIZE {INPUT_SIZE}\"\n",
"\n",
"# Extract current letter (one-hot target)\n",
"y_onehot = data[:, PREV_LETTER_FEATURES:PREV_LETTER_FEATURES + CURR_LETTER_FEATURES]\n",
"# NOTE: argmax returns 0 for a row with no hot entry, i.e. such a row would be labelled as class 0.\n",
"y = np.argmax(y_onehot, axis=1)\n",
"\n",
"# Torch dataset\n",
"X_tensor = torch.tensor(X, dtype=torch.float32)\n",
"y_tensor = torch.tensor(y, dtype=torch.long)  # class indices, as nn.CrossEntropyLoss expects\n",
"\n",
"dataset = TensorDataset(X_tensor, y_tensor)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 55,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Randomly partition the samples: TRAINING_DATA_SIZE of them for training, the remainder for testing.\n",
"n_samples = len(dataset)\n",
"train_len = int(TRAINING_DATA_SIZE * n_samples)\n",
"split_lengths = [train_len, n_samples - train_len]\n",
"train_set, test_set = random_split(dataset, split_lengths)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Training batches are reshuffled every epoch; test batches keep a fixed order.\n",
"train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)\n",
"test_loader = DataLoader(dataset=test_set, batch_size=128, shuffle=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Train on data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 79,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"class MLP(nn.Module):\n",
"    \"\"\"Fully connected classifier: a Linear/ReLU hidden stack followed by a Linear output layer.\n",
"\n",
"    Args:\n",
"        input_size: Width of each input row; defaults to the notebook's INPUT_SIZE.\n",
"        hidden_sizes: Widths of the hidden layers, in order; defaults to (256, 128).\n",
"        output_size: Number of output scores per row; defaults to the notebook's OUTPUT_SIZE.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, input_size=None, hidden_sizes=(256, 128), output_size=None):\n",
"        super().__init__()\n",
"        # Resolve the notebook constants at construction time, so re-running the\n",
"        # config cell takes effect without having to redefine this class.\n",
"        in_features = INPUT_SIZE if input_size is None else input_size\n",
"        out_features = OUTPUT_SIZE if output_size is None else output_size\n",
"\n",
"        layers = []\n",
"        for width in hidden_sizes:\n",
"            layers.append(nn.Linear(in_features, width))\n",
"            layers.append(nn.ReLU())\n",
"            in_features = width\n",
"        layers.append(nn.Linear(in_features, out_features))\n",
"\n",
"        # Same attribute name and layer order as before, so state_dict keys are unchanged.\n",
"        self.net = nn.Sequential(*layers)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the raw class scores (no softmax applied) for the batch ``x``.\"\"\"\n",
"        return self.net(x)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Using device: cuda\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.\n",
"if torch.cuda.is_available():\n",
"    device = torch.device(\"cuda\")\n",
"else:\n",
"    device = torch.device(\"cpu\")\n",
"print(f\"Using device: {device}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Build the network on the selected device, then the loss and the optimiser that updates its weights.\n",
"model = MLP()\n",
"model.to(device)\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 82,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1, Loss: 4277.8506\n",
|
|
"Epoch 2, Loss: 3647.3064\n",
|
|
"Epoch 3, Loss: 3421.2898\n",
|
|
"Epoch 4, Loss: 3289.9248\n",
|
|
"Epoch 5, Loss: 3203.0331\n",
|
|
"Epoch 6, Loss: 3141.4064\n",
|
|
"Epoch 7, Loss: 3099.4711\n",
|
|
"Epoch 8, Loss: 3065.2254\n",
|
|
"Epoch 9, Loss: 3040.1093\n",
|
|
"Epoch 10, Loss: 3016.0812\n",
|
|
"Epoch 11, Loss: 2998.2589\n",
|
|
"Epoch 12, Loss: 2982.5763\n",
|
|
"Epoch 13, Loss: 2968.7752\n",
|
|
"Epoch 14, Loss: 2956.6091\n",
|
|
"Epoch 15, Loss: 2945.3793\n",
|
|
"Epoch 16, Loss: 2935.6520\n",
|
|
"Epoch 17, Loss: 2928.2420\n",
|
|
"Epoch 18, Loss: 2918.6128\n",
|
|
"Epoch 19, Loss: 2912.0454\n",
|
|
"Epoch 20, Loss: 2904.7236\n",
|
|
"Epoch 21, Loss: 2898.5873\n",
|
|
"Epoch 22, Loss: 2893.1154\n",
|
|
"Epoch 23, Loss: 2887.1008\n",
|
|
"Epoch 24, Loss: 2884.5473\n",
|
|
"Epoch 25, Loss: 2879.1589\n",
|
|
"Epoch 26, Loss: 2874.9795\n",
|
|
"Epoch 27, Loss: 2870.3030\n",
|
|
"Epoch 28, Loss: 2867.0953\n",
|
|
"Epoch 29, Loss: 2863.1449\n",
|
|
"Epoch 30, Loss: 2859.8749\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Train for 30 passes over the training set.\n",
"for epoch in range(30):\n",
"    model.train()\n",
"    batch_losses = []\n",
"    for inputs, targets in train_loader:\n",
"        inputs = inputs.to(device)\n",
"        targets = targets.to(device)\n",
"\n",
"        optimizer.zero_grad()\n",
"        batch_loss = criterion(model(inputs), targets)\n",
"        batch_loss.backward()\n",
"        optimizer.step()\n",
"\n",
"        batch_losses.append(batch_loss.item())\n",
"\n",
"    # The reported value is the SUM of the per-batch losses for the epoch, not a mean.\n",
"    total_loss = sum(batch_losses)\n",
"    print(f\"Epoch {epoch+1}, Loss: {total_loss:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Testing model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Top-1 Accuracy: 51.27%\n",
|
|
"Top-3 Accuracy: 73.68%\n",
|
|
"Top-5 Accuracy: 82.94%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Measure top-1 / top-3 / top-5 accuracy on the held-out split.\n",
"model.eval()\n",
"correct_top1 = 0\n",
"correct_top3 = 0\n",
"correct_top5 = 0\n",
"total = 0\n",
"\n",
"with torch.no_grad():\n",
"    for batch_X, batch_y in test_loader:\n",
"        batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
"        outputs = model(batch_X)  # shape: [batch_size, 26]\n",
"\n",
"        # Get top-5 predictions (class indices, highest score first)\n",
"        _, top_preds = outputs.topk(5, dim=1)  # shape: [batch_size, 5]\n",
"\n",
"        # hits[i, j] is True when the j-th ranked prediction for sample i equals its label.\n",
"        # Comparing the whole batch at once avoids a per-sample Python loop, where every\n",
"        # `==` / `in` check on a device tensor forces a separate host synchronisation.\n",
"        hits = top_preds.eq(batch_y.unsqueeze(1))\n",
"\n",
"        total += batch_y.size(0)\n",
"        correct_top1 += hits[:, 0].sum().item()\n",
"        correct_top3 += hits[:, :3].any(dim=1).sum().item()\n",
"        correct_top5 += hits.any(dim=1).sum().item()\n",
"\n",
"# An empty test split would otherwise surface as an unexplained ZeroDivisionError below.\n",
"assert total > 0, \"test_loader yielded no samples\"\n",
"\n",
"top1_acc = correct_top1 / total\n",
"top3_acc = correct_top3 / total\n",
"top5_acc = correct_top5 / total\n",
"\n",
"print(f\"Top-1 Accuracy: {top1_acc * 100:.2f}%\")\n",
"print(f\"Top-3 Accuracy: {top3_acc * 100:.2f}%\")\n",
"print(f\"Top-5 Accuracy: {top5_acc * 100:.2f}%\")\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|