From e44a6cf86e2ac770cd6e47482743e09b4996411e Mon Sep 17 00:00:00 2001 From: Thastertyn Date: Sun, 23 Mar 2025 22:09:28 +0100 Subject: [PATCH] Switched to pytorch with multi layer perceptron --- .gitignore | 1 + notebook.ipynb | 641 ++++++++++++++----------------------------------- transform.py | 70 ++++++ 3 files changed, 247 insertions(+), 465 deletions(-) create mode 100755 transform.py diff --git a/.gitignore b/.gitignore index 0dbf2f2..dd60191 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +data.npy \ No newline at end of file diff --git a/notebook.ipynb b/notebook.ipynb index 74c23ae..8bff233 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -9,12 +9,46 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "data = pd.read_csv(\"./out.txt\", sep=',')" + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import DataLoader, TensorDataset, random_split" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "data = np.load(\"./data.npy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "CONTEXT_SIZE = 10\n", + "ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n", + "ALPHABET_SIZE = len(ALPHABET)\n", + "TRAINING_DATA_SIZE = 0.9\n", + "\n", + "\n", + "# Derived values\n", + "PREV_LETTER_FEATURES = CONTEXT_SIZE * ALPHABET_SIZE\n", + "CURR_LETTER_FEATURES = ALPHABET_SIZE\n", + "OTHER_FEATURES = 3 # is_start, prev_type, word_length\n", + "\n", + "TOTAL_FEATURES = PREV_LETTER_FEATURES + CURR_LETTER_FEATURES + OTHER_FEATURES\n", + "\n", + "INPUT_SIZE = PREV_LETTER_FEATURES + OTHER_FEATURES\n", + "OUTPUT_SIZE = ALPHABET_SIZE" ] }, { @@ -33,31 +67,44 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ - "input_features = ['previous_5','previous_4','previous_3','previous_2','previous_1','is_start','previous_type','word_length']\n", - "target_feature = 'current'" + "X = np.hstack([\n", + " data[:, :PREV_LETTER_FEATURES],\n", + " data[:, PREV_LETTER_FEATURES + CURR_LETTER_FEATURES:TOTAL_FEATURES]\n", + "])\n", + "\n", + "# Extract current letter (one-hot target)\n", + "y_onehot = data[:, PREV_LETTER_FEATURES:PREV_LETTER_FEATURES + CURR_LETTER_FEATURES]\n", + "y = np.argmax(y_onehot, axis=1)\n", + "\n", + "# Torch dataset\n", + "X_tensor = torch.tensor(X, dtype=torch.float32)\n", + "y_tensor = torch.tensor(y, dtype=torch.long)\n", + "\n", + "dataset = TensorDataset(X_tensor, y_tensor)\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split" + "train_len = int(TRAINING_DATA_SIZE * len(dataset))\n", + "train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len])" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ - "test_size = 0.1 # @param {\"type\":\"number\",\"placeholder\":\"0.1\"}\n", - "X_train, X_test, y_train, y_test = train_test_split(data[input_features], data[target_feature], test_size=test_size)" + "train_loader = DataLoader(train_set, batch_size=128, shuffle=True)\n", + "test_loader = DataLoader(test_set, batch_size=128)" ] 
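Note on the feature slicing above: it assumes the column layout produced by transform.py, i.e. 260 columns for the 10 one-hot previous letters, 26 columns for the one-hot current letter, and finally is_start, prev_type and word_length. A minimal sketch for spot-checking that layout is below; decode_row is a hypothetical helper for illustration only and is not part of this patch.

import numpy as np

ALPHABET = list("abcdefghijklmnopqrstuvwxyz")
data = np.load("./data.npy")

def decode_row(row):
    # Turn one 289-column row back into readable pieces, assuming the
    # transform.py layout: 10 one-hot previous letters, one-hot current
    # letter, then is_start, prev_type, word_length.
    prev = []
    for i in range(10):
        chunk = row[i * 26:(i + 1) * 26]
        prev.append(ALPHABET[chunk.argmax()] if chunk.any() else "_")
    curr = ALPHABET[row[260:286].argmax()]
    return "".join(prev), curr, int(row[286]), int(row[287]), int(row[288])

print(decode_row(data[0]))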
}, { @@ -69,458 +116,109 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression" + "class MLP(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.net = nn.Sequential(\n", + " nn.Linear(INPUT_SIZE, 256),\n", + " nn.ReLU(),\n", + " nn.Linear(256, 128),\n", + " nn.ReLU(),\n", + " nn.Linear(128, OUTPUT_SIZE)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.net(x)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 80, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n", - " solver='saga')" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] } ], "source": [ - "model = LogisticRegression(multi_class=\"multinomial\", solver=\"saga\", max_iter=10_000, n_jobs=10)\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create new model which predicts probability" + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "print(f\"Using device: {device}\")" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ - "y_pred = model.predict(X_test)" + "model = MLP().to(device)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n", + "criterion = nn.CrossEntropyLoss()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1, Loss: 4277.8506\n", + "Epoch 2, Loss: 3647.3064\n", + "Epoch 3, Loss: 3421.2898\n", + "Epoch 4, Loss: 3289.9248\n", + "Epoch 5, Loss: 3203.0331\n", + "Epoch 6, Loss: 3141.4064\n", + "Epoch 7, Loss: 3099.4711\n", + "Epoch 8, Loss: 3065.2254\n", + "Epoch 9, Loss: 3040.1093\n", + "Epoch 10, Loss: 3016.0812\n", + "Epoch 11, Loss: 2998.2589\n", + "Epoch 12, Loss: 2982.5763\n", + "Epoch 13, Loss: 2968.7752\n", + "Epoch 14, Loss: 2956.6091\n", + "Epoch 15, Loss: 2945.3793\n", + "Epoch 16, Loss: 2935.6520\n", + "Epoch 17, Loss: 2928.2420\n", + "Epoch 18, Loss: 2918.6128\n", + "Epoch 19, Loss: 2912.0454\n", + "Epoch 20, Loss: 2904.7236\n", + "Epoch 21, Loss: 2898.5873\n", + "Epoch 22, Loss: 2893.1154\n", + "Epoch 23, Loss: 2887.1008\n", + "Epoch 24, Loss: 2884.5473\n", + "Epoch 25, Loss: 2879.1589\n", + "Epoch 26, Loss: 2874.9795\n", + "Epoch 27, Loss: 2870.3030\n", + "Epoch 28, Loss: 2867.0953\n", + "Epoch 29, Loss: 2863.1449\n", + "Epoch 30, Loss: 2859.8749\n" + ] + } + ], + "source": [ + "for epoch in range(30):\n", + " model.train()\n", + " total_loss = 0\n", + " for batch_X, batch_y in train_loader:\n", + " batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n", + " optimizer.zero_grad()\n", + " output = model(batch_X)\n", + " loss = criterion(output, batch_y)\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_loss += loss.item()\n", + " print(f\"Epoch {epoch+1}, Loss: {total_loss:.4f}\")" ] }, { @@ -532,43 +230,56 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "acc = accuracy_score(y_test, y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.211\n" + "Top-1 Accuracy: 51.27%\n", + "Top-3 Accuracy: 73.68%\n", + "Top-5 Accuracy: 82.94%\n" ] } ], "source": [ - "print(f\"Accuracy: {acc:.3f}\")" + "model.eval()\n", + "correct_top1 = 0\n", + "correct_top3 = 0\n", + "correct_top5 = 0\n", + "total = 0\n", + "\n", + "with torch.no_grad():\n", + " for batch_X, batch_y in test_loader:\n", + " batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n", + " outputs = model(batch_X) # shape: 
[batch_size, 26]\n", + "\n", + " # Get top-5 predictions\n", + " _, top_preds = outputs.topk(5, dim=1) # shape: [batch_size, 5]\n", + "\n", + " for true, top5 in zip(batch_y, top_preds):\n", + " total += 1\n", + " if true == top5[0]:\n", + " correct_top1 += 1\n", + " if true in top5[:3]:\n", + " correct_top3 += 1\n", + " if true in top5:\n", + " correct_top5 += 1\n", + "\n", + "top1_acc = correct_top1 / total\n", + "top3_acc = correct_top3 / total\n", + "top5_acc = correct_top5 / total\n", + "\n", + "print(f\"Top-1 Accuracy: {top1_acc * 100:.2f}%\")\n", + "print(f\"Top-3 Accuracy: {top3_acc * 100:.2f}%\")\n", + "print(f\"Top-5 Accuracy: {top5_acc * 100:.2f}%\")\n" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, diff --git a/transform.py b/transform.py new file mode 100755 index 0000000..b7d6b01 --- /dev/null +++ b/transform.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +from typing import Literal, List, Dict +import numpy as np + +INPUT_FILE: str = "./data/all_cleaned_words.txt" +OUTPUT_FILE: str = "./data.npy" + +alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyz") +vowels: set[str] = set("aeiouy") + +char_to_onehot: Dict[str, List[int]] = { + ch: [1 if i == idx else 0 for i in range(26)] + for idx, ch in enumerate(alphabet) +} +empty_vec: List[int] = [0] * 26 + + +def get_prev_type(c: str) -> Literal[0, 1, 2]: + if c in vowels: + return 1 + elif c in alphabet: + return 2 + return 0 + + +def encode_letter(c: str) -> List[int]: + return char_to_onehot.get(c, empty_vec) + + +def build_dataset(input_path: str) -> np.ndarray: + all_features: List[List[int]] = [] + + with open(input_path, 'r') as input_file: + for line in input_file: + word: str = line.strip().lower() + prev_chars: List[str] = [""] * 10 # Updated: now 10-character context + + for i, curr_char in enumerate(word): + features: List[int] = [] + + # One-hot encode 10 previous characters + for prev in prev_chars: + features.extend(encode_letter(prev)) + + # One-hot encode current character + features.extend(encode_letter(curr_char)) + + # Word position features + is_start: int = 1 if i == 0 else 0 + features.append(is_start) + + prev1: str = prev_chars[-1] + features.append(get_prev_type(prev1)) + + word_length: int = i + 1 + features.append(word_length) + + all_features.append(features) + + # Shift history + prev_chars = prev_chars[1:] + [curr_char] + + return np.array(all_features, dtype=np.int32) + + +if __name__ == "__main__": + dataset: np.ndarray = build_dataset(INPUT_FILE) + np.save(OUTPUT_FILE, dataset) + print(f"Saved dataset shape: {dataset.shape} → {OUTPUT_FILE}")
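Since the notebook only evaluates the network on held-out rows of data.npy, a rough sketch of querying the trained MLP for an actual prefix is given below, reusing the encoding rules from transform.py. encode_prefix is a hypothetical helper, and the commented usage assumes the model, device and constants defined in the notebook; none of this is part of the patch itself.

import torch

ALPHABET = list("abcdefghijklmnopqrstuvwxyz")
VOWELS = set("aeiouy")

def encode_prefix(prefix: str) -> torch.Tensor:
    # Build the 263-feature input for the letter that follows `prefix`,
    # mirroring transform.py: 10 one-hot previous letters, then
    # is_start, prev_type, word_length.
    prev_chars = ([""] * 10 + list(prefix.lower()))[-10:]
    features = []
    for ch in prev_chars:
        onehot = [0] * 26
        if ch in ALPHABET:
            onehot[ALPHABET.index(ch)] = 1
        features.extend(onehot)
    last = prefix[-1:].lower()
    features.append(1 if len(prefix) == 0 else 0)                             # is_start
    features.append(1 if last in VOWELS else (2 if last in ALPHABET else 0))  # prev_type
    features.append(len(prefix) + 1)                                          # word_length
    return torch.tensor(features, dtype=torch.float32)

# Hypothetical usage with the trained model from the notebook:
# model.eval()
# with torch.no_grad():
#     logits = model(encode_prefix("hel").unsqueeze(0).to(device))
# top5 = logits.topk(5, dim=1).indices[0].tolist()
# print([ALPHABET[i] for i in top5])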