diff --git a/.gitignore b/.gitignore
index 0dbf2f2..dd60191 100644
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,4 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
+data.npy
\ No newline at end of file
diff --git a/notebook.ipynb b/notebook.ipynb
index 74c23ae..8bff233 100644
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -9,12 +9,47 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
- "import pandas as pd\n",
- "data = pd.read_csv(\"./out.txt\", sep=',')"
+ "import numpy as np\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "from torch.utils.data import DataLoader, TensorDataset, random_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = np.load(\"./data.npy\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONTEXT_SIZE = 10\n",
+ "ALPHABET = list(\"abcdefghijklmnopqrstuvwxyz\")\n",
+ "ALPHABET_SIZE = len(ALPHABET)\n",
+ "TRAINING_DATA_SIZE = 0.9\n",
+ "\n",
+ "\n",
+ "# Derived values\n",
+ "PREV_LETTER_FEATURES = CONTEXT_SIZE * ALPHABET_SIZE\n",
+ "CURR_LETTER_FEATURES = ALPHABET_SIZE\n",
+ "OTHER_FEATURES = 3 # is_start, prev_type, word_length\n",
+ "\n",
+ "TOTAL_FEATURES = PREV_LETTER_FEATURES + CURR_LETTER_FEATURES + OTHER_FEATURES\n",
+ "\n",
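+    "# The model input excludes the current letter's one-hot block; that block is the prediction target\n",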
+ "INPUT_SIZE = PREV_LETTER_FEATURES + OTHER_FEATURES\n",
+ "OUTPUT_SIZE = ALPHABET_SIZE"
]
},
{
@@ -33,31 +68,45 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
- "input_features = ['previous_5','previous_4','previous_3','previous_2','previous_1','is_start','previous_type','word_length']\n",
- "target_feature = 'current'"
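+    "# Inputs: the 10 previous-letter one-hots plus the 3 scalar features (skip the current-letter block)\n",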
+ "X = np.hstack([\n",
+ " data[:, :PREV_LETTER_FEATURES],\n",
+ " data[:, PREV_LETTER_FEATURES + CURR_LETTER_FEATURES:TOTAL_FEATURES]\n",
+ "])\n",
+ "\n",
+ "# Extract current letter (one-hot target)\n",
+ "y_onehot = data[:, PREV_LETTER_FEATURES:PREV_LETTER_FEATURES + CURR_LETTER_FEATURES]\n",
+ "y = np.argmax(y_onehot, axis=1)\n",
+ "\n",
+ "# Torch dataset\n",
+ "X_tensor = torch.tensor(X, dtype=torch.float32)\n",
+ "y_tensor = torch.tensor(y, dtype=torch.long)\n",
+ "\n",
+ "dataset = TensorDataset(X_tensor, y_tensor)\n"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.model_selection import train_test_split"
+ "train_len = int(TRAINING_DATA_SIZE * len(dataset))\n",
+ "train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len])"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
- "test_size = 0.1 # @param {\"type\":\"number\",\"placeholder\":\"0.1\"}\n",
- "X_train, X_test, y_train, y_test = train_test_split(data[input_features], data[target_feature], test_size=test_size)"
+ "train_loader = DataLoader(train_set, batch_size=128, shuffle=True)\n",
+ "test_loader = DataLoader(test_set, batch_size=128)"
]
},
{
@@ -69,458 +118,110 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.linear_model import LogisticRegression"
+ "class MLP(nn.Module):\n",
+ " def __init__(self):\n",
+ " super().__init__()\n",
+ " self.net = nn.Sequential(\n",
+ " nn.Linear(INPUT_SIZE, 256),\n",
+ " nn.ReLU(),\n",
+ " nn.Linear(256, 128),\n",
+ " nn.ReLU(),\n",
+ " nn.Linear(128, OUTPUT_SIZE)\n",
+ " )\n",
+ "\n",
+ " def forward(self, x):\n",
+ " return self.net(x)"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 80,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
-       "LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n",
-       "                   solver='saga')\n",
-       "In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.\n",
-       "On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org."
- ],
- "text/plain": [
- "LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n",
- " solver='saga')"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using device: cuda\n"
+ ]
}
],
"source": [
- "model = LogisticRegression(multi_class=\"multinomial\", solver=\"saga\", max_iter=10_000, n_jobs=10)\n",
- "model.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create new model which predicts probability"
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "print(f\"Using device: {device}\")"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
- "y_pred = model.predict(X_test)"
+ "model = MLP().to(device)\n",
+ "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
+ "criterion = nn.CrossEntropyLoss()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1, Loss: 4277.8506\n",
+ "Epoch 2, Loss: 3647.3064\n",
+ "Epoch 3, Loss: 3421.2898\n",
+ "Epoch 4, Loss: 3289.9248\n",
+ "Epoch 5, Loss: 3203.0331\n",
+ "Epoch 6, Loss: 3141.4064\n",
+ "Epoch 7, Loss: 3099.4711\n",
+ "Epoch 8, Loss: 3065.2254\n",
+ "Epoch 9, Loss: 3040.1093\n",
+ "Epoch 10, Loss: 3016.0812\n",
+ "Epoch 11, Loss: 2998.2589\n",
+ "Epoch 12, Loss: 2982.5763\n",
+ "Epoch 13, Loss: 2968.7752\n",
+ "Epoch 14, Loss: 2956.6091\n",
+ "Epoch 15, Loss: 2945.3793\n",
+ "Epoch 16, Loss: 2935.6520\n",
+ "Epoch 17, Loss: 2928.2420\n",
+ "Epoch 18, Loss: 2918.6128\n",
+ "Epoch 19, Loss: 2912.0454\n",
+ "Epoch 20, Loss: 2904.7236\n",
+ "Epoch 21, Loss: 2898.5873\n",
+ "Epoch 22, Loss: 2893.1154\n",
+ "Epoch 23, Loss: 2887.1008\n",
+ "Epoch 24, Loss: 2884.5473\n",
+ "Epoch 25, Loss: 2879.1589\n",
+ "Epoch 26, Loss: 2874.9795\n",
+ "Epoch 27, Loss: 2870.3030\n",
+ "Epoch 28, Loss: 2867.0953\n",
+ "Epoch 29, Loss: 2863.1449\n",
+ "Epoch 30, Loss: 2859.8749\n"
+ ]
+ }
+ ],
+ "source": [
+ "for epoch in range(30):\n",
+ " model.train()\n",
+ " total_loss = 0\n",
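+    "    # total_loss sums the batch losses, so the printed value is a per-epoch sum, not a mean\n",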
+ " for batch_X, batch_y in train_loader:\n",
+ " batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
+ " optimizer.zero_grad()\n",
+ " output = model(batch_X)\n",
+ " loss = criterion(output, batch_y)\n",
+ " loss.backward()\n",
+ " optimizer.step()\n",
+ " total_loss += loss.item()\n",
+ " print(f\"Epoch {epoch+1}, Loss: {total_loss:.4f}\")"
]
},
{
@@ -532,43 +233,56 @@
},
{
"cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics import accuracy_score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "acc = accuracy_score(y_test, y_pred)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Accuracy: 0.211\n"
+ "Top-1 Accuracy: 51.27%\n",
+ "Top-3 Accuracy: 73.68%\n",
+ "Top-5 Accuracy: 82.94%\n"
]
}
],
"source": [
- "print(f\"Accuracy: {acc:.3f}\")"
+ "model.eval()\n",
+ "correct_top1 = 0\n",
+ "correct_top3 = 0\n",
+ "correct_top5 = 0\n",
+ "total = 0\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " for batch_X, batch_y in test_loader:\n",
+ " batch_X, batch_y = batch_X.to(device), batch_y.to(device)\n",
+ " outputs = model(batch_X) # shape: [batch_size, 26]\n",
+ "\n",
+ " # Get top-5 predictions\n",
+ " _, top_preds = outputs.topk(5, dim=1) # shape: [batch_size, 5]\n",
+ "\n",
+ " for true, top5 in zip(batch_y, top_preds):\n",
+ " total += 1\n",
+ " if true == top5[0]:\n",
+ " correct_top1 += 1\n",
+ " if true in top5[:3]:\n",
+ " correct_top3 += 1\n",
+ " if true in top5:\n",
+ " correct_top5 += 1\n",
+ "\n",
+ "top1_acc = correct_top1 / total\n",
+ "top3_acc = correct_top3 / total\n",
+ "top5_acc = correct_top5 / total\n",
+ "\n",
+ "print(f\"Top-1 Accuracy: {top1_acc * 100:.2f}%\")\n",
+ "print(f\"Top-3 Accuracy: {top3_acc * 100:.2f}%\")\n",
+ "print(f\"Top-5 Accuracy: {top5_acc * 100:.2f}%\")\n"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": ".venv",
"language": "python",
"name": "python3"
},
diff --git a/transform.py b/transform.py
new file mode 100755
index 0000000..b7d6b01
--- /dev/null
+++ b/transform.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+from typing import Literal, List, Dict
+import numpy as np
+
+INPUT_FILE: str = "./data/all_cleaned_words.txt"
+OUTPUT_FILE: str = "./data.npy"
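+# Each output row: 10*26 previous-letter one-hots + 26 current-letter one-hots + [is_start, prev_type, word_length] = 289 values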
+
+alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyz")
+vowels: set[str] = set("aeiouy")
+
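+# One-hot row for each letter a-z; characters outside the alphabet (including the "" padding) encode as all zeros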
+char_to_onehot: Dict[str, List[int]] = {
+ ch: [1 if i == idx else 0 for i in range(26)]
+ for idx, ch in enumerate(alphabet)
+}
+empty_vec: List[int] = [0] * 26
+
+
+def get_prev_type(c: str) -> Literal[0, 1, 2]:
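+    # 0 = no previous letter (word start), 1 = vowel ('y' counts as a vowel here), 2 = consonant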
+ if c in vowels:
+ return 1
+ elif c in alphabet:
+ return 2
+ return 0
+
+
+def encode_letter(c: str) -> List[int]:
+ return char_to_onehot.get(c, empty_vec)
+
+
+def build_dataset(input_path: str) -> np.ndarray:
+ all_features: List[List[int]] = []
+
+ with open(input_path, 'r') as input_file:
+ for line in input_file:
+ word: str = line.strip().lower()
+        prev_chars: List[str] = [""] * 10  # sliding window over the 10 previous characters; "" = padding
+
+ for i, curr_char in enumerate(word):
+ features: List[int] = []
+
+ # One-hot encode 10 previous characters
+ for prev in prev_chars:
+ features.extend(encode_letter(prev))
+
+ # One-hot encode current character
+ features.extend(encode_letter(curr_char))
+
+ # Word position features
+ is_start: int = 1 if i == 0 else 0
+ features.append(is_start)
+
+ prev1: str = prev_chars[-1]
+ features.append(get_prev_type(prev1))
+
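+            # i + 1 is the 1-based position of the current letter, i.e. the length of the prefix seen so far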
+ word_length: int = i + 1
+ features.append(word_length)
+
+ all_features.append(features)
+
+ # Shift history
+ prev_chars = prev_chars[1:] + [curr_char]
+
+ return np.array(all_features, dtype=np.int32)
+
+
+if __name__ == "__main__":
+ dataset: np.ndarray = build_dataset(INPUT_FILE)
+ np.save(OUTPUT_FILE, dataset)
+ print(f"Saved dataset shape: {dataset.shape} → {OUTPUT_FILE}")