# Import data

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

In [2]:
# Read the sample matrix from the .npy file that sits next to the notebook.
data = np.load(file="./data.npy")

In [3]:
# --- Vocabulary ---
ALPHABET = list("abcdefghijklmnopqrstuvwxyz")
ALPHABET_SIZE = len(ALPHABET)
VOCAB_SIZE = ALPHABET_SIZE + 1  # 26 letters + 1 extra index for unknown characters

# --- Model input / output dimensions ---
CONTEXT_SIZE = 10  # number of leading columns of each data row used as model input
INPUT_SEQ_LEN = CONTEXT_SIZE
EMBEDDING_DIM = 16
OUTPUT_SIZE = VOCAB_SIZE

# --- Data split ---
TRAINING_DATA_SIZE = 0.9  # fraction of the dataset assigned to the training split

# Define and split data

## Define input and output columns

In [4]:
# Each row of `data` holds the context window followed by the letter to predict:
#   X -> the first CONTEXT_SIZE columns (model input)
#   y -> the column at position CONTEXT_SIZE (target letter index)
X, y = data[:, :CONTEXT_SIZE], data[:, CONTEXT_SIZE]

# Both tensors must be int64: nn.Embedding consumes integer indices and the
# classification target is a class index.
X_tensor, y_tensor = (torch.tensor(column_block, dtype=torch.long) for column_block in (X, y))

dataset = TensorDataset(X_tensor, y_tensor)

In [5]:
# Split the dataset into train / test partitions.
# A dedicated, seeded generator makes the partition identical on every
# Restart & Run All, independent of how much of the global RNG was consumed
# before this cell.
SPLIT_SEED = 42
train_len = int(TRAINING_DATA_SIZE * len(dataset))
test_len = len(dataset) - train_len  # remainder, so the two lengths always sum to len(dataset)
train_set, test_set = random_split(
    dataset,
    [train_len, test_len],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [6]:
# Mini-batches of 128 samples; only the training split is reshuffled each epoch.
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=128, shuffle=False)

# Train on data

In [None]:
class MLP(nn.Module):
    """Embedding followed by a three-layer feed-forward classifier.

    The network embeds every index of a fixed-length window, flattens the
    embeddings into one vector per sample and maps it through two hidden
    layers (256 and 128 units, ReLU) to one logit per output class.

    Args:
        vocab_size: Number of rows in the embedding table. Defaults to the
            notebook-level ``VOCAB_SIZE``.
        embedding_dim: Width of each embedding vector. Defaults to
            ``EMBEDDING_DIM``.
        context_size: Number of indices per input sample. Defaults to
            ``CONTEXT_SIZE``.
        output_size: Number of logits produced per sample. Defaults to
            ``OUTPUT_SIZE``.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, context_size=None, output_size=None):
        super().__init__()
        # Anything not passed explicitly falls back to the notebook configuration,
        # so `MLP()` builds exactly the same network as before.
        vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        embedding_dim = EMBEDDING_DIM if embedding_dim is None else embedding_dim
        context_size = CONTEXT_SIZE if context_size is None else context_size
        output_size = OUTPUT_SIZE if output_size is None else output_size

        # The layer order is part of the saved-weights format: state_dict keys
        # are "net.<position>.weight/bias", so do not reorder these entries.
        self.net = nn.Sequential(
            nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim),
            nn.Flatten(),  # (batch, context, emb) -> (batch, context * emb)
            nn.Linear(context_size * embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_size),
        )

    def forward(self, x):
        """Return the logits for a batch of index windows ``x``."""
        return self.net(x)

In [8]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


 return torch._C._cuda_getDeviceCount() > 0


In [9]:
# Network, loss and optimizer used by the training loop.
model = MLP().to(device)

# Cross-entropy over the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [10]:
# Train for 30 epochs. The value printed after each epoch is the SUM of the
# per-batch loss values over that epoch, not an average.
for epoch in range(30):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 4068.5562
Epoch 2, Loss: 3446.1109
Epoch 3, Loss: 3260.1651
Epoch 4, Loss: 3165.0248
Epoch 5, Loss: 3101.6501
Epoch 6, Loss: 3054.4113
Epoch 7, Loss: 3021.7103
Epoch 8, Loss: 2994.6145
Epoch 9, Loss: 2973.1683
Epoch 10, Loss: 2955.0090
Epoch 11, Loss: 2940.0807
Epoch 12, Loss: 2928.2814
Epoch 13, Loss: 2916.9362
Epoch 14, Loss: 2905.9567
Epoch 15, Loss: 2897.3687
Epoch 16, Loss: 2890.6869
Epoch 17, Loss: 2882.7104
Epoch 18, Loss: 2876.6815
Epoch 19, Loss: 2870.7298
Epoch 20, Loss: 2865.6343
Epoch 21, Loss: 2860.5506
Epoch 22, Loss: 2856.7977
Epoch 23, Loss: 2852.8814
Epoch 24, Loss: 2847.7687
Epoch 25, Loss: 2846.0855
Epoch 26, Loss: 2842.2640
Epoch 27, Loss: 2838.4780
Epoch 28, Loss: 2836.9773
Epoch 29, Loss: 2833.8416
Epoch 30, Loss: 2830.5508


# Testing model

In [12]:
# Measure top-1 / top-3 / top-5 accuracy on the test split.
model.eval()
correct_top1 = 0
correct_top3 = 0
correct_top5 = 0
total = 0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)  # shape: [batch_size, OUTPUT_SIZE]

        # Class indices of the five highest logits per sample, best first.
        # Using `.indices` avoids assigning to `_`, which IPython reserves
        # for the last cell output.
        top_preds = outputs.topk(5, dim=1).indices  # shape: [batch_size, 5]

        # hits[i, j] is True when the j-th ranked prediction of sample i is
        # the true class. topk indices are distinct, so each row holds at
        # most one True and `.any(dim=1)` counts every sample at most once.
        hits = top_preds.eq(batch_y.unsqueeze(1))

        total += batch_y.size(0)
        correct_top1 += hits[:, 0].sum().item()
        correct_top3 += hits[:, :3].any(dim=1).sum().item()
        correct_top5 += hits.any(dim=1).sum().item()

top1_acc = correct_top1 / total
top3_acc = correct_top3 / total
top5_acc = correct_top5 / total

print(f"Top-1 Accuracy: {top1_acc * 100:.2f}%")
print(f"Top-3 Accuracy: {top3_acc * 100:.2f}%")
print(f"Top-5 Accuracy: {top5_acc * 100:.2f}%")


Top-1 Accuracy: 52.77%
Top-3 Accuracy: 74.39%
Top-5 Accuracy: 83.37%


In [38]:
# Persist only the learned parameters (the state dict).
torch.save(obj=model.state_dict(), f="mlp_weights.pth")

In [None]:
# Persist the whole module object, not just its weights.
torch.save(obj=model, f="mlp_full_model.pth")

In [18]:
# Lookup tables for inference.
# NOTE: VOCAB_SIZE and CONTEXT_SIZE repeat the values set in the configuration
# cell; keep the two places in sync.
alphabet = list("abcdefghijklmnopqrstuvwxyz")
CONTEXT_SIZE = 10

PAD_IDX = len(alphabet)  # index 26, shared by padding and out-of-vocabulary characters
VOCAB_SIZE = len(alphabet) + 1  # 27 total (a–z + padding)

# letter -> index and index -> letter
char_to_idx = dict(zip(alphabet, range(len(alphabet))))
idx_to_char = dict(enumerate(alphabet))
idx_to_char[PAD_IDX] = "_"  # printable stand-in for the padding index

def preprocess_input(context: str) -> torch.Tensor:
    """Encode ``context`` as a single-row batch of character indices.

    The text is lower-cased, left-padded with "_" up to ``CONTEXT_SIZE``
    characters and reduced to its last ``CONTEXT_SIZE`` characters. Every
    character that is not a key of ``char_to_idx`` (the "_" filler included)
    is encoded as ``PAD_IDX``.

    Returns:
        An int64 tensor with a leading batch dimension of 1, placed on ``device``.
    """
    window = context.lower().rjust(CONTEXT_SIZE, "_")[-CONTEXT_SIZE:]
    encoded = [char_to_idx.get(symbol, PAD_IDX) for symbol in window]
    batch = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return batch.to(device)


def predict_next_chars(model, context: str, top_k=5):
    """Return the most probable next characters for ``context``.

    Args:
        model: Module that maps the index tensor built by ``preprocess_input``
            to one row of logits per sample.
        context: Text that precedes the character to predict.
        top_k: Maximum number of candidates to return. Values larger than the
            number of classes are capped instead of raising.

    Returns:
        A list of ``(character, probability)`` tuples, most probable first.
    """
    model.eval()
    input_tensor = preprocess_input(context)

    # Run on the device that actually holds the model's weights, which is not
    # necessarily the notebook-level default device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_tensor = input_tensor.to(first_param.device)

    with torch.no_grad():
        logits = model(input_tensor)
        probs = torch.softmax(logits, dim=-1)
        # topk raises when k exceeds the size of the dimension, so cap it.
        k = min(top_k, probs.shape[-1])
        top_probs, top_indices = probs.topk(k, dim=-1)

    # Row 0 is the only sample in the batch.
    predictions = [
        (idx_to_char[idx], prob)
        for idx, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())
    ]
    return predictions


In [37]:
# Print the candidate letters that follow "doors", one per line as "LETTER: probability".
preds = predict_next_chars(model, context="doors")
for char, prob in preds:
    line = f"{char.upper()}: {prob:.4f}"
    print(line)


A: 0.4302
T: 0.2897
E: 0.1538
I: 0.0905
C: 0.0159
