omega/model/transform.py
2025-03-31 23:58:19 +02:00

64 lines
1.9 KiB
Python
Executable File

#!/usr/bin/env python3
from typing import Literal, List, Dict
import numpy as np
# Input corpus: one cleaned, lowercase word per line.
INPUT_FILE: str = "./data/all_cleaned_words.txt"
# Output: int32 feature matrix saved in NumPy binary format.
OUTPUT_FILE: str = "./data.npy"

# The 26 lowercase ASCII letters, in order; index in this list is the letter's code.
alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyz")
# Vowels, including 'y', used to classify the preceding character.
vowels: set[str] = set("aeiouy")
# Letter -> index lookup table (a=0 ... z=25).
char_to_index: Dict[str, int] = dict(zip(alphabet, range(len(alphabet))))
# Index reserved for out-of-vocabulary characters (e.g. the "" padding sentinel).
default_index: int = len(alphabet)
def get_prev_type(c: str) -> Literal[0, 1, 2]:
    """Classify a character: 1 for a vowel (incl. 'y'), 2 for any other
    lowercase letter, 0 for anything else (e.g. the "" padding sentinel)."""
    # Vowels take priority; non-vowel letters fall through to 2.
    return 1 if c in vowels else (2 if c in alphabet else 0)
def encode_letter(c: str) -> int:
    """Return the alphabet index of *c* (a=0 ... z=25), or ``default_index``
    for anything not in the table (e.g. the "" padding sentinel)."""
    try:
        return char_to_index[c]
    except KeyError:
        return default_index
def build_dataset(input_path: str) -> np.ndarray:
    """Build a per-character feature matrix from a word-list file.

    Each line of *input_path* is one word. For every character in every word,
    one row of 11 integers is emitted: the encoded indices of the 10 preceding
    characters (left-padded with the out-of-vocabulary index for the "" sentinel),
    followed by the index of the current character (the classification target).

    Parameters
    ----------
    input_path : str
        Path to the text file with one word per line.

    Returns
    -------
    np.ndarray
        int32 array of shape (total_chars, 11); shape (0, 11) for an empty file.
    """
    all_features: List[List[int]] = []
    # Explicit encoding: the corpus is expected to be UTF-8 text.
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            word: str = line.strip().lower()
            # Sliding window of the 10 most recent characters, oldest first.
            prev_chars: List[str] = [""] * 10
            for curr_char in word:
                # 10 context indices followed by the current (target) index.
                row: List[int] = [encode_letter(prev) for prev in prev_chars]
                row.append(encode_letter(curr_char))
                all_features.append(row)
                # Advance the window: drop the oldest, append the current char.
                prev_chars = prev_chars[1:] + [curr_char]
    if not all_features:
        # Preserve the 11-column layout even when the input is empty.
        return np.empty((0, 11), dtype=np.int32)
    return np.array(all_features, dtype=np.int32)
if __name__ == "__main__":
    # Build the per-character dataset and persist it as a .npy file.
    dataset: np.ndarray = build_dataset(INPUT_FILE)
    np.save(OUTPUT_FILE, dataset)
    # Bug fix: the original f-string ran the shape and the path together
    # with no separator ("...(N, 11)./data.npy"); add an explicit " -> ".
    print(f"Saved dataset shape: {dataset.shape} -> {OUTPUT_FILE}")