#!/usr/bin/env python3
"""Build a fixed-width character-context dataset from a cleaned word list.

Each output row encodes one character position of a word: the indices of
the previous 10 characters (left-padded with an out-of-vocabulary token
for positions before the word start) followed by the current character's
index.
"""
from typing import Literal, List, Dict

import numpy as np

INPUT_FILE: str = "./data/all_cleaned_words.txt"
OUTPUT_FILE: str = "./data.npy"

# Number of preceding characters encoded into every feature row.
CONTEXT_SIZE: int = 10

alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyz")
vowels: set[str] = set("aeiouy")
char_to_index: Dict[str, int] = {ch: idx for idx, ch in enumerate(alphabet)}
default_index: int = len(alphabet)  # Out-of-vocabulary token (e.g., for "")


def get_prev_type(c: str) -> Literal[0, 1, 2]:
    """Classify a character: 1 = vowel (incl. 'y'), 2 = consonant, 0 = other/empty."""
    if c in vowels:
        return 1
    # O(1) dict-key test instead of an O(n) scan of the alphabet list;
    # char_to_index has exactly the same 26 keys, so behavior is unchanged.
    if c in char_to_index:
        return 2
    return 0


def encode_letter(c: str) -> int:
    """Map a lowercase a-z letter to its 0-25 index; anything else to the OOV index."""
    return char_to_index.get(c, default_index)


def build_dataset(input_path: str) -> np.ndarray:
    """Read one word per line from *input_path* and return an int32 feature array.

    Returns an array of shape (total_characters, CONTEXT_SIZE + 1): each
    row holds the encoded previous CONTEXT_SIZE characters followed by
    the current character's index. Blank lines contribute no rows.

    Raises OSError if *input_path* cannot be opened.
    """
    all_features: List[List[int]] = []
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            word: str = line.strip().lower()
            # Sliding window over the previous CONTEXT_SIZE characters,
            # left-padded with "" (which encodes to the OOV index).
            prev_chars: List[str] = [""] * CONTEXT_SIZE
            for curr_char in word:
                features = [encode_letter(prev) for prev in prev_chars]
                # Current char index appended last (classification target).
                features.append(encode_letter(curr_char))
                all_features.append(features)
                prev_chars = prev_chars[1:] + [curr_char]
    if not all_features:
        # Preserve a consistent 2-D shape even for an empty input file
        # (np.array([]) would otherwise yield shape (0,)).
        return np.empty((0, CONTEXT_SIZE + 1), dtype=np.int32)
    return np.array(all_features, dtype=np.int32)


if __name__ == "__main__":
    dataset: np.ndarray = build_dataset(INPUT_FILE)
    np.save(OUTPUT_FILE, dataset)
    print(f"Saved dataset shape: {dataset.shape} → {OUTPUT_FILE}")