Added notebook for training the model and the data itself
This commit is contained in:
parent
ba1f05a1af
commit
13884a933b
16
README.md
16
README.md
@ -1,5 +1,21 @@
|
||||
# omega
|
||||
|
||||
## Documentation
|
||||
First, I gathered a couple of long text sources, such as the GNU GPL license text, Wikipedia articles, and even a book.
|
||||
|
||||
Those were transformed into one large text file (see [all_words.txt](data/all_words.txt)) using the following command:
|
||||
|
||||
```
|
||||
grep -o "[[:alpha:]]\{1,\}" "path_to_individual_source.txt" | tr '[:upper:]' '[:lower:]'
|
||||
```
|
||||
|
||||
This simply extracts every run of one or more alphabetic characters (i.e. every word at least 1 character long), one per line, and unifies them by converting them all to lowercase.
|
||||
|
||||
To give the model as much useful context as possible, I calculated the average word length (5.819) with the command below and went with a character history of 5 letters. This is the default for now; the extra history columns can easily be dropped from the data if they turn out to be excessive.
|
||||
```
|
||||
awk '{ total += length; count++ } END { if (count > 0) print total / count }' 1000_words.txt
|
||||
```
|
||||
|
||||
## Sources
|
||||
1. Generic news articles
|
||||
- https://edition.cnn.com/2025/03/20/middleeast/ronen-bar-shin-bet-israel-vote-dismiss-intl-latam/index.html
|
||||
|
590
notebook.ipynb
Normal file
590
notebook.ipynb
Normal file
@ -0,0 +1,590 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Import data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"data = pd.read_csv(\"./out.txt\", sep=',')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Define and split data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Define input and output columns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_features = ['previous_5','previous_4','previous_3','previous_2','previous_1','is_start','previous_type','word_length']\n",
|
||||
"target_feature = 'current'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_size = 0.1 # @param {\"type\":\"number\",\"placeholder\":\"0.1\"}\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data[input_features], data[target_feature], test_size=test_size, random_state=42)  # fixed seed: same split (and comparable accuracy) on every Restart & Run All"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train on data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.linear_model import LogisticRegression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style>#sk-container-id-9 {\n",
|
||||
" /* Definition of color scheme common for light and dark mode */\n",
|
||||
" --sklearn-color-text: black;\n",
|
||||
" --sklearn-color-line: gray;\n",
|
||||
" /* Definition of color scheme for unfitted estimators */\n",
|
||||
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
|
||||
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
|
||||
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
|
||||
" --sklearn-color-unfitted-level-3: chocolate;\n",
|
||||
" /* Definition of color scheme for fitted estimators */\n",
|
||||
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
|
||||
" --sklearn-color-fitted-level-1: #d4ebff;\n",
|
||||
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
|
||||
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
|
||||
"\n",
|
||||
" /* Specific color for light theme */\n",
|
||||
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
||||
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
|
||||
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
||||
" --sklearn-color-icon: #696969;\n",
|
||||
"\n",
|
||||
" @media (prefers-color-scheme: dark) {\n",
|
||||
" /* Redefinition of color scheme for dark theme */\n",
|
||||
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
||||
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
|
||||
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
||||
" --sklearn-color-icon: #878787;\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 pre {\n",
|
||||
" padding: 0;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 input.sk-hidden--visually {\n",
|
||||
" border: 0;\n",
|
||||
" clip: rect(1px 1px 1px 1px);\n",
|
||||
" clip: rect(1px, 1px, 1px, 1px);\n",
|
||||
" height: 1px;\n",
|
||||
" margin: -1px;\n",
|
||||
" overflow: hidden;\n",
|
||||
" padding: 0;\n",
|
||||
" position: absolute;\n",
|
||||
" width: 1px;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-dashed-wrapped {\n",
|
||||
" border: 1px dashed var(--sklearn-color-line);\n",
|
||||
" margin: 0 0.4em 0.5em 0.4em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" padding-bottom: 0.4em;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-container {\n",
|
||||
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
|
||||
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
|
||||
" so we also need the `!important` here to be able to override the\n",
|
||||
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
|
||||
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
|
||||
" display: inline-block !important;\n",
|
||||
" position: relative;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-text-repr-fallback {\n",
|
||||
" display: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"div.sk-parallel-item,\n",
|
||||
"div.sk-serial,\n",
|
||||
"div.sk-item {\n",
|
||||
" /* draw centered vertical line to link estimators */\n",
|
||||
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
|
||||
" background-size: 2px 100%;\n",
|
||||
" background-repeat: no-repeat;\n",
|
||||
" background-position: center center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Parallel-specific style estimator block */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel-item::after {\n",
|
||||
" content: \"\";\n",
|
||||
" width: 100%;\n",
|
||||
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
|
||||
" flex-grow: 1;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel {\n",
|
||||
" display: flex;\n",
|
||||
" align-items: stretch;\n",
|
||||
" justify-content: center;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" position: relative;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel-item {\n",
|
||||
" display: flex;\n",
|
||||
" flex-direction: column;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel-item:first-child::after {\n",
|
||||
" align-self: flex-end;\n",
|
||||
" width: 50%;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel-item:last-child::after {\n",
|
||||
" align-self: flex-start;\n",
|
||||
" width: 50%;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-parallel-item:only-child::after {\n",
|
||||
" width: 0;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Serial-specific style estimator block */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-serial {\n",
|
||||
" display: flex;\n",
|
||||
" flex-direction: column;\n",
|
||||
" align-items: center;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" padding-right: 1em;\n",
|
||||
" padding-left: 1em;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
|
||||
"clickable and can be expanded/collapsed.\n",
|
||||
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
|
||||
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
|
||||
"*/\n",
|
||||
"\n",
|
||||
"/* Pipeline and ColumnTransformer style (default) */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-toggleable {\n",
|
||||
" /* Default theme specific background. It is overwritten whether we have a\n",
|
||||
" specific estimator or a Pipeline/ColumnTransformer */\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Toggleable label */\n",
|
||||
"#sk-container-id-9 label.sk-toggleable__label {\n",
|
||||
" cursor: pointer;\n",
|
||||
" display: block;\n",
|
||||
" width: 100%;\n",
|
||||
" margin-bottom: 0;\n",
|
||||
" padding: 0.5em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" text-align: center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 label.sk-toggleable__label-arrow:before {\n",
|
||||
" /* Arrow on the left of the label */\n",
|
||||
" content: \"▸\";\n",
|
||||
" float: left;\n",
|
||||
" margin-right: 0.25em;\n",
|
||||
" color: var(--sklearn-color-icon);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 label.sk-toggleable__label-arrow:hover:before {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Toggleable content - dropdown */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-toggleable__content {\n",
|
||||
" max-height: 0;\n",
|
||||
" max-width: 0;\n",
|
||||
" overflow: hidden;\n",
|
||||
" text-align: left;\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-toggleable__content.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-toggleable__content pre {\n",
|
||||
" margin: 0.2em;\n",
|
||||
" border-radius: 0.25em;\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-toggleable__content.fitted pre {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
|
||||
" /* Expand drop-down */\n",
|
||||
" max-height: 200px;\n",
|
||||
" max-width: 100%;\n",
|
||||
" overflow: auto;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
|
||||
" content: \"▾\";\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Pipeline/ColumnTransformer-specific style */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator-specific style */\n",
|
||||
"\n",
|
||||
"/* Colorize estimator box */\n",
|
||||
"#sk-container-id-9 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-label label.sk-toggleable__label,\n",
|
||||
"#sk-container-id-9 div.sk-label label {\n",
|
||||
" /* The background is the default theme color */\n",
|
||||
" color: var(--sklearn-color-text-on-default-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover, darken the color of the background */\n",
|
||||
"#sk-container-id-9 div.sk-label:hover label.sk-toggleable__label {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Label box, darken color on hover, fitted */\n",
|
||||
"#sk-container-id-9 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator label */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-label label {\n",
|
||||
" font-family: monospace;\n",
|
||||
" font-weight: bold;\n",
|
||||
" display: inline-block;\n",
|
||||
" line-height: 1.2em;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-label-container {\n",
|
||||
" text-align: center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator-specific */\n",
|
||||
"#sk-container-id-9 div.sk-estimator {\n",
|
||||
" font-family: monospace;\n",
|
||||
" border: 1px dotted var(--sklearn-color-border-box);\n",
|
||||
" border-radius: 0.25em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" margin-bottom: 0.5em;\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-estimator.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* on hover */\n",
|
||||
"#sk-container-id-9 div.sk-estimator:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 div.sk-estimator.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
|
||||
"\n",
|
||||
"/* Common style for \"i\" and \"?\" */\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link,\n",
|
||||
"a:link.sk-estimator-doc-link,\n",
|
||||
"a:visited.sk-estimator-doc-link {\n",
|
||||
" float: right;\n",
|
||||
" font-size: smaller;\n",
|
||||
" line-height: 1em;\n",
|
||||
" font-family: monospace;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" border-radius: 1em;\n",
|
||||
" height: 1em;\n",
|
||||
" width: 1em;\n",
|
||||
" text-decoration: none !important;\n",
|
||||
" margin-left: 1ex;\n",
|
||||
" /* unfitted */\n",
|
||||
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-unfitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link.fitted,\n",
|
||||
"a:link.sk-estimator-doc-link.fitted,\n",
|
||||
"a:visited.sk-estimator-doc-link.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-fitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover */\n",
|
||||
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
|
||||
".sk-estimator-doc-link:hover,\n",
|
||||
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
|
||||
".sk-estimator-doc-link:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
|
||||
".sk-estimator-doc-link.fitted:hover,\n",
|
||||
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
|
||||
".sk-estimator-doc-link.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Span, style for the box shown on hovering the info icon */\n",
|
||||
".sk-estimator-doc-link span {\n",
|
||||
" display: none;\n",
|
||||
" z-index: 9999;\n",
|
||||
" position: relative;\n",
|
||||
" font-weight: normal;\n",
|
||||
" right: .2ex;\n",
|
||||
" padding: .5ex;\n",
|
||||
" margin: .5ex;\n",
|
||||
" width: min-content;\n",
|
||||
" min-width: 20ex;\n",
|
||||
" max-width: 50ex;\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" box-shadow: 2pt 2pt 4pt #999;\n",
|
||||
" /* unfitted */\n",
|
||||
" background: var(--sklearn-color-unfitted-level-0);\n",
|
||||
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link.fitted span {\n",
|
||||
" /* fitted */\n",
|
||||
" background: var(--sklearn-color-fitted-level-0);\n",
|
||||
" border: var(--sklearn-color-fitted-level-3);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link:hover span {\n",
|
||||
" display: block;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 a.estimator_doc_link {\n",
|
||||
" float: right;\n",
|
||||
" font-size: 1rem;\n",
|
||||
" line-height: 1em;\n",
|
||||
" font-family: monospace;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" border-radius: 1rem;\n",
|
||||
" height: 1rem;\n",
|
||||
" width: 1rem;\n",
|
||||
" text-decoration: none;\n",
|
||||
" /* unfitted */\n",
|
||||
" color: var(--sklearn-color-unfitted-level-1);\n",
|
||||
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 a.estimator_doc_link.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-fitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover */\n",
|
||||
"#sk-container-id-9 a.estimator_doc_link:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-9 a.estimator_doc_link.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
||||
"}\n",
|
||||
"</style><div id=\"sk-container-id-9\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n",
|
||||
" solver='saga')</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-9\" type=\"checkbox\" checked><label for=\"sk-estimator-id-9\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n",
|
||||
" solver='saga')</pre></div> </div></div></div></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"LogisticRegression(max_iter=10000, multi_class='multinomial', n_jobs=10,\n",
|
||||
" solver='saga')"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = LogisticRegression(multi_class=\"multinomial\", solver=\"saga\", max_iter=10_000, n_jobs=10)\n",
|
||||
"model.fit(X_train, y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Predict labels for the held-out test set"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Testing model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc = accuracy_score(y_test, y_pred)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Accuracy: 0.211\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f\"Accuracy: {acc:.3f}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
72
transform.sh
72
transform.sh
@ -1,35 +1,65 @@
|
||||
#!/bin/bash
|
||||
|
||||
FILE_PATH="./words.txt"
|
||||
|
||||
FILE_PATH="./1000_words.txt"
|
||||
OUT_FILE_PATH="./out.txt"
|
||||
vowels="aeiouy"
|
||||
|
||||
while read -r line; do
|
||||
prev=""
|
||||
prevprev=""
|
||||
|
||||
for (( i=0; i<${#line}; i++ )); do
|
||||
printf "previous_5,previous_4,previous_3,previous_2,previous_1,current,is_start,previous_type,word_length\n" > "$OUT_FILE_PATH"
|
||||
|
||||
while read -r line; do
|
||||
prev_5=""
|
||||
prev_4=""
|
||||
prev_3=""
|
||||
prev_2=""
|
||||
prev_1=""
|
||||
|
||||
for (( i=0; i<"${#line}"; i++ )); do
|
||||
word_length="$((i + 1))"
|
||||
curr="${line:$i:1}"
|
||||
|
||||
# Convert all to lowercase
|
||||
curr_lower=$(echo "$curr" | tr 'A-Z' 'a-z')
|
||||
prev_lower=$(echo "$prev" | tr 'A-Z' 'a-z')
|
||||
prevprev_lower=$(echo "$prevprev" | tr 'A-Z' 'a-z')
|
||||
p1_lower=$(echo "$prev_1" | tr 'A-Z' 'a-z')
|
||||
p2_lower=$(echo "$prev_2" | tr 'A-Z' 'a-z')
|
||||
p3_lower=$(echo "$prev_3" | tr 'A-Z' 'a-z')
|
||||
p4_lower=$(echo "$prev_4" | tr 'A-Z' 'a-z')
|
||||
p5_lower=$(echo "$prev_5" | tr 'A-Z' 'a-z')
|
||||
|
||||
curr_val=$(printf "%d" "'$curr_lower")
|
||||
prev_val=0
|
||||
prevprev_val=0
|
||||
# Convert to ASCII values (default to 0 if empty)
|
||||
val_p5=0; [ -n "$p5_lower" ] && val_p5=$(printf "%d" "'$p5_lower")
|
||||
val_p4=0; [ -n "$p4_lower" ] && val_p4=$(printf "%d" "'$p4_lower")
|
||||
val_p3=0; [ -n "$p3_lower" ] && val_p3=$(printf "%d" "'$p3_lower")
|
||||
val_p2=0; [ -n "$p2_lower" ] && val_p2=$(printf "%d" "'$p2_lower")
|
||||
val_p1=0; [ -n "$p1_lower" ] && val_p1=$(printf "%d" "'$p1_lower")
|
||||
val_curr=$(printf "%d" "'$curr_lower")
|
||||
|
||||
if [ -n "$prev_lower" ]; then
|
||||
prev_val=$(printf "%d" "'$prev_lower")
|
||||
fi
|
||||
if [ -n "$prevprev_lower" ]; then
|
||||
prevprev_val=$(printf "%d" "'$prevprev_lower")
|
||||
# Determine if this is the start of the word
|
||||
is_start=0
|
||||
[ "$i" -eq 0 ] && is_start=1
|
||||
|
||||
# Determine if prev_1 is vowel or consonant
|
||||
if [[ "$p1_lower" =~ ^[a-z]$ ]]; then
|
||||
if [[ "$vowels" == *"$p1_lower"* ]]; then
|
||||
prev_type="1"
|
||||
else
|
||||
prev_type="2"
|
||||
fi
|
||||
else
|
||||
prev_type="0"
|
||||
fi
|
||||
|
||||
printf "%d-%d-%d " "$prevprev_val" "$prev_val" "$curr_val"
|
||||
# Output CSV line
|
||||
printf "%d,%d,%d,%d,%d,%d,%d,%d,%d\n" \
|
||||
"$val_p5" "$val_p4" "$val_p3" "$val_p2" "$val_p1" "$val_curr" \
|
||||
"$is_start" "$prev_type" "$word_length" \
|
||||
>> "$OUT_FILE_PATH"
|
||||
|
||||
prevprev="$prev"
|
||||
prev="$curr"
|
||||
# Shift history
|
||||
prev_5="$prev_4"
|
||||
prev_4="$prev_3"
|
||||
prev_3="$prev_2"
|
||||
prev_2="$prev_1"
|
||||
prev_1="$curr"
|
||||
done
|
||||
echo
|
||||
done < "$FILE_PATH"
|
||||
done < "$FILE_PATH"
|
||||
|
Loading…
x
Reference in New Issue
Block a user