#!/usr/bin/env python3
import argparse
import lzma
import os
import pickle
import sys
import urllib.request

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    PolynomialFeatures,
    RobustScaler,
    SplineTransformer,
    StandardScaler,  # fixed: was imported from sklearn.discriminant_analysis (non-canonical re-export)
)


class Dataset:
    """Patient dataset for the KSP 36-3-S competition.

    The dataset contains, in this order:
    - 15 binary attributes
    - 6 numeric attributes

    The goal is to predict whether a patient has a thyroid
    disorder (1) or not (2).
    """

    def __init__(self,
                 name="dataset36-3-S.npz",
                 url="https://ksp.mff.cuni.cz/h/ulohy/36/36-3-S/competition-datasets/"):
        # Download the dataset on first use; progress goes to stderr so it
        # does not mix with predictions/metrics printed to stdout.
        if not os.path.exists(name):
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve(url + name, filename=name)

        # Load the serialized dataset (allow_pickle: arrays may contain objects).
        dataset = np.load(name, allow_pickle=True)
        self.train_data = dataset['train_data']
        self.test_data = dataset['test_data']
        self.train_target = dataset['train_target']
        # NOTE(review): the original Czech comment warned that one of the
        # target arrays is a vector of -1s (i.e. hidden labels); its exact
        # placement was ambiguous — presumably it refers to test_target,
        # whose labels are withheld in the competition. TODO confirm.
        self.test_target = dataset['test_target']


parser = argparse.ArgumentParser()
parser.add_argument("--seed", default=42, type=int, help="Random seed")


def main(args):
    """Train the pipeline and write test-set predictions to a file.

    Prints training-set accuracy to stdout and writes one prediction per
    line to "36-3-S-prediction.txt".
    """
    # Seed NumPy's global RNG for reproducibility.
    np.random.seed(args.seed)

    dataset = Dataset()

    # Preprocessing + model:
    #  - one-hot encode the 15 binary columns (handle_unknown='ignore' so
    #    unseen test categories do not raise),
    #  - expand the 6 numeric columns with cubic splines on quantile knots,
    #  - take degree-2 polynomial interactions of all features,
    #  - fit a logistic-regression classifier.
    # The spline hyperparameters were presumably tuned earlier with
    # GridSearchCV over n_knots/degree/knots (removed dead experiment code).
    pipeline = Pipeline([
        ("column_transformer", ColumnTransformer([
            ("one_hot", OneHotEncoder(handle_unknown='ignore'), list(range(15))),
            ("spline",
             SplineTransformer(n_knots=30, degree=3, knots='quantile'),
             list(range(15, 21))),
        ], remainder="passthrough")),
        ("polynomial_features", PolynomialFeatures(degree=2, include_bias=False)),
        ("clf", LogisticRegression()),
    ])

    pipeline.fit(dataset.train_data, dataset.train_target)

    train_pred = pipeline.predict(dataset.train_data)
    test_pred = pipeline.predict(dataset.test_data)
    print(accuracy_score(dataset.train_target, train_pred))

    # To persist the model (or the whole pipeline):
    #   with lzma.open("competition.model", "wb") as model_file:
    #       pickle.dump(model, model_file)
    # And to load it back:
    #   with lzma.open("competition.model", "rb") as model_file:
    #       model = pickle.load(model_file)

    # One prediction per line, in test-set order.
    with open("36-3-S-prediction.txt", "w") as prediction_file:
        for p in test_pred:
            print(p, file=prediction_file)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)