#!/usr/bin/env python3
import argparse
import lzma
import os
import pickle
import sys
import urllib.request

import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
import sklearn.neural_network
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.model_selection import GridSearchCV


class Dataset:
    """Spam database dataset.

    The dataset contains the following information:
    - occurrence frequencies of 48 selected words in the e-mails (words:
      'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
      'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
      'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000',
      'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
      'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
      'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
      'conference')
    - occurrence frequencies of 6 selected characters in the e-mails
      (characters: ';', '(', '[', '!', '$', '#')
    - average length of a contiguous run of capital letters
    - longest contiguous run of capital letters
    - total number of capital letters

    The goal is to predict whether an e-mail is spam or not.
    """
    def __init__(self,
                 name="dataset36-5-S.npz",
                 url="https://ksp.mff.cuni.cz/h/ulohy/36/36-5-S/competition-datasets/"):
        if not os.path.exists(name):
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve(url + name, filename=name)

        # Load the serialized dataset.
        dataset = np.load(name)
        self.train_data = dataset['train_data']
        self.test_data = dataset['test_data']
        self.train_target = dataset['train_target']
        # Beware: the test targets are hidden -- the array is all -1s.
        self.test_target = dataset['test_target']


parser = argparse.ArgumentParser()
parser.add_argument("--seed", default=42, type=int, help="Random seed")
parser.add_argument("--first_test", default=False, action="store_true",
                    help="Run first test")
parser.add_argument("--random_forest_grid", default=False, action="store_true",
                    help="Run random forest grid search")
parser.add_argument("--gradient_boosting_grid", default=False, action="store_true",
                    help="Run gradient boosting grid search")


def main(args):
    # Set the random seed.
    np.random.seed(args.seed)

    # Load the dataset.
    dataset = Dataset()
    train_data = dataset.train_data
    test_data = dataset.test_data
    train_target = dataset.train_target
    test_target = dataset.test_target

    def eval_model(str_name, model):
        print(str_name)
        predict_train = model.predict(train_data)
        # The test targets are hidden (all -1s), so only train accuracy
        # can be reported here.
        # predict_test = model.predict(test_data)
        print(f"Train: {sklearn.metrics.accuracy_score(train_target, predict_train)}")

    if args.first_test:
        logistic_regression = sklearn.linear_model.LogisticRegression(
            penalty="l2", solver="liblinear")
        logistic_regression.fit(train_data, train_target)
        eval_model("LR", logistic_regression)

        nn = sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(100, 50))
        nn.fit(train_data, train_target)
        eval_model("NN", nn)

        random_forest = sklearn.ensemble.RandomForestClassifier(n_estimators=2000)
        random_forest.fit(train_data, train_target)
        eval_model("Random forest", random_forest)

        gradient_boosting = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000)
        gradient_boosting.fit(train_data, train_target)
        eval_model("Grad boosting", gradient_boosting)

        knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)
        knn.fit(train_data, train_target)
        eval_model("KNN", knn)

    pipeline = sklearn.pipeline.Pipeline([
        # ("poly", sklearn.preprocessing.PolynomialFeatures(2)),
        ("scaler", sklearn.preprocessing.RobustScaler()),
    ])
    train_data = pipeline.fit_transform(train_data)
    test_data = pipeline.transform(test_data)
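
    # Aside (a sketch, not in the original script): the comments below note
    # that the large ensembles are indistinguishable by train accuracy, so a
    # 5-fold cross-validation score on the scaled training data is a more
    # honest way to compare them. `cross_val_report` is a helper introduced
    # here purely for illustration.
    def cross_val_report(str_name, model):
        scores = sklearn.model_selection.cross_val_score(
            model, train_data, train_target, cv=5, n_jobs=-1)
        print(f"{str_name}: cross-val {scores.mean() * 100:.1f}% "
              f"(+/- {scores.std() * 100:.1f}%)")
    # Example usage:
    # cross_val_report("RF", sklearn.ensemble.RandomForestClassifier(n_estimators=1000))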
    if args.random_forest_grid:
        random_forest = sklearn.ensemble.RandomForestClassifier(n_estimators=3000)
        pipeline = sklearn.pipeline.Pipeline([
            ('clf', random_forest),
        ])
        pipeline = GridSearchCV(pipeline, cv=5, n_jobs=8, verbose=1, param_grid={
            'clf__n_estimators': [100, 1000, 2000, 3000],
            'clf__criterion': ['gini', 'entropy', 'log_loss'],
            'clf__min_samples_split': [2, 10, 40],
            'clf__max_depth': [None, 4, 8],
        })
        # Fit on the scaled matrix so eval_model's predictions use the same
        # features the model was trained on.
        pipeline.fit(train_data, train_target)
        for idx, param in enumerate(pipeline.cv_results_['params']):
            print(
                'Rank', pipeline.cv_results_['rank_test_score'][idx],
                f"Cross-val {pipeline.cv_results_['mean_test_score'][idx] * 100:.1f}%",
                param
            )
        print('=========')
        print(pipeline.best_params_)
        eval_model("Random forest", pipeline)

    if args.gradient_boosting_grid:
        gradient_boosting = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000)
        pipeline = sklearn.pipeline.Pipeline([
            ('clf', gradient_boosting),
        ])
        pipeline = GridSearchCV(pipeline, cv=5, n_jobs=16, verbose=1, param_grid={
            'clf__n_estimators': [1000, 2000, 3000],
            # 'deviance' was an alias of 'log_loss' and was removed in recent
            # scikit-learn, so only the two distinct losses remain.
            'clf__loss': ['log_loss', 'exponential'],
            # 'clf__min_samples_split': [2, 10, 40],
            # 'clf__max_leaf_nodes': [None, 16, 32, 64],
            'clf__subsample': [0.75, 1.0],
            'clf__max_features': ['sqrt', 0.75, None],
            'clf__max_depth': [3, 4, 5],
        })
        pipeline.fit(train_data, train_target)
        for idx, param in enumerate(pipeline.cv_results_['params']):
            print(
                'Rank', pipeline.cv_results_['rank_test_score'][idx],
                f"Cross-val {pipeline.cv_results_['mean_test_score'][idx] * 100:.1f}%",
                param
            )
        print('=========')
        print(pipeline.best_params_)
        eval_model("Grad", pipeline)

    # Best models
    # Best parameters: 'entropy', 100 estimators (though 1000 always scores
    # better in cross-validation); indistinguishable by train accuracy.
    # 95.9% on the test set.
    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=1000, criterion="entropy")
    random_forest.fit(train_data, train_target)
    eval_model("Final random forest", random_forest)

    gradient_boosting = sklearn.ensemble.GradientBoostingClassifier(
        n_estimators=1000, loss='exponential', max_features='sqrt',
        max_depth=5, subsample=0.75)
    gradient_boosting.fit(train_data, train_target)
    eval_model("Final grad boosting", gradient_boosting)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
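
# A sketch, not part of the original run: `lzma` and `pickle` are imported
# above but never used, which suggests model serialization was intended.
# One plausible helper (the name and default path are made up here):
def save_model(model, path="model.pkl.xz"):
    """Serialize a fitted model with pickle into an LZMA-compressed file."""
    with lzma.open(path, "wb") as model_file:
        pickle.dump(model, model_file)
# It could be called at the end of main(), e.g.
# save_model(gradient_boosting).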