#!/usr/bin/python3
"""K-nearest-neighbours regression on the wine-quality dataset.

Loads ``wine.csv`` (comma-separated, quality in the last column), min-max
normalizes the features, splits the data 60/20/20 into train/validation/test,
evaluates K = 1..40 in parallel on the validation set, and reports the test
error of the best K.
"""
import functools
import multiprocessing
import random


def load_dataset():
    """Read ``wine.csv`` and return a list of ``(features, quality)`` pairs.

    ``features`` is a list of floats; ``quality`` (the last CSV column) is an
    int.  The header row (first cell ``"fixed acidity"``) is skipped.
    """
    dataset = []
    with open('wine.csv') as f:
        for line in f:
            fields = line.split(',')
            # Skip the column-header row.
            if fields[0] == '"fixed acidity"':
                continue
            data = list(map(float, fields))
            dataset.append((data[:-1], int(data[-1])))
    return dataset


def make_normalizer(minimum, maximum):
    """Return a function mapping ``[minimum, maximum]`` linearly onto [0, 1].

    A constant feature (``minimum == maximum``) maps everything to 0.0
    instead of dividing by zero.
    """
    if maximum == minimum:
        return lambda x: 0.0
    return lambda x: (x - minimum) / (maximum - minimum)


def normalize(dataset):
    """Min-max normalize every feature column of ``dataset`` in place."""
    normalizers = []
    for i in range(len(dataset[0][0])):
        values = [x[i] for x, _ in dataset]
        normalizers.append(make_normalizer(min(values), max(values)))
    for j, (features, quality) in enumerate(dataset):
        dataset[j] = ([norm(v) for norm, v in zip(normalizers, features)],
                      quality)


def split_dataset(dataset):
    """Shuffle ``dataset`` in place and split it into (train, valid, test).

    The split is 60% / 20% / 20% (the original comment claiming thirds was
    wrong — the code below is what actually runs).
    """
    random.shuffle(dataset)
    n_train = int(len(dataset) * 0.6)
    n_valid = int(len(dataset) * 0.2)
    train = dataset[:n_train]
    valid = dataset[n_train:n_train + n_valid]
    test = dataset[n_train + n_valid:]
    return (train, valid, test)


def euclid_distance(a, b):
    """Squared Euclidean distance (sqrt omitted; neighbour ranking is unchanged)."""
    return sum((ai - bi) ** 2 for ai, bi in zip(a, b))


class KNNModel:
    """K-nearest-neighbours regressor rounded to the nearest integer quality."""

    def __init__(self, train, k, distance):
        self.train = train        # list of (features, quality) pairs
        self.k = k                # number of neighbours to average
        self.distance = distance  # callable(features_a, features_b) -> float

    def get_neighbors(self, x):
        """Return the ``k`` training samples closest to ``x``."""
        sample_distance = lambda sample: self.distance(x, sample[0])
        return sorted(self.train, key=sample_distance)[:self.k]

    def predict(self, x):
        """Predict the quality of ``x`` as the rounded mean of its neighbours."""
        qualities = [quality for _, quality in self.get_neighbors(x)]
        return int(round(sum(qualities) / len(qualities)))


def get_error(model, dataset):
    """Mean squared error of ``model``'s predictions over ``dataset``."""
    errors = 0
    for x, y in dataset:
        errors += (model.predict(x) - y) ** 2
    return errors / len(dataset)


def get_k_results(k, train, valid):
    """Fit a K-NN model for one ``k``; return (k, train MSE, valid MSE)."""
    model = KNNModel(train, k, euclid_distance)
    train_error = get_error(model, train)
    valid_error = get_error(model, valid)
    print(k, "done")
    return (k, train_error, valid_error)


def main():
    dataset = load_dataset()
    normalize(dataset)
    train, valid, test = split_dataset(dataset)

    # Optimisation: evaluate the candidate K values in parallel.  The data is
    # bound via functools.partial so workers receive it explicitly and the
    # script also works under the "spawn" start method (which requires the
    # __main__ guard below).  Serial alternative:
    #     results = [get_k_results(k, train, valid) for k in range(1, 41)]
    worker = functools.partial(get_k_results, train=train, valid=valid)
    with multiprocessing.Pool() as pool:
        results = pool.map(worker, range(1, 41))

    best_k = None
    best_valid_error = None
    for k, train_error, valid_error in results:
        print(k, ": train: %.4f valid: %.4f" % (train_error, valid_error))
        if best_valid_error is None or best_valid_error > valid_error:
            best_k = k
            best_valid_error = valid_error

    print("Nejlepší K:", best_k)
    # Bug fix: the final model must use best_k; the original used the stale
    # loop variable k (always 40 after the loop above).
    model = KNNModel(train, best_k, euclid_distance)
    test_error = get_error(model, test)
    print("Chyba na testovací množině (odhad na skut. chybu):", test_error)


if __name__ == "__main__":
    main()