#!/usr/bin/python3
# Example of a possible reference solution: one-vs-rest perceptron
# recognizers per class, combined with a naive Bayes layer over their outputs.

import sys
import random


def load_dataset():
    """Load the iris dataset from 'iris.csv'.

    Returns a list of (features, label) pairs, where features is a tuple of
    floats and label is the species name with trailing whitespace stripped.
    """
    dataset = []
    with open('iris.csv') as f:
        for line in f:
            fields = line.split(',')
            # Skip the column-header row.
            if fields[0] == 'sepal_length':
                continue
            dataset.append((tuple(map(float, fields[:-1])), fields[-1].strip()))
    return dataset


def augment(t, x):
    """Return tuple t with x appended (folds the bias into the weight vector)."""
    return tuple(t) + (x,)


class Perceptron:
    """A binary linear classifier: sign(w . x + b), mapped to {1, -1}."""

    def __init__(self, weights, bias):
        self.weights = weights
        self.bias = bias

    def predict(self, x):
        """Return 1 when x lies on the positive side of the hyperplane, else -1."""
        assert len(x) == len(self.weights)
        activation = sum(w * xi for w, xi in zip(self.weights, x)) + self.bias
        return 1 if activation > 0 else -1


def invert_tuple(t):
    """Negate every component of t."""
    return tuple(-x for x in t)


def train_perceptron(positive, negative):
    """Train a perceptron separating positive from negative samples.

    Every sample is augmented with a constant 1 so the bias is learned as an
    ordinary weight, and the negative samples are negated so that every
    training sample should satisfy w . sample > 0.
    """
    pos = [augment(t, 1) for t in positive]
    neg = [invert_tuple(augment(t, 1)) for t in negative]
    samples = pos + neg
    p = len(samples[0])
    parameters = [0] * p
    # Learning rate.
    gamma = 0.1
    # Cap the number of epochs: the classes need not be linearly separable,
    # so the loop may never converge on its own.
    for _ in range(10000):
        misclassified = 0
        for sample in samples:
            activation = sum(parameters[k] * sample[k] for k in range(p))
            if activation > 0:
                # Classified OK.
                continue
            # Misclassified: nudge the weights towards this sample.
            misclassified += 1
            for k in range(p):
                parameters[k] += sample[k] * gamma
        if misclassified == 0:
            break
    return Perceptron(parameters[:-1], parameters[-1])


def train_recognizer(dataset, category):
    """Train a one-vs-rest perceptron for `category` and report its errors.

    Bug fix: the original swapped the two counters. A positive sample the
    recognizer rejects is a false NEGATIVE (a missed positive); a negative
    sample the recognizer accepts is a false POSITIVE.
    """
    positive = [x for x, y in dataset if y == category]
    negative = [x for x, y in dataset if y != category]
    recognizer = train_perceptron(positive, negative)
    false_positives = 0
    false_negatives = 0
    for x in positive:
        # A positive sample rejected by the recognizer: missed positive.
        if recognizer.predict(x) != 1:
            false_negatives += 1
    for x in negative:
        # A negative sample accepted by the recognizer: spurious positive.
        if recognizer.predict(x) != -1:
            false_positives += 1
    print("%s: %d false positives, %d false negatives" % (
        category, false_positives, false_negatives
    ))
    return recognizer


def split_dataset(dataset):
    """Shuffle `dataset` IN PLACE and split it into three parts.

    Returns (class_train, dist_train, test): 20% for training the perceptron
    recognizers, 20% for estimating the naive-Bayes counts, and the rest for
    testing.
    """
    random.shuffle(dataset)
    n_class_train = int(len(dataset) * 0.2)
    n_dist_train = int(len(dataset) * 0.2)
    class_train = dataset[:n_class_train]
    dist_train = dataset[n_class_train:n_class_train + n_dist_train]
    test = dataset[n_class_train + n_dist_train:]
    return (class_train, dist_train, test)


dataset = load_dataset()
classifier_train, distribution_train, test = split_dataset(dataset)
classes = ['setosa', 'versicolor', 'virginica']
recognizers = {kind: train_recognizer(classifier_train, kind)
               for kind in classes}

# Use Bayes' rule with the simplifying naive-Bayes assumption over the
# behaviour of the individual recognizers.
# Class priors estimated from the distribution-training split.
prior = {kind: sum(1 for _, y in distribution_train if y == kind) / len(distribution_train)
         for kind in classes}

# Contingency counts: counts[actual][predictor][signal].
# Every cell starts at 1 (add-one smoothing) so no conditional probability
# can collapse to zero.
counts = {actual: {predictor: {signal: 1 for signal in (1, -1)}
                   for predictor in classes}
          for actual in classes}
for features, actual in distribution_train:
    for predictor in classes:
        counts[actual][predictor][recognizers[predictor].predict(features)] += 1


def normalize(d):
    """Rescale the values of d so that they sum to 1."""
    total = sum(d.values())
    return {key: value / total for key, value in d.items()}


def get_distribution(x):
    """Posterior distribution over the classes for sample x, via naive Bayes."""
    signals = {kind: recognizers[kind].predict(x) for kind in classes}
    distribution = {}
    for actual in classes:
        probability = prior[actual]
        for predictor, signal in signals.items():
            cell = counts[actual][predictor]
            probability *= cell[signal] / sum(cell.values())
        distribution[actual] = probability
    return normalize(distribution)


print()
correct = 0
for x, y in test:
    distribution = get_distribution(x)
    prediction = max(distribution, key=distribution.get)
    print(x, y, prediction, distribution)
    if y == prediction:
        correct += 1

print("Training set size:", len(classifier_train) + len(distribution_train))
print("Test set size:", len(test))
print("Test precision: %.2f%%" % (100 * correct / len(test)))