In [None]:
%autosave 0
from __future__ import absolute_import, division, print_function

%pylab inline

from matplotlib.colors import ListedColormap
from sklearn import datasets
from scipy import linalg
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Exercise 13

### Load samples from iris

In [None]:
# Load the iris data set and keep a two-dimensional view of it for plotting.
iris = datasets.load_iris()
# X: first two of the four feature columns; y: class labels
X, y = iris.data[:, :2], iris.target

In [None]:
# Summarise the label vector: overall size first, then one line per class.
classes = set(y)
n_samples, n_classes = len(y), len(classes)
print('Data set with {0} samples and {1} classes.'.format(n_samples, n_classes))
for cl in classes:
    # a boolean mask summed up counts the samples carrying label `cl`
    n = int((y == cl).sum())
    print('Class {0} occurs {1} times.'.format(cl, n))

We create a data grid for the classification.

In [None]:
# Build a dense grid of points covering the samples; the classifiers are
# later evaluated on every grid point to colour their decision regions.
stepsize = 0.005

# bounding box of the samples, padded by one unit on each side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

# one axis per feature; the upper bound is extended by one step
grid_x = np.arange(x_min, x_max + stepsize, stepsize)
grid_y = np.arange(y_min, y_max + stepsize, stepsize)
xx, yy = np.meshgrid(grid_x, grid_y)

# flatten the grid into an (n_points, 2) array of coordinate pairs
data = np.column_stack((xx.ravel(), yy.ravel()))

### Remove samples with the same $x_1$ and $x_2$ coordinate
Sometimes two points have the same $x_1$ and $x_2$ coordinates. We want to get rid of such duplicates by dropping them.

In [None]:
# Drop samples whose (x1, x2) coordinates coincide after rounding to one
# decimal place. The first occurrence of each coordinate pair (and its label)
# is kept, so the original sample order is preserved.
seen = set()  # coordinate pairs already kept; gives O(1) membership tests
X2 = []
y2 = []
for x1, x2, cl in zip(X[:, 0], X[:, 1], y):
    # int(v * 10 + .5) / 10 rounds a non-negative value to one decimal place,
    # so floating-point noise cannot hide a duplicate
    x1 = int(x1 * 10 + .5) / 10
    x2 = int(x2 * 10 + .5) / 10
    key = (x1, x2)
    if key not in seen:
        seen.add(key)
        X2.append([x1, x2])
        y2.append(cl)
X = np.array(X2)
y = np.array(y2)

# Report the class distribution of the de-duplicated data set.
classes = set(y)
print('Data set with {0} samples and {1} classes.'.format(len(y), len(classes)))
for cl in classes:
    n = np.count_nonzero(y == cl)
    print('Class {0} occurs {1} times.'.format(cl, n))

# Classification

The two steps for a classification process are always:

1) Training on the data (classifier.fit(Trainingdata, Traininglabels)).

2) Prediction (classifier.predict(sample)) -> returns the most likely class label
   
The previous code stored the training data in $X$ and the labels in $y$.
We want to create an output that can be plotted later, so we predict on the grid points stored in `data`:

-> clf.predict(data)


This will mark the regions of the output images with a class label.

## Logistic Regression

Implement a Logistic regression classifier. Use the already imported modules.

You can check the accuracy of the prediction by using `metrics.classification_report(labels, clf.predict(trainingsamples))`

The results of this table are explained here:
https://en.wikipedia.org/wiki/F1_score

In [None]:
# TODO(exercise): create the logistic regression classifier
LogClf = 

# training step: fit the classifier on the samples X and the labels y


# make prediction on the grid points in `data` (reshaped to xx.shape when plotted)
predLog = 

# print metrics for the predictions on the training samples


## Naive Bayes

Create a Naive Bayes (GaussianNB) classifier and test it

In [None]:
# TODO(exercise): create the Gaussian naive Bayes classifier
gaussianClf = 

# training step: fit the classifier on the samples X and the labels y

# make prediction on the grid points in `data` (reshaped to xx.shape when plotted)
predGauss = 

# print metrics for the predictions on the training samples


## Visual Output

In [None]:
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])


In [None]:
# Show the predicted class regions of logistic regression and naive Bayes
# next to each other, with the training samples drawn on top.
# The figure size is passed explicitly instead of going through the pylab
# `figsize` helper, so no global plotting defaults are changed.
plt.figure(figsize=(24, 6))

panels = [
    ("Logistic Regression", predLog),
    ("Naive Bayes", predGauss),
]
for position, (title, prediction) in enumerate(panels, start=1):
    # the layout has three columns; only the first two are used here
    plt.subplot(1, 3, position)
    plt.pcolormesh(xx, yy, prediction.reshape(xx.shape), cmap=cmap_light)
    # plot the training samples, too
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(4, 8)
    plt.ylim(1.75, 4.5)
    plt.title(title)

## Decision Tree Classifier

Try a decision tree to increase the accuracy

In [None]:
# TODO(exercise): create the decision tree classifier
DtClf = 

# training: fit the classifier on the samples X and the labels y

# prediction on the grid points in `data` (reshaped to xx.shape when plotted)
predDt = 

# metrics for the predictions on the training samples

## $k$ Nearest Neighbors Classifier ($k$NN)

Implement the k-nearest neighbor classifier with uniform weights and 1 neighbor.

In [None]:
# TODO(exercise): create the k-nearest-neighbor classifier (uniform weights, 1 neighbor)
kNNclf = 

# training step: fit the classifier on the samples X and the labels y

# prediction on the grid points in `data` (reshaped to xx.shape when plotted)
Z = 


# metrics for the predictions on the training samples

In [None]:
# Show the predicted class regions of the nearest-neighbor classifier and the
# decision tree next to each other, with the training samples drawn on top.
# The figure size is passed explicitly instead of going through the pylab
# `figsize` helper, so no global plotting defaults are changed.
plt.figure(figsize=(14, 7))

# (title, prediction on the grid, x-axis limits, y-axis limits)
panels = [
    ("NN 3-class classification", Z, (x_min, x_max), (y_min, y_max)),
    ("Decision Tree", predDt, (4, 8), (1.75, 4.5)),
]
for position, (title, prediction, x_limits, y_limits) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.pcolormesh(xx, yy, prediction.reshape(xx.shape), cmap=cmap_light)
    # plot the training samples, too
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(*x_limits)
    plt.ylim(*y_limits)
    plt.title(title)

Using a small $k$ makes the classification sensitive to noise. If $k$ is too large, each prediction averages over too many neighbors and we get many misclassifications!

## Visualization of the classification result for varying numbers of neighbors

Fill in the gaps so that the classifier can be created for a varying number of neighbors.

In [None]:
def kNN_classifier(k, X, y, data, shape):
    # Train a k-nearest-neighbor classifier and predict the class of each row of `data`.
    #   k     -- number of neighbors the classifier should use
    #   X, y  -- training samples and their labels
    #   data  -- points to classify (the caller passes the flattened plotting grid)
    #   shape -- shape the flat prediction vector is reshaped to (the caller passes xx.shape)
    # TODO(exercise): create the k nearest neighbor classifier
    clf =
    # TODO(exercise): train the classifier on X and y

    # return the prediction for `data`, reshaped to `shape`
    return clf.predict(data).reshape(shape)

In [None]:
# Plot the kNN class regions for several neighborhood sizes, two plots per row.
number_of_neighbors = [1, 5, 10, 15, 40]
# number of rows: half the number of plots, rounded up (integer arithmetic only)
n = (len(number_of_neighbors) + 1) // 2
# The figure size is passed explicitly instead of going through the pylab
# `figsize` helper, so no global plotting defaults are changed.
plt.figure(figsize=(15, 7 * n))
for i, k in enumerate(number_of_neighbors):
    plt.subplot(n, 2, i + 1)
    # separate name so the prediction `Z` of the earlier kNN cell is not overwritten
    Z_k = kNN_classifier(k, X, y, data, xx.shape)
    plt.pcolormesh(xx, yy, Z_k, cmap=cmap_light)

    # plot the training samples, too
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.title("3-class classification (k = {0})".format(k))