Anomaly Detection

Anomaly detection basically means finding the existing pattern in the data and then identifying the outliers.

In [1]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
import math
import scipy.io
import matplotlib.pyplot as plt

def gaussian(v, mu, var):
    """Evaluate the univariate normal density at ``v``.

    Computes ``exp(-(v - mu)^2 / (2 * var)) / sqrt(2 * pi * var)``.  All the
    arithmetic goes through NumPy, so array arguments are combined
    element-wise under the usual broadcasting rules.

    v   -- point(s) at which the density is evaluated
    mu  -- mean of the distribution
    var -- variance (not the standard deviation) of the distribution
    """
    deviation = v - mu
    unnormalised = np.exp(-np.square(deviation) / (2 * var))
    normaliser = np.sqrt(2 * np.pi * var)
    return unnormalised / normaliser

# Load the exercise data set; the path is relative, so the .mat file has to
# be reachable from the kernel's working directory.
data = scipy.io.loadmat('ex8data1.mat')

# Fit one independent Gaussian per column of X: axis=0 reduces over the rows.
# np.var is called with its default ddof=0, i.e. the population variance.
mu = np.mean(data['X'], axis=0)
var = np.var(data['X'], axis=0)

# A parenthesised single-argument print produces the same text on Python 2
# and Python 3, whereas the bare `print a, b` statement is a SyntaxError on 3.
print('%s %s' % (mu, var))
[ 14.11222578  14.99771051] [ 1.83263141  1.70974533]

The next step is to find the $\epsilon$ with the best $F_1$ score. Be aware that _np.logical_and_ may silently give wrong results, rather than raise an error, if the axes of its arguments are not aligned appropriately.

In [3]:
# Density of every validation example under the fitted model: the product of
# the per-feature Gaussian densities of column 0 and column 1 of Xval.
z_val = np.multiply(
    gaussian(data['Xval'][:, 0], mu[0], var[0]),
    gaussian(data['Xval'][:, 1], mu[1], var[1]),
)

# Candidate thresholds: 1000 evenly spaced values spanning the observed
# range of validation densities (both end points included).
eps = np.linspace(z_val.min(), z_val.max(), num=1000)

# F1 score of each candidate threshold, filled in the same order as `eps`.
F1 = list()

def non_zero(v):
    """Lazily yield the 0-based positions of the truthy items of ``v``.

    ``v`` may be any iterable; positions are produced in iteration order and
    nothing is consumed until the generator is advanced.
    """
    position = 0
    for item in v:
        if item:
            yield position
        position += 1

# Ground-truth anomaly flags as a flat boolean vector, built once instead of
# on every pass through the loop.  Flattening matters: labels left 2-D would
# be broadcast against the 1-D `predicted` mask by np.logical_and rather than
# paired element-wise.
# NOTE(review): presumably yval is loaded as an (m, 1) column -- confirm.
is_anomaly = data['yval'].ravel().astype(bool)
is_normal = np.logical_not(is_anomaly)

for ep in eps:
    # An example is flagged as anomalous when its density is below the threshold.
    predicted = z_val < ep

    # Confusion-matrix counts, kept as floats so the ratios below are true
    # divisions on Python 2 as well.
    tp = float(np.sum(np.logical_and(predicted, is_anomaly)))
    fp = float(np.sum(np.logical_and(predicted, is_normal)))
    fn = float(np.sum(np.logical_and(np.logical_not(predicted), is_anomaly)))

    # Each ratio falls back to 0 when its denominator is empty, so a
    # threshold that flags nothing scores 0 instead of dividing by zero.
    prec = 0 if (tp + fp == 0) else tp / (tp + fp)
    rec = 0 if (tp + fn == 0) else tp / (tp + fn)
    f1 = 0 if prec + rec == 0 else 2 * prec * rec / (prec + rec)
    F1.append(f1)

# Keep the threshold with the highest F1; list.index returns the first
# position on ties.
index = F1.index(max(F1))
ep = eps[index]

# Parenthesised single-argument print: identical output on Python 2 and 3.
print('ep = %g' % ep)

# Evaluate the fitted density on a 100 x 100 grid covering [0, 25] x [0, 30].
xv, yv = np.meshgrid(np.linspace(0, 25, num=100), np.linspace(0, 30, num=100))
z = np.multiply(gaussian(xv, mu[0], var[0]), gaussian(yv, mu[1], var[1]))

# Training points, semi-transparent so that dense regions show up darker.
plt.scatter(x=data['X'][:, 0], y=data['X'][:, 1], alpha=0.3);

# A single red contour at density == ep: the boundary between points the
# chosen threshold treats as normal and those it flags as anomalous.
cs = plt.contour(xv, yv, z, levels=[ep], colors='r')
plt.clabel(cs, inline=True, fontsize=10, fmt='%.2e');
ep = 8.99985e-05