Wednesday, June 8, 2016

Python ML 12 - Training Artificial Neural Networks for Image Recognition

-- Modeling complex functions with artificial neural networks

Single-layer neural network recap

Introducing the multi-layer neural network architecture

Activating a neural network via forward propagation
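
A minimal NumPy sketch of forward propagation through one hidden layer (the toy shapes, variable names, and the standalone sigmoid helper are only for illustration; they mirror the _feedforward method implemented further down):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# toy shapes: 2 samples, 4 features, 3 hidden units, 2 output units
X = np.random.random((2, 4))
w1 = np.random.uniform(-1.0, 1.0, size=(3, 4 + 1))  # hidden-layer weights (+1 bias column)
w2 = np.random.uniform(-1.0, 1.0, size=(2, 3 + 1))  # output-layer weights (+1 bias column)

a1 = np.hstack((np.ones((X.shape[0], 1)), X))    # input activations with bias unit, shape (2, 5)
z2 = w1.dot(a1.T)                                # net input of the hidden layer, shape (3, 2)
a2 = sigmoid(z2)                                 # hidden activations
a2 = np.vstack((np.ones((1, a2.shape[1])), a2))  # add bias row, shape (4, 2)
z3 = w2.dot(a2)                                  # net input of the output layer, shape (2, 2)
a3 = sigmoid(z3)                                 # output activations, one column per sample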


-- Classifying handwritten digits

Obtaining the MNIST dataset

gzip *ubyte.gz -d


import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels-idx1-ubyte'
                                % kind)
    images_path = os.path.join(path,
                               '%s-images-idx3-ubyte'
                               % kind)
       
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II',
                                 lbpath.read(8))
        labels = np.fromfile(lbpath,
                             dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack(">IIII",
                                               imgpath.read(16))
        images = np.fromfile(imgpath,
                    dtype=np.uint8).reshape(len(labels), 784)

    return images, labels


X_train, y_train = load_mnist('mnist', kind='train')
print('Rows: %d, columns: %d'
        % (X_train.shape[0], X_train.shape[1]))


X_test, y_test = load_mnist('mnist', kind='t10k')
print('Rows: %d, columns: %d'
        % (X_test.shape[0], X_test.shape[1]))

import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows=2, ncols=5, sharex=True, sharey=True,)
ax = ax.flatten()
for i in range(10):
    img = X_train[y_train == i][0].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys', interpolation='nearest')
ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()

fig, ax = plt.subplots(nrows=5,
                        ncols=5,
                        sharex=True,
                        sharey=True,)
ax = ax.flatten()
for i in range(25):
     img = X_train[y_train == 7][i].reshape(28, 28)
     ax[i].imshow(img, cmap='Greys', interpolation='nearest')
ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()

np.savetxt('train_img.csv', X_train,
            fmt='%i', delimiter=',')
np.savetxt('train_labels.csv', y_train,
            fmt='%i', delimiter=',')
np.savetxt('test_img.csv', X_test,
            fmt='%i', delimiter=',')
np.savetxt('test_labels.csv', y_test,
            fmt='%i', delimiter=',')
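
If the CSV copies are needed later, they can be read back with np.genfromtxt (note that this is considerably slower than loading the original ubyte files):

X_train = np.genfromtxt('train_img.csv', dtype=int, delimiter=',')
y_train = np.genfromtxt('train_labels.csv', dtype=int, delimiter=',')
X_test = np.genfromtxt('test_img.csv', dtype=int, delimiter=',')
y_test = np.genfromtxt('test_labels.csv', dtype=int, delimiter=',')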

Implementing a multi-layer perceptron

from neuralnet import NeuralNetMLP  # works if the class below is saved to a neuralnet.py script

import numpy as np
from scipy.special import expit
import sys

class NeuralNetMLP(object):
    def __init__(self, n_output, n_features, n_hidden=30,
                 l1=0.0, l2=0.0, epochs=500, eta=0.001,
                 alpha=0.0, decrease_const=0.0, shuffle=True,
                 minibatches=1, random_state=None):
        np.random.seed(random_state)
        self.n_output = n_output
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.w1, self.w2 = self._initialize_weights()
        self.l1 = l1
        self.l2 = l2
        self.epochs = epochs
        self.eta = eta
        self.alpha = alpha
        self.decrease_const = decrease_const
        self.shuffle = shuffle
        self.minibatches = minibatches

    def _encode_labels(self, y, k):
        onehot = np.zeros((k, y.shape[0]))
        for idx, val in enumerate(y):
            onehot[val, idx] = 1.0
        return onehot

    def _initialize_weights(self):
        w1 = np.random.uniform(-1.0, 1.0,      
                     size=self.n_hidden*(self.n_features + 1))
        w1 = w1.reshape(self.n_hidden, self.n_features + 1)
        w2 = np.random.uniform(-1.0, 1.0,
                     size=self.n_output*(self.n_hidden + 1))
        w2 = w2.reshape(self.n_output, self.n_hidden + 1)
        return w1, w2

    def _sigmoid(self, z):
        # expit is equivalent to 1.0/(1.0 + np.exp(-z))
        return expit(z)

    def _sigmoid_gradient(self, z):
        sg = self._sigmoid(z)
        return sg * (1 - sg)

    def _add_bias_unit(self, X, how='column'):
        if how == 'column':
            X_new = np.ones((X.shape[0], X.shape[1]+1))
            X_new[:, 1:] = X
        elif how == 'row':
            X_new = np.ones((X.shape[0]+1, X.shape[1]))
            X_new[1:, :] = X
        else:
            raise AttributeError('`how` must be `column` or `row`')
        return X_new

    def _feedforward(self, X, w1, w2):
        a1 = self._add_bias_unit(X, how='column')
        z2 = w1.dot(a1.T)
        a2 = self._sigmoid(z2)
        a2 = self._add_bias_unit(a2, how='row')
        z3 = w2.dot(a2)
        a3 = self._sigmoid(z3)
        return a1, z2, a2, z3, a3

    def _L2_reg(self, lambda_, w1, w2):
        return (lambda_/2.0) * (np.sum(w1[:, 1:] ** 2)\
                + np.sum(w2[:, 1:] ** 2))

    def _L1_reg(self, lambda_, w1, w2):
        return (lambda_/2.0) * (np.abs(w1[:, 1:]).sum()\
                + np.abs(w2[:, 1:]).sum())

    def _get_cost(self, y_enc, output, w1, w2):
        term1 = -y_enc * (np.log(output))
        term2 = (1 - y_enc) * np.log(1 - output)
        cost = np.sum(term1 - term2)
        L1_term = self._L1_reg(self.l1, w1, w2)
        L2_term = self._L2_reg(self.l2, w1, w2)
        cost = cost + L1_term + L2_term
        return cost

    def _get_gradient(self, a1, a2, a3, z2, y_enc, w1, w2):
        # backpropagation
        sigma3 = a3 - y_enc
        z2 = self._add_bias_unit(z2, how='row')
        sigma2 = w2.T.dot(sigma3) * self._sigmoid_gradient(z2)
        sigma2 = sigma2[1:, :]
        grad1 = sigma2.dot(a1)
        grad2 = sigma3.dot(a2.T)

        # regularize
        grad1[:, 1:] += (w1[:, 1:] * (self.l1 + self.l2))
        grad2[:, 1:] += (w2[:, 1:] * (self.l1 + self.l2))

        return grad1, grad2

    def predict(self, X):
        a1, z2, a2, z3, a3 = self._feedforward(X, self.w1, self.w2)
        y_pred = np.argmax(z3, axis=0)
        return y_pred

    def fit(self, X, y, print_progress=False):
        self.cost_ = []
        X_data, y_data = X.copy(), y.copy()
        y_enc = self._encode_labels(y, self.n_output)

        delta_w1_prev = np.zeros(self.w1.shape)
        delta_w2_prev = np.zeros(self.w2.shape)

        for i in range(self.epochs):

            # adaptive learning rate
            self.eta /= (1 + self.decrease_const*i)

            if print_progress:
                sys.stderr.write(
                        '\rEpoch: %d/%d' % (i+1, self.epochs))
                sys.stderr.flush()

            if self.shuffle:
                idx = np.random.permutation(y_data.shape[0])
                # shuffle the one-hot encoded labels together with the samples
                X_data, y_enc = X_data[idx], y_enc[:, idx]

            mini = np.array_split(range(
                         y_data.shape[0]), self.minibatches)
            for idx in mini:

                # feedforward
                a1, z2, a2, z3, a3 = self._feedforward(
                                     X_data[idx], self.w1, self.w2)
                cost = self._get_cost(y_enc=y_enc[:, idx],
                                      output=a3,
                                      w1=self.w1,
                                      w2=self.w2)
                self.cost_.append(cost)

                # compute gradient via backpropagation
                grad1, grad2 = self._get_gradient(a1=a1, a2=a2,
                                            a3=a3, z2=z2,
                                            y_enc=y_enc[:, idx],
                                            w1=self.w1,
                                            w2=self.w2)

                # update weights
                delta_w1, delta_w2 = self.eta * grad1,\
                                     self.eta * grad2
                self.w1 -= (delta_w1 + (self.alpha * delta_w1_prev))
                self.w2 -= (delta_w2 + (self.alpha * delta_w2_prev))
                delta_w1_prev, delta_w2_prev = delta_w1, delta_w2

        return self

nn = NeuralNetMLP(n_output=10,
                   n_features=X_train.shape[1],
                   n_hidden=50,
                   l2=0.1,
                   l1=0.0,
                   epochs=1000,
                   eta=0.001,
                   alpha=0.001,
                   decrease_const=0.00001,
                   shuffle=True,
                   minibatches=50,
                   random_state=1)

nn.fit(X_train, y_train, print_progress=True)


plt.plot(range(len(nn.cost_)), nn.cost_)
plt.ylim([0, 2000])
plt.ylabel('Cost')
plt.xlabel('Epochs * 50')
plt.tight_layout()
plt.show()

batches = np.array_split(range(len(nn.cost_)), 1000)
cost_ary = np.array(nn.cost_)
cost_avgs = [np.mean(cost_ary[i]) for i in batches]

plt.plot(range(len(cost_avgs)),
          cost_avgs,
          color='red')
plt.ylim([0, 2000])
plt.ylabel('Cost')
plt.xlabel('Epochs')
plt.tight_layout()
plt.show()

y_train_pred = nn.predict(X_train)
acc = np.sum(y_train == y_train_pred, axis=0) / X_train.shape[0]
print('Training accuracy: %.2f%%' % (acc * 100))

y_test_pred = nn.predict(X_test)
acc = np.sum(y_test == y_test_pred, axis=0) / X_test.shape[0]
print('Test accuracy: %.2f%%' % (acc * 100))

miscl_img = X_test[y_test != y_test_pred][:25]
correct_lab = y_test[y_test != y_test_pred][:25]
miscl_lab= y_test_pred[y_test != y_test_pred][:25]

fig, ax = plt.subplots(nrows=5,
                        ncols=5,
                        sharex=True,
                        sharey=True,)
ax = ax.flatten()
for i in range(25):
     img = miscl_img[i].reshape(28, 28)
     ax[i].imshow(img,
                  cmap='Greys',
                  interpolation='nearest')
     ax[i].set_title('%d) t: %d p: %d'
                     % (i+1, correct_lab[i], miscl_lab[i]))
ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()

-- Training an artificial neural network

Computing the logistic cost function
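
For reference, the cost minimized by _get_cost above is the regularized logistic (cross-entropy) cost summed over all n samples and t output units:

J(w) = -sum_i sum_k [ y_k(i) * log(a_k(i)) + (1 - y_k(i)) * log(1 - a_k(i)) ] + L1_term + L2_term

where L2_term = (lambda2 / 2) * (sum of squared non-bias weights in w1 and w2) and L1_term = (lambda1 / 2) * (sum of absolute non-bias weights), matching the _L2_reg and _L1_reg helpers.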

Training neural networks via backpropagation
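
The _get_gradient method implements the corresponding backpropagation steps: the output error sigma3 = a3 - y_enc, the hidden-layer error sigma2 = (w2^T sigma3) * phi'(z2) with the bias row discarded (phi'(z) = phi(z) * (1 - phi(z)) is the sigmoid gradient), and the gradients grad1 = sigma2 . a1 and grad2 = sigma3 . a2^T, to which the regularization terms are added for the non-bias weights.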

-- Developing your intuition for backpropagation

-- Debugging neural networks with gradient checking
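
The idea behind gradient checking, implemented in _gradient_checking below, is to compare each backpropagated gradient against a numerical approximation dJ/dw_ij ~ (J(w + epsilon) - J(w - epsilon)) / (2 * epsilon) and report the relative error ||grad_num - grad_bp|| / (||grad_num|| + ||grad_bp||); values around 1e-7 or smaller indicate that backpropagation is implemented correctly, which is exactly the threshold the fit method below prints as "Ok".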

def _gradient_checking(self, X, y_enc, w1,
                       w2, epsilon, grad1, grad2):
    """ Apply gradient checking (for debugging only)

    Returns
    ---------
    relative_error : float
      Relative error between the numerically
      approximated gradients and the backpropagated gradients.

    """
    num_grad1 = np.zeros(np.shape(w1))
    epsilon_ary1 = np.zeros(np.shape(w1))
    for i in range(w1.shape[0]):
        for j in range(w1.shape[1]):
            epsilon_ary1[i, j] = epsilon
            a1, z2, a2, z3, a3 = self._feedforward(
                                           X,
                                           w1 - epsilon_ary1,
                                           w2)
            cost1 = self._get_cost(y_enc,
                                   a3,
                                   w1-epsilon_ary1,
                                   w2)
            a1, z2, a2, z3, a3 = self._feedforward(
                                         X,
                                         w1 + epsilon_ary1,
                                         w2)
            cost2 = self._get_cost(y_enc,
                                   a3,
                                   w1 + epsilon_ary1,
                                   w2)
            num_grad1[i, j] = (cost2 - cost1) / (2 * epsilon)
            epsilon_ary1[i, j] = 0

    num_grad2 = np.zeros(np.shape(w2))
    epsilon_ary2 = np.zeros(np.shape(w2))
    for i in range(w2.shape[0]):
        for j in range(w2.shape[1]):
            epsilon_ary2[i, j] = epsilon
            a1, z2, a2, z3, a3 = self._feedforward(
                                            X,
                                            w1,
                                            w2 - epsilon_ary2)
            cost1 = self._get_cost(y_enc,
                                   a3,
                                   w1,
                                   w2 - epsilon_ary2)
            a1, z2, a2, z3, a3 = self._feedforward(
                                          X,
                                          w1,
                                          w2 + epsilon_ary2)
            cost2 = self._get_cost(y_enc,
                                   a3,
                                   w1,
                                   w2 + epsilon_ary2)


            num_grad2[i, j] = (cost2 - cost1) / (2 * epsilon)
            epsilon_ary2[i, j] = 0

    num_grad = np.hstack((num_grad1.flatten(),
                          num_grad2.flatten()))
    grad = np.hstack((grad1.flatten(), grad2.flatten()))
    norm1 = np.linalg.norm(num_grad - grad)
    norm2 = np.linalg.norm(num_grad)
    norm3 = np.linalg.norm(grad)
    relative_error = norm1 / (norm2 + norm3)
    return relative_error

class MLPGradientCheck(object):
    [...]
    def fit(self, X, y, print_progress=False):
        [...]
                # compute gradient via backpropagation
                grad1, grad2 = self._get_gradient(
                                       a1=a1,
                                       a2=a2,
                                       a3=a3,
                                       z2=z2,
                                       y_enc=y_enc[:, idx],
                                       w1=self.w1,
                                       w2=self.w2)
               
                ## start gradient checking

                grad_diff = self._gradient_checking(
                                     X=X[idx],
                                     y_enc=y_enc[:, idx],
                                     w1=self.w1,
                                     w2=self.w2,
                                     epsilon=1e-5,
                                     grad1=grad1,
                                     grad2=grad2)
                if grad_diff <= 1e-7:
                    print('Ok: %s' % grad_diff)
                elif grad_diff <= 1e-4:
                    print('Warning: %s' % grad_diff)
                else:
                    print('PROBLEM: %s' % grad_diff)
             
                ## end gradient checking
               
                # update weights; [alpha * delta_w_prev]
                # for momentum learning
                delta_w1 = self.eta * grad1
                delta_w2 = self.eta * grad2
                self.w1 -= (delta_w1 +\
                           (self.alpha * delta_w1_prev))
                self.w2 -= (delta_w2 +\
                           (self.alpha * delta_w2_prev))
                delta_w1_prev = delta_w1
                delta_w2_prev = delta_w2

        return self

nn_check = MLPGradientCheck(n_output=10,
                                n_features=X_train.shape[1],
                                n_hidden=10,
                                l2=0.0,
                                l1=0.0,
                                epochs=10,
                                eta=0.001,
                                alpha=0.0,
                                decrease_const=0.0,
                                minibatches=1,
                                random_state=1)

nn_check.fit(X_train[:5], y_train[:5], print_progress=False)


-- Convergence in neural networks

-- Other neural network architectures

Convolutional Neural Networks

Recurrent Neural Networks

A few last words about neural network implementation



Monday, June 6, 2016

Python ML 11 - Working with Unlabeled Data - Clustering Analysis

-- Grouping objects by similarity using k-means

from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=150,
                   n_features=2,
                   centers=3,
                   cluster_std=0.5,
                   shuffle=True,
                   random_state=0)

import matplotlib.pyplot as plt
plt.scatter(X[:,0],
            X[:,1],
            c='white',
            marker='o',
            s=50)
plt.grid()
plt.show()


from sklearn.cluster import KMeans
km = KMeans(n_clusters=3,
             init='random',
             n_init=10,
             max_iter=300,
             tol=1e-04,
             random_state=0)
y_km = km.fit_predict(X)


K-means++
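
In scikit-learn, the k-means++ seeding scheme is selected via the init parameter (it is in fact the default); a minimal variant of the earlier call, assuming the same X from make_blobs:

km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)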

plt.scatter(X[y_km==0,0],
             X[y_km ==0,1],
             s=50,
             c='lightgreen',
             marker='s',
             label='cluster 1')
plt.scatter(X[y_km ==1,0],
             X[y_km ==1,1],
             s=50,
             c='orange',
             marker='o',
             label='cluster 2')
plt.scatter(X[y_km ==2,0],
             X[y_km ==2,1],
             s=50,
             c='lightblue',
             marker='v',
             label='cluster 3')
plt.scatter(km.cluster_centers_[:,0],
             km.cluster_centers_[:,1],
             s=250,
             marker='*',
             c='red',
             label='centroids')
plt.legend()
plt.grid()
plt.show()

Hard versus soft clustering

Hard clustering describes a family of algorithms where each sample in a dataset is assigned to exactly one cluster, as in the k-means algorithm that we discussed in the previous subsection.
In contrast, algorithms for soft clustering (sometimes also called fuzzy clustering) assign a sample to one or more clusters. A popular example of soft clustering is the fuzzy C-means (FCM) algorithm (also called soft k-means or fuzzy k-means).
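
A rough sketch of the idea, under the standard FCM formulation: instead of a hard assignment, each sample x(i) receives a membership weight for every cluster j,

w(i,j) = [ sum_{p=1..k} ( ||x(i) - mu(j)|| / ||x(i) - mu(p)|| )^(2/(m-1)) ]^(-1)

where mu(j) is the cluster center and m > 1 is the fuzziness coefficient; larger m yields fuzzier (more uniform) memberships.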

Using the elbow method to find the optimal number of clusters

print('Distortion: %.2f' % km.inertia_)

distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1,11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

-- Quantifying the quality of clustering via silhouette plots
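
For a single sample, the silhouette coefficient computed by silhouette_samples below is s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the average distance to the other points in its own cluster (cohesion) and b(i) is the average distance to the points in the nearest other cluster (separation); values lie in [-1, 1], with values close to 1 indicating well-separated clusters.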

km = KMeans(n_clusters=3,
             init='k-means++',
             n_init=10,
             max_iter=300,
             tol=1e-04,
             random_state=0)
y_km = km.fit_predict(X)

import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X,
                                      y_km,
                                      metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
     c_silhouette_vals = silhouette_vals[y_km == c]
     c_silhouette_vals.sort()
     y_ax_upper += len(c_silhouette_vals)
     color = cm.jet(i / n_clusters)
     plt.barh(range(y_ax_lower, y_ax_upper),
              c_silhouette_vals,
              height=1.0,
              edgecolor='none',
              color=color)
     yticks.append((y_ax_lower + y_ax_upper) / 2)
     y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg,
             color="red",
             linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()

km = KMeans(n_clusters=2,
             init='k-means++',
             n_init=10,
             max_iter=300,
             tol=1e-04,
             random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km==0,0],
             X[y_km==0,1],
             s=50, c='lightgreen',
             marker='s',
             label='cluster 1')
plt.scatter(X[y_km==1,0],
             X[y_km==1,1],
             s=50,
             c='orange',
             marker='o',
             label='cluster 2')
plt.scatter(km.cluster_centers_[:,0],
             km.cluster_centers_[:,1],
             s=250,
             marker='*',
             c='red',
             label='centroids')
plt.legend()
plt.grid()
plt.show()


cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X,
                                      y_km,
                                      metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
     c_silhouette_vals = silhouette_vals[y_km == c]
     c_silhouette_vals.sort()
     y_ax_upper += len(c_silhouette_vals)
     color = cm.jet(i / n_clusters)
     plt.barh(range(y_ax_lower, y_ax_upper),
              c_silhouette_vals,
              height=1.0,
              edgecolor='none',
              color=color)
     yticks.append((y_ax_lower + y_ax_upper) / 2)
     y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()

-- Organizing clusters as a hierarchical tree

import pandas as pd
import numpy as np
np.random.seed(123)
variables = ['X', 'Y', 'Z']
labels = ['ID_0','ID_1','ID_2','ID_3','ID_4']
X = np.random.random_sample([5,3])*10
df = pd.DataFrame(X, columns=variables, index=labels)
df

Performing hierarchical clustering on a distance matrix

from scipy.spatial.distance import pdist, squareform
row_dist = pd.DataFrame(squareform(
            pdist(df, metric='euclidean')),
            columns=labels, index=labels)
row_dist

from scipy.cluster.hierarchy import linkage
help(linkage)

row_clusters = linkage(pdist(df, metric='euclidean'),
                        method='complete')

row_clusters = linkage(df.values,
                        method='complete',
                        metric='euclidean')

pd.DataFrame(row_clusters,
      columns=['row label 1',
               'row label 2',
               'distance',
               'no. of items in clust.'],
      index=['cluster %d' %(i+1) for i in
             range(row_clusters.shape[0])])

from scipy.cluster.hierarchy import dendrogram
row_dendr = dendrogram(row_clusters,
                       labels=labels,
                       # make dendrogram black (part 2/2)
                       # color_threshold=np.inf
                       )
plt.tight_layout()
plt.ylabel('Euclidean distance')
plt.show()


Attaching dendrograms to a heat map


fig = plt.figure(figsize=(8,8))
axd = fig.add_axes([0.09,0.1,0.2,0.6])
row_dendr = dendrogram(row_clusters, orientation='right')

df_rowclust = df.ix[row_dendr['leaves'][::-1]]

axm = fig.add_axes([0.23,0.1,0.6,0.6])
cax = axm.matshow(df_rowclust,
              interpolation='nearest', cmap='hot_r')

axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
     i.set_visible(False)
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))
plt.show()

Applying agglomerative clustering via scikit-learn

from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=2,
                              affinity='euclidean',
                              linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)

-- Locating regions of high density via DBSCAN

from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200,
                   noise=0.05,
                   random_state=0)
plt.scatter(X[:,0], X[:,1])
plt.show()


f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,3))
km = KMeans(n_clusters=2,
             random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km==0,0],
             X[y_km==0,1],
             c='lightblue',
             marker='o',
             s=40,
             label='cluster 1')
ax1.scatter(X[y_km==1,0],
             X[y_km==1,1],
             c='red',
             marker='s',
             s=40,
             label='cluster 2')
ax1.set_title('K-means clustering')
ac = AgglomerativeClustering(n_clusters=2,
                              affinity='euclidean',
                              linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac==0,0],
             X[y_ac==0,1],
             c='lightblue',
             marker='o',
             s=40,
             label='cluster 1')
ax2.scatter(X[y_ac==1,0],
             X[y_ac==1,1],
             c='red',
             marker='s',
             s=40,
             label='cluster 2')
ax2.set_title('Agglomerative clustering')
plt.legend()
plt.show()


from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.2,
             min_samples=5,
             metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db==0,0],
             X[y_db==0,1],
             c='lightblue',
             marker='o',
             s=40,
             label='cluster 1')
plt.scatter(X[y_db==1,0],
             X[y_db==1,1],
             c='red',
             marker='s',
             s=40,
             label='cluster 2')
plt.legend()
plt.show()

Friday, June 3, 2016

Python ML 10 - Predicting Continuous Target Variables with Regression Analysis

-- Introducing a simple linear regression model
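
Simple (univariate) linear regression models the relationship between one explanatory variable x and the target y as y = w0 + w1*x, where w0 is the intercept and w1 the slope; with several explanatory variables this generalizes to multiple linear regression, y = w0 + w1*x1 + ... + wm*xm.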

-- Exploring the House Dataset

import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
                 header=None, sep='\s+')
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()

Visualizing the important characteristics of a dataset

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
sns.pairplot(df[cols], size=2.5);
plt.show()

import numpy as np
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm,
            cbar=True,
            annot=True,
            square=True,
            fmt='.2f',
            annot_kws={'size': 15},
            yticklabels=cols,
            xticklabels=cols)
plt.show()

-- Implementing an ordinary least squares linear regression model

Solving regression for regression parameters with gradient descent
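
The LinearRegressionGD class below minimizes the sum-of-squared-errors cost J(w) = (1/2) * sum_i (y(i) - yhat(i))^2 by batch gradient descent; in every epoch the weights are updated as w := w + eta * X^T (y - yhat) and the bias as w0 := w0 + eta * sum of the errors, which is exactly what the fit method implements.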

class LinearRegressionGD(object):

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (errors**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return self.net_input(X)

X = df[['RM']].values
y = df['MEDV'].values
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
lr = LinearRegressionGD()
lr.fit(X_std, y_std)

plt.plot(range(1, lr.n_iter+1), lr.cost_)
plt.ylabel('SSE')
plt.xlabel('Epoch')
plt.show()

def lin_regplot(X, y, model):
     plt.scatter(X, y, c='blue')
     plt.plot(X, model.predict(X), color='red')  
     return None

lin_regplot(X_std, y_std, lr)
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
plt.show()

num_rooms_std = sc_x.transform(np.array([[5.0]]))
price_std = lr.predict(num_rooms_std)
print("Price in $1000's: %.3f" % \
       sc_y.inverse_transform(price_std))

print('Slope: %.3f' % lr.w_[1])
print('Intercept: %.3f' % lr.w_[0])

Estimating the coefficient of a regression model via scikit-learn

from sklearn.linear_model import LinearRegression
slr = LinearRegression()
slr.fit(X, y)
print('Slope: %.3f' % slr.coef_[0])
print('Intercept: %.3f' % slr.intercept_)
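
As a cross-check on scikit-learn's estimates, the same slope and intercept can be obtained in closed form via the normal equation w = (X^T X)^(-1) X^T y; a small NumPy sketch (the bias column and variable names are just for illustration):

# closed-form solution via the normal equation, for comparison
Xb = np.hstack((np.ones((X.shape[0], 1)), X))  # prepend a column of 1s for the intercept
w = np.dot(np.linalg.inv(np.dot(Xb.T, Xb)), np.dot(Xb.T, y))
print('Slope: %.3f' % w[1])
print('Intercept: %.3f' % w[0])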

-- Fitting a robust regression model using RANSAC

from sklearn.linear_model import RANSACRegressor
ransac = RANSACRegressor(LinearRegression(),
            max_trials=100,
            min_samples=50,
            residual_metric=lambda x: np.sum(np.abs(x), axis=1),
            residual_threshold=5.0,
            random_state=0)
ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
             c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
             c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper left')
plt.show()

print('Slope: %.3f' % ransac.estimator_.coef_[0])
print('Intercept: %.3f' % ransac.estimator_.intercept_)

-- Evaluating the performance of linear regression models

from sklearn.cross_validation import train_test_split
X = df.iloc[:, :-1].values
y = df['MEDV'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
slr = LinearRegression()
slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)


plt.scatter(y_train_pred, y_train_pred - y_train,
             c='blue', marker='o', label='Training data')
plt.scatter(y_test_pred,  y_test_pred - y_test,
             c='lightgreen', marker='s', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.show()

from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))


from sklearn.metrics import r2_score
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

-- Using regularized methods for regression

from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)

from sklearn.linear_model import Lasso
lasso = Lasso(alpha=1.0)

from sklearn.linear_model import ElasticNet
elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)
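
For reference, these regularized models differ only in the penalty added to the sum-of-squared-errors cost: Ridge adds an L2 penalty proportional to sum_j w_j^2, LASSO an L1 penalty proportional to sum_j |w_j|, and Elastic Net a weighted combination of both (alpha sets the overall regularization strength, and l1_ratio the mix between the L1 and L2 terms).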


-- Turning a linear regression model into a curve - polynomial regression

from sklearn.preprocessing import PolynomialFeatures
X = np.array([258.0, 270.0, 294.0,
              320.0, 342.0, 368.0,
              396.0, 446.0, 480.0,
              586.0])[:, np.newaxis]

y = np.array([236.4, 234.4, 252.8,
              298.6, 314.2, 342.2,
              360.8, 368.0, 391.2,
              390.8])
lr = LinearRegression()
pr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)

lr.fit(X, y)
X_fit = np.arange(250,600,10)[:, np.newaxis]
y_lin_fit = lr.predict(X_fit)

pr.fit(X_quad, y)
y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))
plt.scatter(X, y, label='training points')
plt.plot(X_fit, y_lin_fit,
          label='linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit,
          label='quadratic fit')
plt.legend(loc='upper left')
plt.show()

y_lin_pred = lr.predict(X)
y_quad_pred = pr.predict(X_quad)
print('Training MSE linear: %.3f, quadratic: %.3f' % (
         mean_squared_error(y, y_lin_pred),
         mean_squared_error(y, y_quad_pred)))
print('Training  R^2 linear: %.3f, quadratic: %.3f' % (
         r2_score(y, y_lin_pred),
         r2_score(y, y_quad_pred)))

Modeling nonlinear relationships in the Housing Dataset

X = df[['LSTAT']].values
y = df['MEDV'].values
regr = LinearRegression()

# create polynomial features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# linear fit
X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]
regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# quadratic fit
regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

# cubic fit
regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# plot results
plt.scatter(X, y,
             label='training points',
             color='lightgray')
plt.plot(X_fit, y_lin_fit,
          label='linear (d=1), $R^2=%.2f$'
            % linear_r2,
          color='blue',
          lw=2,
          linestyle=':')
plt.plot(X_fit, y_quad_fit,
          label='quadratic (d=2), $R^2=%.2f$'
            % quadratic_r2,
          color='red',
          lw=2,
          linestyle='-')
plt.plot(X_fit, y_cubic_fit,
          label='cubic (d=3), $R^2=%.2f$'
            % cubic_r2,
          color='green',
          lw=2,
          linestyle='--')
plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper right')
plt.show()

Dealing with nonlinear relationships using random forests
# transform features
X_log = np.log(X)
y_sqrt = np.sqrt(y)

# fit features
X_fit = np.arange(X_log.min()-1,
                   X_log.max()+1, 1)[:, np.newaxis]
regr = regr.fit(X_log, y_sqrt)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y_sqrt, regr.predict(X_log))

# plot results
plt.scatter(X_log, y_sqrt,
             label='training points',
             color='lightgray')
plt.plot(X_fit, y_lin_fit,
          label='linear (d=1), $R^2=%.2f$' % linear_r2,
          color='blue',
          lw=2)
plt.xlabel('log(% lower status of the population [LSTAT])')
plt.ylabel('$\sqrt{Price \; in \; \$1000\'s [MEDV]}$')
plt.legend(loc='lower left')
plt.show()


Decision tree regression
from sklearn.tree import DecisionTreeRegressor
X = df[['LSTAT']].values
y = df['MEDV'].values
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X, y)
sort_idx = X.flatten().argsort()
lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.show()


Random forest regression
X = df.iloc[:, :-1].values
y = df['MEDV'].values
X_train, X_test, y_train, y_test =\
       train_test_split(X, y,
                        test_size=0.4,
                        random_state=1)

from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(                            
                                n_estimators=1000,
                                criterion='mse',
                                random_state=1,
                                n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))


plt.scatter(y_train_pred,
             y_train_pred - y_train,
             c='black',
             marker='o',
             s=35,
             alpha=0.5,
             label='Training data')
plt.scatter(y_test_pred,
             y_test_pred - y_test,
             c='lightgreen',
             marker='s',
             s=35,
             alpha=0.7,
             label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.show()

Python ML 9 -

Wednesday, June 1, 2016

Python ML 8 - Applying Machine Learning to Sentiment Analysis

-- Obtaining the IMDb movie review dataset

import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path ='./aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

df = pd.read_csv('./movie_data.csv')
df.head(3)

-- Introducing the bag-of-words model

Transforming words into feature vectors

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

print(count.vocabulary_)

print(bag.toarray())


Assessing word relevancy via term frequency-inverse document frequency

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
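
With the defaults used here (smooth_idf=True and L2 normalization), scikit-learn computes idf(t) = log[(1 + n_d) / (1 + df(d, t))] + 1, where n_d is the number of documents and df(d, t) the number of documents containing term t, multiplies it by the raw term frequency tf(t, d), and then L2-normalizes each document vector; this is why the printed values differ slightly from the textbook tf-idf definition.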


Cleaning text data

df.loc[0, 'review'][-50:]
'is seven.<br /><br />Title (Brazil): Not Available'

import re
def preprocessor(text):
     text = re.sub('<[^>]*>', '', text)
     emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
     text = re.sub('[\W]+', ' ', text.lower()) + \
            ' '.join(emoticons).replace('-', '')
     return text

preprocessor(df.loc[0, 'review'][-50:])
'is seven title brazil not available'
preprocessor("</a>This :) is :( a test :-)!")
'this is a test :) :( :)'
df['review'] = df['review'].apply(preprocessor)

Processing documents into tokens

def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

-- Training a logistic regression model for document classification

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values


from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                         lowercase=False,
                         preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf',
                      LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

print('CV Accuracy: %.3f'
       % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f'
     % clf.score(X_test, y_test))

-- Working with bigger data - online algorithms and out-of-core learning

import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
     text = re.sub('<[^>]*>', '', text)
     emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                            text.lower())
     text = re.sub('[\W]+', ' ', text.lower()) \
            + ' '.join(emoticons).replace('-', '')
     tokenized = [w for w in text.split() if w not in stop]
     return tokenized

def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

next(stream_docs(path='./movie_data.csv'))

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                          n_features=2**21,
                          preprocessor=None,
                          tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
     X_train, y_train = get_minibatch(doc_stream, size=1000)
     if not X_train:
         break
     X_train = vect.transform(X_train)
     clf.partial_fit(X_train, y_train, classes=classes)
     pbar.update()

X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))