Slides

Slides by Dr. Alex Thomo and Dr. George Tzanetakis

These slides cover basic concepts in big data.

  • Summer Institute of Big Data 2013 - Data Mining Slides (G. Tzanetakis) sibd_2013.pdf
  • Summer Institute of Big Data 2013 - SQL and Hadoop Slides (A. Thomo) sql_hadoop_sibd2013

Example Python code used for the data mining slides. The examples below use NumPy, matplotlib, and scikit-learn.

svm.py

"""Linear SVM example: maximum-margin separation of two point clouds."""
print(__doc__)

import numpy as np
import matplotlib.pyplot as pl  # pyplot replaces the deprecated pylab module
from sklearn import svm

# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane: w . x + b = 0, rewritten as
# y = a*x - b/w[1] so it can be drawn as a line
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# compute the margin boundaries (the parallels to the separating
# hyperplane that pass through the support vectors)
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none', edgecolors='k')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)

pl.axis('tight')
pl.show()
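
As a quick sanity check (not part of the original example), the width of the margin can be read directly off the fitted model: the two dashed lines sit at w . x + b = +1 and w . x + b = -1, so they are 2/||w|| apart. A minimal sketch, reusing the clf fitted above:

# for a linear SVM the margin boundaries are w . x + b = +/-1,
# so the distance between the two dashed lines is 2 / ||w||
margin = 2.0 / np.linalg.norm(clf.coef_[0])
print("margin width: %.3f" % margin)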

create_hoops_dataset.py

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn import neighbors
from sklearn import model_selection  # replaces sklearn.cross_validation, removed in scikit-learn 0.20
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn import datasets
from sklearn import tree



def create_random_data(num_instances, average, deviation):
    # column vector of normally distributed values
    return np.random.randn(num_instances, 1) * deviation + average

def create_hoops_dataset():
    # basketball players: heights (cm) and weights (kg)
    bheights = create_random_data(100, 190, 10)
    bweights = create_random_data(100, 110, 8)
    # general population: heights (cm) and weights (kg)
    nheights = create_random_data(100, 170, 10)
    nweights = create_random_data(100, 80, 12)
    return [bheights, bweights, nheights, nweights]


def show_various_plots(bh, bw, nh, nw):
    plt.figure()
    plt.subplot(2, 2, 1)
    plt.title('Heights')
    plt.ylabel('Height (cm)')
    plt.plot(bh, linestyle='-', marker='o', color='b')
    plt.plot(nh, linestyle='-', marker='o', color='g')
    plt.subplot(2, 2, 2)
    plt.title('Weights')
    plt.ylabel('Weight (kg)')
    plt.plot(bw, linestyle='-', marker='o', color='b')
    plt.plot(nw, linestyle='-', marker='o', color='g')
    plt.subplot(2, 2, 3)
    plt.xlabel('Height (cm)')
    plt.hist([bh.ravel(), nh.ravel()], 20)
    plt.subplot(2, 2, 4)
    plt.xlabel('Weight (kg)')
    plt.hist([bw.ravel(), nw.ravel()], 20)
	
def assemble_feature_matrix(bh, bw, nh, nw):
    # stack heights and weights for both groups into single columns
    h = np.vstack([bh, nh])
    w = np.vstack([bw, nw])
    # class labels: 0 = basketball player, 1 = general population
    cl = np.array([0] * bh.shape[0] + [1] * nh.shape[0])
    # feature matrix with weight as column 0 and height as column 1
    return np.hstack([w, h]), cl
	

def plot_scatter(X, y):
    # scatter plot of the two features, colored by class label
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)

def plot_scatter_with_classifiers(X, y):
    # create color maps (light for decision regions, bold for points)
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    classifier_dict = {'KNN1': neighbors.KNeighborsClassifier(1),
                       'KNN3': neighbors.KNeighborsClassifier(3),
                       'KNN5': neighbors.KNeighborsClassifier(5),
                       'SVM': svm.SVC(kernel='linear', C=1),
                       'NBG': naive_bayes.GaussianNB(),
                       'DTREE': tree.DecisionTreeClassifier(),
                       'PRCPTRON': linear_model.SGDClassifier(loss='perceptron',
                                                              eta0=1,
                                                              learning_rate='constant',
                                                              penalty=None)}

    for clf_name in sorted(classifier_dict):
        clf = classifier_dict[clf_name]
        clf.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max] x [y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        hx = (x_max - x_min) / 100.0
        hy = (y_max - y_min) / 100.0
        xx, yy = np.meshgrid(np.arange(x_min, x_max, hx),
                             np.arange(y_min, y_max, hy))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.title(clf_name)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.show()



# generate the synthetic hoops dataset and visualize it
[bh, bw, nh, nw] = create_hoops_dataset()
show_various_plots(bh, bw, nh, nw)
X, y = assemble_feature_matrix(bh, bw, nh, nw)
plot_scatter_with_classifiers(X, y)

# repeat on a simpler synthetic dataset of two well-separated blobs
X, y = datasets.make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
plot_scatter_with_classifiers(X, y)



# hold out 40% of the data for testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)

# train an SVM classifier
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

# evaluate it on the test set
print(clf.score(X_test, y_test))

# train a nearest neighbor classifier and evaluate it the same way
clf = neighbors.KNeighborsClassifier(3).fit(X_train, y_train)
print(clf.score(X_test, y_test))

# do 5-fold cross-validation and report the resulting score
scores = model_selection.cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))