Slides
Slides by Dr. Alex Thomo and Dr. George Tzanetakis
These slides cover basic concepts in big data.
- Summer Institute of Big Data 2013 - Data Mining Slides (G. Tzanetakis): sibd_2013.pdf
- Summer Institute of Big Data 2013 - SQL and Hadoop Slides (A. Thomo): sql_hadoop_sibd2013
Example Python code used for the data mining slides:
```python
print(__doc__)

import numpy as np
import pylab as pl
from sklearn import svm

# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)

pl.axis('tight')
pl.show()
```
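As a quick sanity check on the geometry above, here is a minimal self-contained sketch, assuming the same data and fit: for a linear SVM the two dashed lines sit at distance 1/||w|| on either side of the separating hyperplane, so the full margin between them is 2/||w||.

```python
import numpy as np
from sklearn import svm

# same 40 separable points as in the example above
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# the dashed lines pass through the support vectors at distance
# 1/||w|| from the hyperplane, so the full margin is 2/||w||
w = clf.coef_[0]
print("margin width: %.3f" % (2.0 / np.linalg.norm(w)))
```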
```python
import numpy as np
from pylab import *

from sklearn import svm
from sklearn import neighbors
from sklearn import cross_validation  # removed in scikit-learn 0.20; see the note below
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn import datasets
from sklearn import tree
from matplotlib.colors import ListedColormap


def create_random_data(num_instances, average, deviation):
    return np.random.randn(num_instances, 1) * deviation + average


def create_hoops_dataset():
    # basketball heights and weights
    bheights = create_random_data(100, 190, 10)
    bweights = create_random_data(100, 110, 8)
    # normal heights and weights
    nheights = create_random_data(100, 170, 10)
    nweights = create_random_data(100, 80, 12)
    return [bheights, bweights, nheights, nweights]


def show_various_plots(bh, bw, nh, nw):
    figure()
    subplot(2, 2, 1)
    title('Heights')
    ylabel('Height (cm)')
    plot(bh, linestyle='-', marker='o', color='b')
    plot(nh, linestyle='-', marker='o', color='g')
    subplot(2, 2, 2)
    title('Weights')
    ylabel('Weight (kg)')
    plot(bw, linestyle='-', marker='o', color='b')
    plot(nw, linestyle='-', marker='o', color='g')
    subplot(2, 2, 3)
    xlabel('Height (cm)')
    hist([bh.ravel(), nh.ravel()], 20)
    subplot(2, 2, 4)
    xlabel('Weight (kg)')
    hist([bw.ravel(), nw.ravel()], 20)


def assemble_feature_matrix(bh, bw, nh, nw):
    # vector of heights for everyone
    h = np.vstack([bh, nh])
    # vector of weights for everyone
    w = np.vstack([bw, nw])
    # vector of class labels
    cl = np.array([0] * bh.shape[0] + [1] * nh.shape[0])
    return np.hstack([w, h]), cl


def plot_scatter(X, y):
    # do a scatter plot
    scatter(X[:, 0], X[:, 1], c=y, cmap=cm.Paired)


def plot_scatter_with_classifiers(X, y):
    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    classifier_dict = {'KNN1': neighbors.KNeighborsClassifier(1),
                       'KNN3': neighbors.KNeighborsClassifier(3),
                       'KNN5': neighbors.KNeighborsClassifier(5),
                       'SVM': svm.SVC(kernel='linear', C=1),
                       'NBG': naive_bayes.GaussianNB(),
                       'DTREE': tree.DecisionTreeClassifier(),
                       'PRCPTRON': linear_model.SGDClassifier(loss='perceptron',
                                                              eta0=1,
                                                              learning_rate='constant',
                                                              penalty=None)}
    for clf_name in sorted(classifier_dict):
        clf = classifier_dict[clf_name]
        clf.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max] x [y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        hx = (x_max - x_min) / 100.0
        hy = (y_max - y_min) / 100.0
        xx, yy = np.meshgrid(np.arange(x_min, x_max, hx),
                             np.arange(y_min, y_max, hy))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        figure()
        title(clf_name)
        pcolormesh(xx, yy, Z, cmap=cmap_light)
        # Plot also the training points
        scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        xlim(xx.min(), xx.max())
        ylim(yy.min(), yy.max())
        show()


[bh, bw, nh, nw] = create_hoops_dataset()
show_various_plots(bh, bw, nh, nw)
X, y = assemble_feature_matrix(bh, bw, nh, nw)
plot_scatter_with_classifiers(X, y)

X, y = datasets.make_blobs(n_samples=50, centers=2, random_state=0,
                           cluster_std=0.60)
plot_scatter_with_classifiers(X, y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)

# train an SVM classifier and score it on the test set
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(clf.score(X_test, y_test))

# train a nearest-neighbour classifier and score it on the test set
clf = neighbors.KNeighborsClassifier(3).fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Do 5-fold cross-validation and report the resulting scores
scores = cross_validation.cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
```
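Note that the `sklearn.cross_validation` module used above was deprecated in scikit-learn 0.18 and removed in 0.20. Here is a minimal sketch of the same split-and-score steps against the current `sklearn.model_selection` API (the function names themselves are unchanged):

```python
import numpy as np
from sklearn import datasets, neighbors, svm
from sklearn.model_selection import train_test_split, cross_val_score

# same synthetic blobs as in the script above
X, y = datasets.make_blobs(n_samples=50, centers=2, random_state=0,
                           cluster_std=0.60)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=0)

# hold-out evaluation of an SVM and a 3-nearest-neighbour classifier
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print("SVM accuracy:", clf.score(X_test, y_test))

clf = neighbors.KNeighborsClassifier(3).fit(X_train, y_train)
print("3-NN accuracy:", clf.score(X_test, y_test))

# 5-fold cross-validation on the full dataset
scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
```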