import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Load the voice dataset; the last column ("label") holds the target classes.
df = pd.read_csv('voice.csv')
X = df.iloc[:, :-1]
# Drop a few columns by position (plus the label) for the reduced feature set.
X1 = df.drop(df.columns[[0, 2, 3, 20]], axis=1)
Y = df.iloc[:, -1]

# Encode the string labels as integers.
gender_encoder = LabelEncoder()
y = gender_encoder.fit_transform(Y)

# Standardize features to zero mean and unit variance before fitting the SVM.
scaler = StandardScaler()
scaler.fit(X1)
X = scaler.transform(X1)
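# Illustrative sanity check (not in the original script): after
# StandardScaler, every column should have mean ~0 and std ~1.
print(np.round(X.mean(axis=0), 3))
print(np.round(X.std(axis=0), 3))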
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Fit a linear SVM and score it on the held-out test set.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test, y_pred))
# Confusion matrix for the SVM on the test set
# (sklearn convention: rows are true labels, columns are predictions).
a = confusion_matrix(y_test, y_pred)
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """Plot a confusion matrix, either as raw counts or row-normalized."""
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(4, 3))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Cells darker than this threshold get white text for contrast.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(
        accuracy, misclass))
    plt.show()


# Plot the test-set confusion matrix; the voice labels are binary, so use
# the encoder's two class names as tick labels.
plot_confusion_matrix(cm=a,
                      normalize=False,
                      target_names=list(gender_encoder.classes_),
                      title="Confusion Matrix")
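# Illustrative follow-up (not in the original script): the row-normalized
# view makes per-class recall easier to read.
plot_confusion_matrix(cm=a,
                      normalize=True,
                      target_names=list(gender_encoder.classes_),
                      title="Normalized Confusion Matrix")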
# Tune C for the linear SVM with 10-fold cross-validation
# (sklearn.cross_validation was removed; cross_val_score now lives in
# sklearn.model_selection).
from sklearn.model_selection import cross_val_score

C_range = list(np.arange(0.1, 6, 0.1))
acc_score = []
for c in C_range:
    svc = SVC(kernel='linear', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
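# Illustrative addition (not in the original script): report the C value
# with the highest mean cross-validated accuracy.
best_c = C_range[int(np.argmax(acc_score))]
print("best C: {:.1f}, accuracy: {:.4f}".format(best_c, max(acc_score)))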
# Plot the value of C for the SVM (x-axis) versus the cross-validated
# accuracy (y-axis).
plt.plot(C_range, acc_score)
plt.xticks(np.arange(0.0, 6, 0.3))
plt.xlabel('Value of C for SVC')
plt.ylabel('Cross-Validated Accuracy')
plt.show()
# Base estimator for the kernel/parameter grid search below.
svm_model = SVC()
# GridSearchCV moved from sklearn.grid_search to sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# A single dict with repeated 'C'/'kernel' keys would silently overwrite
# itself, so use one parameter grid per kernel instead.
tuned_parameters = [
    {'C': np.arange(0.1, 1, 0.1), 'kernel': ['linear']},
    {'C': np.arange(0.1, 1, 0.1), 'gamma': [0.01, 0.02, 0.03, 0.04, 0.05],
     'kernel': ['rbf']},
    {'C': np.arange(0.1, 1, 0.1), 'gamma': [0.01, 0.02, 0.03, 0.04, 0.05],
     'degree': [2, 3, 4], 'kernel': ['poly']},
]

model_svm = GridSearchCV(svm_model, tuned_parameters, cv=10,
                         scoring='accuracy')
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)
print(model_svm.best_params_)
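# Illustrative addition (not in the original script): with the default
# refit=True, GridSearchCV retrains the best estimator on all training data,
# so it can score the held-out test set directly.
y_best = model_svm.predict(X_test)
print(metrics.accuracy_score(y_test, y_best))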
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the 2-D decision surface of a classifier fit on two features."""
    # Set up marker generator and color map.
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # Plot the decision surface over a mesh spanning both features.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Plot the samples of each class.
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.6,
                    c=[cmap(idx)],
                    edgecolor='black',
                    marker=markers[idx],
                    label=cl)

    # Highlight test samples, if their indices are given.
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    alpha=1.0,
                    edgecolor='black',
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
# Fit the decision tree used in the following sections.
tree = DecisionTreeClassifier(criterion='gini', random_state=10)
tree.fit(X_train, y_train)

# The decision-region plot needs a classifier trained on exactly two
# features, so fit a second tree on the first two (standardized) columns.
tree2d = DecisionTreeClassifier(criterion='gini', random_state=10)
tree2d.fit(X_train[:, :2], y_train)
plot_decision_regions(X_train[:, :2], y_train, classifier=tree2d)
plt.xlabel('feature 1 [standardized]')
plt.ylabel('feature 2 [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

tree.score(X_test, y_test)
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree to DOT format and render it inline. The feature
# names must match the columns the tree was trained on (X1 here).
dot_data = export_graphviz(
    tree,
    out_file=None,
    feature_names=X1.columns,
    class_names=["0", "1"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

tree.feature_importances_
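# Illustrative addition (not in the original script): graphviz.Source objects
# can also be written to disk; "voice_tree" is an assumed output name.
graph.render("voice_tree", format="png", cleanup=True)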
# split expects a boolean; with the two "label" levels as hue this draws the
# male and female distributions side by side.
sns.violinplot(y="maxdom", x="label", data=df, hue="label", split=True)
plt.show()
import pydotplus
import collections

# Toy example: predict gender from a few simple measurements.
X2 = [[180, 85, 15, 0],
      [177, 60, 42, 1],
      [136, 55, 35, 2],
      [174, 59, 65, 1],
      [141, 60, 28, 2],
      [170, 66, 60, 0]]
# Encoded target: 1 = man, 0 = woman.
Y2 = [1, 0, 0, 1, 0, 0]
toy_feature_names = ['height', 'weight', 'hair length', 'voice']

# Names of the columns the voice tree was actually trained on.
data_feature_names = X1.columns
data_feature_names
# Fit the toy tree.
clf = DecisionTreeClassifier()
clf = clf.fit(X2, Y2)
# Export the voice tree to a .dot file, then read it back and render it.
export_graphviz(tree, out_file="mytree.dot",
                feature_names=data_feature_names,
                filled=True, rounded=True, special_characters=True)
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
graph = pydotplus.graph_from_dot_data(dot_graph)
# Render the toy tree directly from an in-memory DOT string.
dot_data = export_graphviz(clf,
                           out_file=None,
                           filled=True,
                           rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
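# Illustrative addition (not in the original script): pydotplus can write the
# graph straight to an image file; "toy_tree.png" is an assumed name.
graph.write_png("toy_tree.png")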
# Evaluate the voice tree on the test set.
pretree = tree.predict(X_test)
print(metrics.accuracy_score(y_test, pretree))
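# Illustrative addition (not in the original script): reuse the plotting
# helper to inspect where the tree's test errors fall.
plot_confusion_matrix(cm=confusion_matrix(y_test, pretree),
                      normalize=False,
                      target_names=list(gender_encoder.classes_),
                      title="Decision Tree Confusion Matrix")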
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the same split (GridSearchCV and metrics are
# already imported from their current sklearn locations above).
rf = RandomForestClassifier(n_estimators=10000, random_state=100,
                            max_depth=50)
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred1))
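# Illustrative addition (not in the original script): the forest exposes
# aggregated feature importances; show the top five with their column names.
print(sorted(zip(rf.feature_importances_, X1.columns), reverse=True)[:5])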
url="http://archive.ics.uci.edu/ml/machine-learning-databases/00198/Faults.NNA"
df = pd.read_csv(url,sep="\t",header=None,names=name)
url1="http://archive.ics.uci.edu/ml/machine-learning-databases/00198/Faults27x7_var"
name=pd.read_csv(url1,header=None)[0]
df["type"]=0
ind=np.where(df["Other_Faults"]==1)
ind1=list(ind)
df.loc[ind1[0],"type"]=6
df["type"]=df["type"].astype('category')
X=df.iloc[:, :-8]
y=df.iloc[:, -1:]
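# Illustrative addition (not in the original script): check the class balance
# of the collapsed target before modeling.
print(df["type"].value_counts())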
# Grid-search the forest's depth (the second, narrower grid is the one used).
param_test1 = {'max_depth': list(range(3, 5, 2)),
               'min_samples_split': list(range(50, 60, 20))}
param_test1 = {'max_depth': list(range(3, 5, 2))}

gsearch1 = GridSearchCV(estimator=rf, param_grid=param_test1, cv=5)
gsearch1.fit(X_train, y_train["type"])
# grid_scores_ was removed from sklearn; cv_results_ holds the same data.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

aaaa = y_train["type"]
clf = GridSearchCV(RandomForestClassifier(n_estimators=1000,
                                          random_state=100), param_test1)
clf.fit(X_train, aaaa)
clf.cv_results_, clf.best_params_, clf.best_score_
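# Illustrative addition (not in the original script): with the default
# refit=True, the tuned forest can be scored on the held-out test split.
print(clf.best_estimator_.score(X_test, y_test["type"]))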
# Feature importances of the decision tree, labeled by column (this assumes
# the tree was fit on the same columns as the current X_train).
coef = pd.Series(tree.feature_importances_, index=X_train.columns)
imp_coef = coef.sort_values()
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Feature importances in the DT model")
plt.show()