import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Load the voice dataset; the last column ("label") holds the target classes.
df = pd.read_csv('voice.csv')
X = df.iloc[:, :-1]
# Drop a few columns by position (plus the label) for the reduced feature set.
X1 = df.drop(df.columns[[0, 2, 3, 20]], axis=1)
Y = df.iloc[:, -1]

# Encode the string labels as integers.
gender_encoder = LabelEncoder()
y = gender_encoder.fit_transform(Y)

# Standardize features to zero mean and unit variance before fitting the SVM.
scaler = StandardScaler()
scaler.fit(X1)
X = scaler.transform(X1)
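# Illustrative sanity check (not in the original script): after
# StandardScaler, every column should have mean ~0 and std ~1.
print(np.round(X.mean(axis=0), 3))
print(np.round(X.std(axis=0), 3))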
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Fit a linear SVM and score it on the held-out test set.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test, y_pred))
# Confusion matrix for the SVM on the test set
# (sklearn convention: rows are true labels, columns are predictions).
a = confusion_matrix(y_test, y_pred)
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """Plot a confusion matrix, either as raw counts or row-normalized."""
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(4, 3))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Cells darker than this threshold get white text for contrast.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(
        accuracy, misclass))
    plt.show()


# Plot the test-set confusion matrix; the voice labels are binary, so use
# the encoder's two class names as tick labels.
plot_confusion_matrix(cm=a,
                      normalize=False,
                      target_names=list(gender_encoder.classes_),
                      title="Confusion Matrix")
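# Illustrative follow-up (not in the original script): the row-normalized
# view makes per-class recall easier to read.
plot_confusion_matrix(cm=a,
                      normalize=True,
                      target_names=list(gender_encoder.classes_),
                      title="Normalized Confusion Matrix")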
# Tune C for the linear SVM with 10-fold cross-validation
# (sklearn.cross_validation was removed; cross_val_score now lives in
# sklearn.model_selection).
from sklearn.model_selection import cross_val_score

C_range = list(np.arange(0.1, 6, 0.1))
acc_score = []
for c in C_range:
    svc = SVC(kernel='linear', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
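# Illustrative addition (not in the original script): report the C value
# with the highest mean cross-validated accuracy.
best_c = C_range[int(np.argmax(acc_score))]
print("best C: {:.1f}, accuracy: {:.4f}".format(best_c, max(acc_score)))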
# Plot the value of C for the SVM (x-axis) versus the cross-validated
# accuracy (y-axis).
plt.plot(C_range, acc_score)
plt.xticks(np.arange(0.0, 6, 0.3))
plt.xlabel('Value of C for SVC')
plt.ylabel('Cross-Validated Accuracy')
plt.show()
# Base estimator for the kernel/parameter grid search below.
svm_model = SVC()
# GridSearchCV moved from sklearn.grid_search to sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# A single dict with repeated 'C'/'kernel' keys would silently overwrite
# itself, so use one parameter grid per kernel instead.
tuned_parameters = [
    {'C': np.arange(0.1, 1, 0.1), 'kernel': ['linear']},
    {'C': np.arange(0.1, 1, 0.1), 'gamma': [0.01, 0.02, 0.03, 0.04, 0.05],
     'kernel': ['rbf']},
    {'C': np.arange(0.1, 1, 0.1), 'gamma': [0.01, 0.02, 0.03, 0.04, 0.05],
     'degree': [2, 3, 4], 'kernel': ['poly']},
]

model_svm = GridSearchCV(svm_model, tuned_parameters, cv=10,
                         scoring='accuracy')
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)
print(model_svm.best_params_)
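# Illustrative addition (not in the original script): with the default
# refit=True, GridSearchCV retrains the best estimator on all training data,
# so it can score the held-out test set directly.
y_best = model_svm.predict(X_test)
print(metrics.accuracy_score(y_test, y_best))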
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the 2-D decision surface of a classifier fit on two features."""
    # Set up marker generator and color map.
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # Plot the decision surface over a mesh spanning both features.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Plot the samples of each class.
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.6,
                    c=[cmap(idx)],
                    edgecolor='black',
                    marker=markers[idx],
                    label=cl)

    # Highlight test samples, if their indices are given.
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    alpha=1.0,
                    edgecolor='black',
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
# Fit the decision tree used in the following sections.
tree = DecisionTreeClassifier(criterion='gini', random_state=10)
tree.fit(X_train, y_train)

# The decision-region plot needs a classifier trained on exactly two
# features, so fit a second tree on the first two (standardized) columns.
tree2d = DecisionTreeClassifier(criterion='gini', random_state=10)
tree2d.fit(X_train[:, :2], y_train)
plot_decision_regions(X_train[:, :2], y_train, classifier=tree2d)
plt.xlabel('feature 1 [standardized]')
plt.ylabel('feature 2 [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

tree.score(X_test, y_test)
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree to DOT format and render it inline. The feature
# names must match the columns the tree was trained on (X1 here).
dot_data = export_graphviz(
    tree,
    out_file=None,
    feature_names=X1.columns,
    class_names=["0", "1"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

tree.feature_importances_
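# Illustrative addition (not in the original script): graphviz.Source objects
# can also be written to disk; "voice_tree" is an assumed output name.
graph.render("voice_tree", format="png", cleanup=True)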
# split expects a boolean; with the two "label" levels as hue this draws the
# male and female distributions side by side.
sns.violinplot(y="maxdom", x="label", data=df, hue="label", split=True)
plt.show()
import pydotplus
import collections

# Toy example: predict gender from a few simple measurements.
X2 = [[180, 85, 15, 0],
      [177, 60, 42, 1],
      [136, 55, 35, 2],
      [174, 59, 65, 1],
      [141, 60, 28, 2],
      [170, 66, 60, 0]]
# Encoded target: 1 = man, 0 = woman.
Y2 = [1, 0, 0, 1, 0, 0]
toy_feature_names = ['height', 'weight', 'hair length', 'voice']

# Names of the columns the voice tree was actually trained on.
data_feature_names = X1.columns
data_feature_names
# Fit the toy tree.
clf = DecisionTreeClassifier()
clf = clf.fit(X2, Y2)
# Export the voice tree to a .dot file, then read it back and render it.
export_graphviz(tree, out_file="mytree.dot",
                feature_names=data_feature_names,
                filled=True, rounded=True, special_characters=True)
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
graph = pydotplus.graph_from_dot_data(dot_graph)
# Render the toy tree directly from an in-memory DOT string.
dot_data = export_graphviz(clf,
                           out_file=None,
                           filled=True,
                           rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
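# Illustrative addition (not in the original script): pydotplus can write the
# graph straight to an image file; "toy_tree.png" is an assumed name.
graph.write_png("toy_tree.png")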
# Evaluate the voice tree on the test set.
pretree = tree.predict(X_test)
print(metrics.accuracy_score(y_test, pretree))
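# Illustrative addition (not in the original script): reuse the plotting
# helper to inspect where the tree's test errors fall.
plot_confusion_matrix(cm=confusion_matrix(y_test, pretree),
                      normalize=False,
                      target_names=list(gender_encoder.classes_),
                      title="Decision Tree Confusion Matrix")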
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the same split (GridSearchCV and metrics are
# already imported from their current sklearn locations above).
rf = RandomForestClassifier(n_estimators=10000, random_state=100,
                            max_depth=50)
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred1))
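# Illustrative addition (not in the original script): the forest exposes
# aggregated feature importances; show the top five with their column names.
print(sorted(zip(rf.feature_importances_, X1.columns), reverse=True)[:5])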
url="http://archive.ics.uci.edu/ml/machine-learning-databases/00198/Faults.NNA"
df = pd.read_csv(url,sep="\t",header=None,names=name)
url1="http://archive.ics.uci.edu/ml/machine-learning-databases/00198/Faults27x7_var"
name=pd.read_csv(url1,header=None)[0]
df["type"]=0
ind=np.where(df["Other_Faults"]==1)
ind1=list(ind)
df.loc[ind1[0],"type"]=6
df["type"]=df["type"].astype('category')
X=df.iloc[:, :-8]
y=df.iloc[:, -1:]
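# Illustrative addition (not in the original script): check the class balance
# of the collapsed target before modeling.
print(df["type"].value_counts())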
# Grid-search the forest's depth (the second, narrower grid is the one used).
param_test1 = {'max_depth': list(range(3, 5, 2)),
               'min_samples_split': list(range(50, 60, 20))}
param_test1 = {'max_depth': list(range(3, 5, 2))}

gsearch1 = GridSearchCV(estimator=rf, param_grid=param_test1, cv=5)
gsearch1.fit(X_train, y_train["type"])
# grid_scores_ was removed from sklearn; cv_results_ holds the same data.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

aaaa = y_train["type"]
clf = GridSearchCV(RandomForestClassifier(n_estimators=1000,
                                          random_state=100), param_test1)
clf.fit(X_train, aaaa)
clf.cv_results_, clf.best_params_, clf.best_score_
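# Illustrative addition (not in the original script): with the default
# refit=True, the tuned forest can be scored on the held-out test split.
print(clf.best_estimator_.score(X_test, y_test["type"]))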
# Feature importances of the decision tree, labeled by column (this assumes
# the tree was fit on the same columns as the current X_train).
coef = pd.Series(tree.feature_importances_, index=X_train.columns)
imp_coef = coef.sort_values()
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Feature importances in the DT model")
plt.show()