Machine Learning: Plot ROC and PR Curve for multi-classes classification
Situation: We want to plot the curves.
Why: Because the accuracy score is too high and the confusion matrix shows some bias.
Steps:
1. Import libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, log_loss,
                             precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
2. Create 3 functions: plot_roc_curve, plot_precision_recall_curve, and plot_confusion_matrix.
Inside the functions that plot the ROC and PR curves, we use OneHotEncoder and OneVsRestClassifier.
def _ovr_test_probabilities(X, y, _classifier):
    """Fit a one-vs-rest wrapper around *_classifier* and score a held-out split.

    The labels are one-hot encoded so that OneVsRestClassifier treats the
    problem as n_classes independent binary problems, which is what the
    per-class ROC/PR curves need.

    Returns a tuple (y_test, y_proba, n_classes) where y_test is the
    one-hot-encoded test labels and y_proba the predicted per-class
    probabilities for the test split.
    """
    onehotencoder = OneHotEncoder()
    # Put y into multiple columns for OneVsRestClassifier.
    y_hat = onehotencoder.fit_transform(y.reshape(-1, 1)).toarray()
    n_classes = y_hat.shape[1]
    # Same split parameters as the main script so the curves describe a
    # comparable test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_hat, test_size=0.3, random_state=5)
    # Each class is fitted against all the other classes.
    clf_ovr = OneVsRestClassifier(_classifier)
    clf_ovr.fit(X_train, y_train)
    y_proba = clf_ovr.predict_proba(X_test)
    return y_test, y_proba, n_classes


def plot_roc_curve(X, y, _classifier, caller):
    """Plot one-vs-rest ROC curves, one per class, for a multi-class problem.

    Parameters
    ----------
    X, y : array-like features and integer class labels.
    _classifier : an (unfitted) scikit-learn style classifier; it is cloned
        and fitted per class by OneVsRestClassifier.
    caller : str, dataset description written into the plot title.
    """
    # Keep the algorithm's name to be written down into the graph.
    algor_name = type(_classifier).__name__
    y_test, y_proba, n_classes = _ovr_test_probabilities(X, y, _classifier)
    # Compute ROC curve and ROC area for each class.
    fig = plt.figure()
    plt.style.use('default')
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="lower right", prop={'size': 10})
    plt.title('ROC to multi-class: ' + caller)
    plt.suptitle(algor_name, fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


def plot_precision_recall_curve(X, y, _classifier, caller):
    """Plot one-vs-rest precision-recall curves, one per class.

    Parameters are identical to plot_roc_curve.
    """
    # Keep the algorithm's name to be written down into the graph.
    algor_name = type(_classifier).__name__
    y_test, y_proba, n_classes = _ovr_test_probabilities(X, y, _classifier)
    # Compute the precision-recall curve for each class.
    fig = plt.figure()
    plt.style.use('default')
    precision = dict()
    recall = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_proba[:, i])
        plt.plot(recall[i], precision[i], lw=2,
                 label='PR Curve of class {}'.format(i))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="lower right", prop={'size': 10})
    plt.title('Precision-Recall to multi-class: ' + caller)
    plt.suptitle(algor_name, fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


def plot_confusion_matrix(cfm, y_test, caller, algor_name):
    """Draw a precomputed confusion matrix *cfm* as an annotated heatmap.

    Parameters
    ----------
    cfm : square array, output of sklearn.metrics.confusion_matrix.
    y_test : true labels; only their unique values are used as axis labels.
    caller : str, dataset description for the title.
    algor_name : str, classifier name for the suptitle.
    """
    df_cm = pd.DataFrame(cfm, columns=np.unique(y_test), index=np.unique(y_test))
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    fig = plt.figure()
    plt.title('Confusion Matrix: ' + caller, fontsize=14)
    plt.suptitle(algor_name, fontsize=16)
    plt.style.use('default')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    sn.set(font_scale=1.4)
    # fmt='g' prints the counts as plain integers instead of scientific notation.
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 10})
    plt.show()
3. Load Iris data set.
# Step 3: load the Iris data set; X/y are read by all later steps.
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
4. Split train and test parts.
# Step 4: hold out 30% of the samples for testing; fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 5)
5. Train the LogisticRegression model.
# Step 5: fit a logistic regression on the training split.
# NOTE(review): max_iter=50 is low for the lbfgs solver and may stop before
# convergence — confirm this is intentional for the demonstration.
clf = LogisticRegression(max_iter=50, solver = 'lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # hard class labels
y_proba = clf.predict_proba(X_test)  # per-class probabilities, needed for log loss
6. Print scores.
# Step 6: compute and print the evaluation scores.
algor_name = type(clf).__name__
caller = 'Iris dataset'
rpt = classification_report(y_test, y_pred)
# confusion_matrix expects (y_true, y_pred); this order makes the matrix
# rows the actual classes, matching the 'Actual' row label used when the
# matrix is plotted by plot_confusion_matrix.
cfm = confusion_matrix(y_test, y_pred)
print("accuracy: {}".format(round(accuracy_score(y_test, y_pred), 3)))
print("log loss: {}".format(round(log_loss(y_test, y_proba), 3)))
print(rpt)
print(cfm)
7. Plot graphs.
# Step 7: draw the graphs. Each plot is guarded separately so a failure in
# one does not prevent the others from being shown.
try:
    plot_confusion_matrix(cfm, y_test, caller, algor_name)
except ValueError:
    print("Error: cannot plot the confusion matrix.")

# Need more research to plot the ROC and PR curve for XGBoost and SVC
if not isinstance(clf, (XGBClassifier, SVC)):
    try:
        plot_roc_curve(X, y, clf, caller)
    except ValueError:
        print("Error: cannot plot the ROC Curve.")
    try:
        plot_precision_recall_curve(X, y, clf, caller)
    except ValueError:
        print("Error: cannot plot the PR Curve.")
8. We can see from the graphs that the prediction is pretty bad.
Please research more about how the ROC and PR graphs should be.
References:
ROC Curve: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
ROC Curve (Multi-Classes): https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
PR Curve: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html