#-------------------------------------------------------------------------------
# Load and preprocess the Wisconsin breast cancer data; for more info, refer to:
# http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
# NOTE(review): removed `rm(list=ls())` -- wiping the caller's global
# environment is a side effect a script should not have; run in a fresh
# session instead.
library("MASS")
data("biopsy")
biopsy <- biopsy[, -1]     # remove sample id column
biopsy <- na.omit(biopsy)  # remove rows w/ missing data

#-------------------------------------------------------------------------------
# Load needed packages
library("e1071")  # for naive Bayes and SVM
library("rpart")  # for rpart (decision tree)
library("ROCR")   # for model evaluation

#-------------------------------------------------------------------------------
# Train various classifiers

# Basic logistic regression (available through default packages) shown here;
# other packages, like 'glmnet', are needed to get regularized models
model.logit <- glm(class ~ ., data = biopsy, family = binomial("logit"))

# Naive Bayes classifier (from e1071 package)
model.naive <- naiveBayes(class ~ ., data = biopsy)

# SVM classifiers (from e1071 package); svm() has many, many options, so
# reading the documentation is highly recommended
model.svm.linear <- svm(class ~ ., data = biopsy, kernel = "linear")
model.svm.radial <- svm(class ~ ., data = biopsy, kernel = "radial")

# Decision trees (using the CART algorithm), one per split criterion
model.tree.info <- rpart(class ~ ., data = biopsy, method = "class",
                         parms = list(split = "information"))
model.tree.gini <- rpart(class ~ ., data = biopsy, method = "class",
                         parms = list(split = "gini"))

# Uncomment to see the tree structures
# par(mfrow=c(1,2), xpd=NA);
# plot(model.tree.info, uniform=TRUE, main='Split by info');
# text(model.tree.info, use.n=TRUE);
# plot(model.tree.gini, uniform=TRUE, main='Split by GINI');
# text(model.tree.gini, use.n=TRUE);

#-------------------------------------------------------------------------------
# Compare performance of various classifiers

# Names of the models fitted above (each exists as "model.<name>")
model.list <- c("logit", "naive", "svm.linear", "svm.radial",
                "tree.info", "tree.gini")

# Map glm response probabilities onto class labels.
# For a binomial glm, predict(type='response') is P(second factor level).
#   model  : fitted binomial glm
#   names  : the two class levels, in factor-level order
#   cutoff : probability threshold for assigning the second level
#   data   : data to predict on (defaults to biopsy for backward compatibility;
#            previously this read the global directly)
label.logit <- function(model, names, cutoff = 0.5, data = biopsy) {
  pred <- predict(model, data, type = "response")
  labels <- ifelse(pred < cutoff, names[1], names[2])
  factor(labels, levels = names)
}

# Predicted class labels for each model (in-sample: predictions on the
# training data itself)
pred.labels <- data.frame(
  logit      = label.logit(model.logit, names = levels(biopsy$class)),
  naive      = predict(model.naive, biopsy, type = "class"),
  svm.linear = predict(model.svm.linear, biopsy, decision.values = FALSE),
  svm.radial = predict(model.svm.radial, biopsy, decision.values = FALSE),
  tree.info  = predict(model.tree.info, biopsy, type = "class"),
  tree.gini  = predict(model.tree.gini, biopsy, type = "class")
)

# Report accuracy: sum of the confusion-matrix diagonal over the sample size
N <- nrow(biopsy)
cat("\n")
for (model in model.list) {
  tab <- table(biopsy$class, pred.labels[, model])
  n.correct <- sum(diag(tab))
  cat(sprintf("Accuracy of %-10s = %5.2f %%\n", model, 100 * n.correct / N))
}
cat("\n")

# Extract an SVM's decision values to use as a continuous ranking score
score.svm <- function(model) {
  pred <- predict(model, biopsy, decision.values = TRUE)
  attr(pred, "decision.values")[, 1]
}

# Continuous scores per model; larger means "more likely malignant".
# NOTE(review): the SVM decision values are negated to orient them toward the
# malignant class -- e1071 signs them by factor-level order, so verify the
# sign if the data or level order ever changes.
scores <- data.frame(
  logit      = predict(model.logit, biopsy, type = "response"),
  naive      = predict(model.naive, biopsy, type = "raw")[, "malignant"],
  svm.linear = -score.svm(model.svm.linear),
  svm.radial = -score.svm(model.svm.radial),
  tree.info  = predict(model.tree.info, biopsy, type = "prob")[, "malignant"],
  tree.gini  = predict(model.tree.gini, biopsy, type = "prob")[, "malignant"]
)

# Generate ROC plots (zoomed to the low false-positive region) and collect
# the AUC of each model
color.list <- c("black", "red", "blue", "green4", "magenta", "orange")
n.models <- length(model.list)
aucs <- numeric(n.models)  # preallocate instead of growing inside the loop
for (i in seq_len(n.models)) {
  pred <- prediction(scores[, i], biopsy$class)
  perf <- performance(pred, "tpr", "fpr")
  aucs[i] <- slot(performance(pred, "auc"), "y.values")[[1]]
  plot(perf, lwd = 2.0, col = color.list[i], xlim = c(0, 0.2),
       add = (i != 1))  # start a new plot only for the first model
}

# Legend in a monospace font so the AUC columns line up; restore par() after
saved.font <- par(family = "mono")
legend(x = "bottomright", lwd = 2.0, col = color.list,
       legend = sprintf("%-10s ; AUC = %.4f", model.list, aucs))
par(saved.font)