Thursday, September 13, 2012

Compute AUC and KS Statistics

An ROC curve is used to represent the relationship of the true-positive rate(TPR) to the false-positive rate(FNR) of a test as the cutoff score varies. 

Changing the cutoff score changes the proportion oft he two types of error. For a response where a higher score indicates a higher likelihood to respond, choosing a high score as the cutoff means fewer false positive (people labeled as responders who do not respond) and more false negatives (people labeled as non-responders who would respond). 

Sensitivity is the proportion of true positives among people who get a positive result on a test (TPR). Sensitivity measures the likelihood that a diagnosis based on the test is correct. Recall measures the number of articles on the correct topic returned by a Web search or other text query. Recall is the same as TPR. 

Specificity is the proportion of true negatives among people who get a negative result on the test(TNR). 

Precision (TP/(TP+FP)) measures the percentage of the returned articles that are on the correct topic. 

The maximum benefit is proportional to the maximum distance between the cumulative distribution functions of the probabilities in each class. 


What this means is that the model score that cuts the prospect list at the penetration where the benefit is greatest is also the score that maximizes the Kolmogorov-Smirnov (KS) statistics. The KS test was developed as a test of whether two distributions are different. Splitting the list as the point of maximum benefit results in a 'good list' and a 'bad list' whose distributions of responders are maximally separate from each other and from the population. In this case, the 'good list' has a maximum proportion of responders and 'bad list' has a minimum proportion. 

#########################################################
## R Calculation
#########################################################
install.packages('verification')
library('verification')
library('zoo')

# Data used from Mason and Graham (2002).
a<- c(1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
      1991, 1992, 1993, 1994, 1995)
b<- c(0,0,0,1,1,1,0,1,1,0,0,0,0,1,1)
c<- c(.8, .8, 0, 1,1,.6, .4, .8, 0, 0, .2, 0, 0, 1,1)
d<- c(.928,.576, .008, .944, .832, .816, .136, .584, .032, .016, .28, .024, 0, .984, .952)

A<- data.frame(a,b,c,d)
names(A)<- c("year", "event", "p1", "p2")

## for model with ties
roc.area(A$event, A$p1)
##0.8392857

## for model without ties
roc.area(A$event, A$p2)
##0.875

ROC<- function(dat, score_col,class_col){
  as <- dat[,score_col];
  las <- length(as);
  aslab <- dat[, class_col];
  numpos <- sum(aslab == 1);
  numneg <- sum(aslab == 0);
  tmp <- sort.int(as, index.return = T);
  sas <- tmp$x;
  saslab <- aslab[tmp$ix];
 
  selpn <- saslab %in% c(1, 0);

  saspn <- sas[selpn];
  lsaspn <- length(saspn);
  saslabpn <- saslab[selpn];
  tmp <- sort.int(saspn, decreasing = T, index.return = T);
  saspnrev <- tmp$x;
  saslabpnnrev <- saslab[tmp$ix];
  cumsaslabpnnrev <- cumsum(saslabpnnrev);
 
  end <- length(saspnrev);
  difsaspnrev <- saspnrev[2:end] - saspnrev[1:(end-1)];
  difsaspnrev <- c(0, difsaspnrev);
  steps <- (difsaspnrev != 0) ;
  tmp <- 1:length(steps);
  steplocs <- c(1, tmp[steps], lsaspn+1);

  end <- length(steplocs);
  rl <- steplocs[2:end] - steplocs[1:(end-1)];
  csr <- cumsaslabpnnrev[steplocs[2:end]-1];
 
  end <- length(csr);
  npr <- c(csr[1], (csr[2:end] - csr[1:(end-1)]));
  nnr <- rl - npr
  npr <- c(0, npr);
  nnr <- c(0, nnr);
 
  fpr <- cumsum(nnr)/numneg;
  recall <- cumsum(npr)/numpos;
  roc <- data.frame(fpr, recall);
 
  a2 = sum(nnr * recall) / numneg;
  adjust = sum(nnr * npr) / (2*numpos*numneg);
  a3 = a2 - adjust;
  a1 = a3 - adjust;
  area =data.frame(a3, a1, a2);
 
  result <- list(area, roc);
  return(result);
}
dat <- A
score_col <- 3
class_col <-2
result <- ROC(A, 3,2);
result[[1]][1]
##0.8392857

result <- ROC(A, 4,2);
result[[1]][1]
##0.875

dat <- A
score_col <- 3
class_col <-2
AUC_yahoo <- function(dat, score_col,class_col){
  as <- dat[,score_col];
  las <- length(as);
  aslab <- dat[, class_col];
  numpos <- sum(aslab == 1);
  numneg <- sum(aslab == 0);
  possc <- as[aslab == 1];
  negsc <- as[aslab == 0];
  pgtn <- 0;
  peqn <- 0;
  for (inn in  1:numneg){
    for (ip in 1:numpos) {
      pgtn = pgtn + (possc[ip] > negsc[inn]);
      peqn = peqn + (possc[ip] == negsc[inn]);
    }
  }
  num = numpos * numneg;
  area1 = pgtn/num;
  area2 = (pgtn + peqn)/num;
  area3 = (pgtn + peqn/2)/num;
  area = data.frame(area3, area1, area2);
return(area);
}
dat <- A
score_col <- 3
class_col <-2
result <- AUC_yahoo(A, 3,2);
result[1]
##0.8392857

result <- AUC_yahoo(A, 4,2);
result[1]
##0.875

n<- length(A[,1])
conv <- length(A[A$event==1,1])
non_conv <- n-conv

#estimating ROC by p1;
A1 <- A[sort.list(A$p1, decreasing = T),]
A1$tp <- cumsum(A1$event)
A1$numberc <- 1:n
A1$fp <- A1$numberc - A1$tp
A1$percy <- A1$tp / conv
A1$percx <- A1$fp /non_conv
roc <- sum(diff(A1$percx[A1$numberc])*rollmean(A1$percy[A1$numberc],2)); roc
#0.8392857


#estimating ROC by p2;
A2 <- A[sort.list(A$p2, decreasing = T),]
A2$tp <- cumsum(A2$event)
A2$numberc <- 1:n
A2$fp <- A2$numberc - A2$tp
A2$percy <- A2$tp / conv
A2$percx <- A2$fp /non_conv
roc <- sum(diff(A2$percx[A2$numberc])*rollmean(A2$percy[A2$numberc],2)); roc
#0.875
icr <- rate1;
icr.new <- rate9;

icr$percx <- (icr$rank.1-icr$numberc)/ (max(icr$rank.1)-max(icr$numberc))
icr$percy <- icr$numberc/max(icr$numberc);

icr.new$percx <- (icr.new$rank.1-icr.new$numberc)/ (max(icr.new$rank.1)-max(icr.new$numberc))
icr.new$percy <- icr.new$numberc/max(icr.new$numberc);

ggplot(data=icr, aes(y=percy, x=percx)) +
  scale_x_continuous(limits=c(0,1)) +
  geom_line(aes(x=percx, y=percy),size = 1.5, color = "black") +
  geom_line(data=icr.new, aes(x=percx, y=percy),size = 1.5, color = "red") +
  geom_line(aes(x=percx, y=percx),size = 1.5, color = "black", linetype="dashed") +
  ylab("True Positive Rate") + xlab("False Positive Rate") +
  opts(axis.text.x = theme_text(face = "bold", size = 12)) +
  opts(axis.text.y = theme_text(face = "bold", size = 12)) +
  opts(axis.title.x  = theme_text(face = "bold", size = 12)) +
  opts(axis.title.y  = theme_text(face = "bold", size = 12, angle = 90)) +
  opts(title = paste("ROC Chart")) +
  opts(plot.title = theme_text(face = "bold", size=14))

# 0.7548072
roc <- sum(diff(icr$percx[icr$numberc])*rollmean(icr$percy[icr$numberc],2)); roc
# 0.7455777
roc.new <- sum(diff(icr.new$percx[icr.new$numberc])*rollmean(icr.new$percy[icr.new$numberc],2)); roc.new

## use ROCR
require(ROCR)
AUC <- function(pred,depvar){
  require("ROCR")
  p   <- ROCR::prediction(as.numeric(pred),depvar)
  auc <- ROCR::performance(p,"auc")
  auc <- unlist(slot(auc, "y.values"))
 
  perf <- ROCR::performance(p,"tpr","fpr");
  plot(perf);
  abline(0, 1, lty = 2);
 
  return(auc)
}

KS <- function(pred,depvar){
  require("ROCR")
  p   <- ROCR::prediction(as.numeric(pred),depvar)
  perf <- ROCR::performance(p, "tpr", "fpr")
  ks <- max(attr(perf, "y.values")[[1]] - (attr(perf, "x.values")[[1]]))
  return(ks)
}

test$score=factor(tmp, levels=c(FALSE, TRUE))
#calculate auc;
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
KS(test$score, test$y)


#########################################################
## Python Calculation
#########################################################
#!/usr/bin/python3
import os
                           
aslab=[-1,-1,-1,1,1,1,-1,1,1,-1,-1,-1,-1,1,1]
ascore=[.928,.576, .008, .944, .832, .816, .136, .584, .032, .016, .28, .024, 0, .984, .952]

las =len(aslab)
numpos = 0
numneg = 0
possc=[]
negsc=[]
for i in range(0,len(aslab)):
    if aslab[i]==1:
        possc.append(ascore[i])
        numpos+=1
    elif aslab[i]==-1:
        negsc.append(ascore[i])
        numneg+=1
         
pgtn = 0
peqn = 0
for inn in range(0,numneg):
    for ip in range(0,numpos):
        pgtn = pgtn + (possc[ip] > negsc[inn])
        peqn = peqn + (possc[ip] == negsc[inn])
num = numpos * numneg
#area1 = pgtn/num
#area2 = (pgtn + peqn)/num
AUC = (pgtn + peqn/2)/num