Wednesday, June 18, 2014

Logistic Regression Model Calibration in R

library(data.table)   # fread() and data.table syntax
library(ROCR)         # prediction()/performance() for AUC and KS
library(ggplot2)
library(GGally)       # ggpairs()
library(VGAM)         # vglm() tobit fit

all <- fread("all.txt", header=FALSE, sep='\t')
names <- c("HasClick", "PredScore", "Score");
setnames(all, names);
setkey(all, Score);
all[HasClick==-1, HasClick:=0];   # recode -1 (no click) as 0
all[,list(Impressions=.N, CTR=sum(HasClick)/.N), by=HasClick][order(-CTR)];   # row counts per label
summary(all$HasClick)

#PredScore
all[PredScore==-1, PredScore:=0];
summary(all$PredScore)
pred<-prediction(all$PredScore,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6567984
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.244848

#p Score: map the raw log-odds score to a probability with the logistic function
all$p <- 1/(1+exp(-all$Score))
summary(all$p)
pred<-prediction(all$p,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
#[1] 0.6992571
#[1] 0.7176237
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
#[1] 0.3129876
#[1] 0.3337304


# prior correction for intercept bias: shift the log-odds so the average
# predicted probability matches the observed CTR
totimp <- nrow(all)
totclk <- sum(all$HasClick==1)
t <- totclk/totimp; t      # observed CTR
p <- mean(all$p); p        # mean predicted probability
all$p2 <- 1/(1+exp(-all$Score+log((1-t)*p/(t*(1-p)))));
mean(all$p2)
summary(all$p2)
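
Where this offset comes from: with $\operatorname{logit}(q) = \log\frac{q}{1-q}$, the correction above is

$$p_2 = \sigma\big(s - [\operatorname{logit}(p) - \operatorname{logit}(t)]\big), \qquad \sigma(u) = \frac{1}{1+e^{-u}},$$

since $\log\frac{(1-t)\,p}{t\,(1-p)} = \operatorname{logit}(p) - \operatorname{logit}(t)$. The shift is monotone in the score $s$, so ranking metrics are untouched; that is why the AUC and KS values for p2 computed next match those for p above.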
pred<-prediction(all$p2,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
#[1] 0.6992571
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
#[1] 0.3129876


# ensemble: simple average of the corrected probability and the other model's prediction
all$p3 <- (all$p2+all$PredScore)/2
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6996736
#[1] 0.6784338
#[1] 0.6973308
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3098282
#[1] 0.2729899
#[1] 0.3105527

# bin impressions (already sorted by Score) so that each bin contains clkinbin clicks
clkinbin <- round(totclk/100); clkinbin   # roughly 100 bins
clkinbin <- 5                             # overridden with a fixed value
grp <- 1
tmp <- rep(1, totimp)
cnt <- 0
for (i in 1:totimp){
#for (i in 1:100000){
  if (all$HasClick[i]==1){
    cnt=cnt+1;
  }
  if (cnt==clkinbin){
    grp=grp+1;
    cnt=0;
  }
  tmp[i]=grp
}
all$grp=tmp
table(all$grp)
#all[grp==108, grp:=107];
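
The row-by-row loop above is slow in R for large impression logs. A vectorized data.table equivalent (a sketch that relies on the same clkinbin and the existing Score sort order):

# the row carrying the k-th click falls in bin k %/% clkinbin + 1, matching the loop
all[, grp := cumsum(HasClick) %/% clkinbin + 1L]
table(all$grp)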

groups=all[,list(Impressions=.N, Clicks=sum(HasClick), CTR=sum(HasClick)/.N,
                 AvgScore1=mean(PredScore),
                 AvgScore2=mean(p),
                 AvgScore3=mean(p2),
                 AvgScore4=mean(p3)),        
by=grp][order(grp)];
#write.csv(groups, "junk.csv")
with(groups, cor(CTR, AvgScore1))
#[1] 0.8853131
with(groups, cor(CTR, AvgScore2))
#[1] 0.87801
with(groups, cor(CTR, AvgScore3))
#[1] 0.8791485
with(groups, cor(CTR, AvgScore4))
# [1] 0.8803879

data <- groups
ggpairs(data[, c("CTR", "AvgScore1", "AvgScore2", "AvgScore3", "AvgScore4"), with=F])

sum((groups$AvgScore1-groups$CTR)^2)/length(groups$AvgScore1)
#[1] 2.920109e-06
sum((groups$AvgScore2-groups$CTR)^2)/length(groups$AvgScore2)
#[1] 0.0001156954
sum((groups$AvgScore3-groups$CTR)^2)/length(groups$AvgScore3)
#[1] 0.0001410945
#[1] 0.0001240449
sum((groups$AvgScore4-groups$CTR)^2)/length(groups$AvgScore4)
#[1] 4.574389e-05
#[1] 4.09054e-05
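
The same per-bin mean squared error is computed four times above; a small helper (hypothetical, not part of the original) keeps it in one place:

mse <- function(est, obs) mean((est - obs)^2)   # mean squared calibration error per bin
mse(groups$AvgScore1, groups$CTR)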


data<-groups
ggplot(data, aes(x=grp, y=CTR)) +
  geom_line(color='black', linetype=1, size = 1)  +
  geom_line(aes(y=AvgScore1), color='red', linetype=1, size = 1)  +
#  geom_line(aes(y=AvgScore2), color='purple', linetype=1, size = 1)  +
  geom_line(aes(y=AvgScore3), color='darkgreen', linetype=1, size = 1)  +
  geom_line(aes(y=AvgScore4), color='darkblue', linetype=1, size = 1)  +
  scale_x_continuous(limits=c(1, 100)) +
  scale_y_continuous(limits=c(0, 0.005)) +
  ggtitle("Binning Avg of CTR, HModel and VWModel") +
  xlab("Bin") + ylab("CTR") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank())


# fit linear model
data<-groups[1:107]
with(data, cor(CTR, AvgScore1))
with(data, cor(CTR, AvgScore3))
m <- lm(CTR ~ AvgScore3, data = groups)
data$tmp <- 0.0005360169 + 0.0717973293*data$AvgScore3   # coefficients copied from coef(m)
sum((data$tmp-data$CTR)^2)/length(data$tmp)
# [1] 2.400827e-07
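
The hardcoded numbers above are just the printed coefficients of m; predict() gives the same fitted line without the copy/paste:

data$tmp <- predict(m, newdata = data)   # same as coef(m)[1] + coef(m)[2]*AvgScore3
sum((data$tmp-data$CTR)^2)/length(data$tmp)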

# fit tobit model
summary(m <- vglm(CTR ~ AvgScore3, tobit(Upper = 0.05), data = groups))
ctable <- coef(summary(m))
pvals <- 2 * pt(abs(ctable[, "z value"]), df.residual(m), lower.tail = FALSE)
cbind(ctable, pvals)
data$tmp <- 0.0005360131 + 0.0717974011*data$AvgScore3   # coefficients presumably copied from the tobit fit above
sum((data$tmp-data$CTR)^2)/length(data$tmp)


# fit loess smoother
m <- loess(CTR ~ AvgScore3, data = groups)
groups$tmp <- m$fitted
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
# [1] 1.765797e-07


# fit isotonic regression with the Iso package
install.packages("Iso")
require(Iso)
z <- ufit(groups$CTR, groups$AvgScore3, type='b')
z <- pava(groups$AvgScore3)   # note: this smooths the scores themselves, not CTR, so z is nearly unchanged
groups$tmp <- z
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
#0.0001410945   # identical to the uncalibrated AvgScore3 MSE above
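
If the intent was an isotonic fit of CTR on the score (rather than smoothing the scores themselves), base R's isoreg does that directly; a sketch that relies on groups already being ordered by increasing score:

ir <- isoreg(groups$AvgScore3, groups$CTR)   # monotone non-decreasing fit of CTR on the score
groups$tmp <- ir$yf                          # fitted values line up because groups is sorted
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)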

data <- groups[1:105]
ggplot(data, aes(x=AvgScore3, y=CTR)) +
  geom_point(color='blue', shape=3, size = 3)  +
  geom_line(aes(y=AvgScore3), color='black', linetype=3, size = 1)  +   # y = x reference line
  scale_x_continuous(limits=c(0, 0.0015)) +
  scale_y_continuous(limits=c(0, 0.0015)) +
  ggtitle("Distribution of CTR Vs LRScore") +
  xlab("LRScore") + ylab("CTR") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank())

groups$nonClicks <- groups$Impressions-groups$Clicks
zfit <- glm(cbind(Clicks, nonClicks) ~ AvgScore3, family = binomial, data=groups);
summary(zfit);

glm.out <- glm(cbind(Clicks, nonClicks) ~ AvgScore2, family=binomial(logit), data=groups)
plot(Clicks/Impressions ~ AvgScore2, data=groups)
lines(groups$AvgScore2, glm.out$fitted, type="l", col="red")
title(main="Grouped Data with Fitted Logistic Regression Line")
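
The intercept and slope plugged in below are not derived in this post; one plausible source for such numbers is a Platt-style logistic refit of clicks on the raw score (hypothetical sketch, not the author's shown method):

platt <- glm(HasClick ~ Score, family = binomial, data = all)   # hypothetical recalibration fit
coef(platt)   # intercept and slope for the logistic transform below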

all$p3 <- 1/(1+exp(-(7.16219*all$Score - 8.82040)))   # apply the fitted recalibration line on the log-odds scale
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409

Logistic Regression with Penalty Factor in R

The main goal of a penalty term is to minimize the sum of the unpenalized cost function plus the penalty. This can be understood as deliberately adding bias and preferring a simpler model, which reduces variance when there is not enough training data to fit the model reliably.
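
Concretely, for the binomial family used below, glmnet minimizes the elastic-net objective

$$\min_{\beta_0,\beta}\; -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i(\beta_0+x_i^\top\beta)-\log\big(1+e^{\beta_0+x_i^\top\beta}\big)\Big] + \lambda\Big[\tfrac{1-\alpha}{2}\lVert\beta\rVert_2^2+\alpha\lVert\beta\rVert_1\Big],$$

where λ sets the overall penalty strength and α ∈ [0, 1] mixes the two penalty types: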

- The ridge penalty shrinks the coefficients of correlated predictors towards each other (L2)
- The lasso tends to pick one of them and discard the others (L1)
- The elastic-net penalty mixes these two.

If predictors are correlated in groups, an α = 0.5 tends to select the groups in or out together. One use of α is for numerical stability; for example, the elastic net with α = 1 − ε for some small ε > 0 performs much like the lasso, but removes any degeneracies and wild behavior caused by extreme correlations.
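
For example, the mixing parameter is set via the alpha argument (a sketch; assumes glmnet is loaded and x, y are as in the example below, which uses the default α = 1, the lasso):

fit_en <- glmnet(x, y, family = "binomial", alpha = 0.5)   # even L1/L2 mix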

## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html#log
install.packages("glmnet", repos = "http://cran.us.r-project.org")
require(glmnet)

load("BinomialExample.RData")
fit = glmnet(x, y, family = "binomial")
summary(fit)
plot(fit, xvar = "dev", label = TRUE)

predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
plot(cvfit)
cvfit$lambda.min   # lambda that minimizes cross-validated misclassification error
cvfit$lambda.1se   # largest lambda within one standard error of the minimum
coef(cvfit, s = "lambda.min")

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")