My DS Coding Bolg

Wednesday, June 18, 2014

Logistic Regression Model Calibration in R

all <- fread("all.txt", head=F, sep='\t')
names=c("HasClick", "PredScore", "Score");
setnames(all, names);
setkey(all, Score);
all[HasClick==-1, HasClick:=0];
all[,list(Impressions=.N, CTR=sum(HasClick)/.N), by=HasClick][order(-CTR)];
summary(all$HasClick)

#PredScore
all[PredScore==-1, PredScore:=0];
summary(all$PredScor)
pred<-prediction(all$PredScore,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6567984
#[1] /homes/lingh/all.txt
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.244848

#p Score
all$Score <- all$Score
all$p <- 1/(1+exp(-all$Score))
summary(all$p)
pred<-prediction(all$p,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
#[1] 0.6992571
#[1] 0.7176237
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
#[1] 0.3129876
#[1] 0.3337304

# prior for bias correction
totimp <- nrow(all)
totclk <- sum(all$HasClick==1)
t <- totclk/totimp; t
p <-mean(all$p); p
all$p2 <- 1/(1+exp(-all$Score+log((1-t)*p/(t*(1-p)))));
mean(all$p2)
summary(all$p2)
pred<-prediction(all$p2,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
[1] 0.6992571
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
[1] 0.3129876

# ensemble
all$p3 <- (all$p2+all$PredScore)/2
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6996736
#[1] 0.6784338
[1] 0.6973308
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3098282
#[1] 0.2729899
[1] 0.3105527

clkinbin <- round(totclk/100); clkinbin
clkinbin <- 5
grp <- 1
tmp <- rep(1, totimp)
cnt <- 0
for (i in 1:totimp){
#for (i in 1:100000){
if (all$HasClick[i]==1){
cnt=cnt+1;
}
if (cnt==clkinbin){
grp=grp+1;
cnt=0;
}
tmp[i]=grp
}
all$grp=tmp
table(all$grp)
#all[grp==108, grp:=107];

groups=all[,list(Impressions=.N, Clicks=sum(HasClick), CTR=sum(HasClick)/.N,
AvgScore1=mean(PredScore),
AvgScore2=mean(p),
AvgScore3=mean(p2),
AvgScore4=mean(p3)),
by=grp][order(grp)];
#write.csv(groups, "junk.csv")
with(groups, cor(CTR, AvgScore1))
#[1] 0.8853131
with(groups, cor(CTR, AvgScore2))
#[1] 0.87801
with(groups, cor(CTR, AvgScore3))
#[1] 0.8791485
with(groups, cor(CTR, AvgScore4))
# [1] 0.8803879

data <- groups
ggpairs(data[, c("CTR", "AvgScore1", "AvgScore2", "AvgScore3", "AvgScore4"), with=F])

sum((groups$AvgScore1-groups$CTR)^2)/length(groups$AvgScore1)
#[1] 2.920109e-06
sum((groups$AvgScore2-groups$CTR)^2)/length(groups$AvgScore2)
#[1] 0.0001156954
sum((groups$AvgScore3-groups$CTR)^2)/length(groups$AvgScore3)
#[1] 0.0001410945
#[1] 0.0001240449
sum((groups$AvgScore4-groups$CTR)^2)/length(groups$AvgScore4)
#[1] 4.574389e-05
#[1] 4.09054e-05

data<-groups
ggplot(data, aes(y=data$CTR, x=data$grp)) +
geom_line(data=data, aes(y=data$CTR, x=data$grp), color='black', linetype=1, size = 1) +
geom_line(data=data, aes(y=data$AvgScore1, x=data$grp), color='red', linetype=1, size = 1) +
# geom_line(data=data, aes(y=data$AvgScore2, x=data$grp), color='purple', linetype=1, size = 1) +
geom_line(data=data, aes(y=data$AvgScore3, x=data$grp), color='darkgreen', linetype=1, size = 1) +
geom_line(data=data, aes(y=data$AvgScore4, x=data$grp), color='darkblue', linetype=1, size = 1) +
scale_x_continuous(limits=c(1, 100)) +
scale_y_continuous(limits=c(0, 0.005)) +
ggtitle("Binning Avg of CTR, HModel and VWModel") +
xlab("Bin") + ylab("CTR") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 16)) +
theme(axis.text.y = element_text(face = "bold", size = 16)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank())

# fit linear model
data<-groups[1:107]
with(data, cor(CTR, AvgScore1))
with(data, cor(CTR, AvgScore3))
m <- lm(CTR ~ AvgScore3, data = groups)
data$tmp <- 0.0005360169 + 0.0717973293*data$AvgScore3
sum((data$tmp-data$CTR)^2)/length(data$tmp)
# [1] 2.400827e-07

# fit tobit model
summary(m <- vglm(CTR ~ AvgScore3, tobit(Upper = 0.05), data = groups))
ctable <- coef(summary(m))
pvals <- 2 * pt(abs(ctable[, "z value"]), df.residual(m), lower.tail = FALSE)
cbind(ctable, pvals)
data$tmp <- 0.0005360131 + 0.0717974011*data$AvgScore3
sum((data$tmp-data$CTR)^2)/length(data$tmp)

# fit loess smoother
m <- loess(CTR ~ AvgScore3, data = groups)
groups$tmp <- m$fitted
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
# [1] 1.765797e-07

# fit Iso
install.packages("Iso")
require(Iso)
z = ufit(groups$CTR, groups$AvgScore3, type='b')
z = pava(groups$AvgScore3)
groups$tmp <- z
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
#0.0001410945

data <- groups[1:105]
ggplot(data, aes(y=data$CTR, x=data$AvgScore3)) +
geom_point(data=data, aes(y=data$CTR, x=data$AvgScore3), color='blue', shape=3, size = 3) +
geom_line(data=data, aes(y=data$AvgScore3, x=data$AvgScore3), color='black', linetype=3, size = 1) +
scale_x_continuous(limits=c(0, 0.0015)) +
scale_y_continuous(limits=c(0, 0.0015)) +
ggtitle("Distribution of CTR Vs LRScore") +
xlab("LRScore") + ylab("CTR") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 16)) +
theme(axis.text.y = element_text(face = "bold", size = 16)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank())

groups$nonClicks <- groups$Impressions-groups$Clicks
zfit <- glm(cbind(groups$Clicks, groups$nonClicks ) ~ + data1$AvgScore3 , family = binomial, data=groups);
summary(lrfit);

glm.out = glm(cbind(Clicks, nonclick) ~ AvgScore2, family=binomial(logit), data=data)
plot(Clicks/Impressions ~ AvgScore2, data=data)
lines(data$AvgScore2, glm.out$fitted, type="l", col="red")
title(main="Group Data with Fitted Logistic Regression Line")

all$p3<-1/(1+exp(-all$Score*7.16219+8.82040))
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409

Logistic Regression with Penalty Factor in R

The main goal of penalty is to minimize the sum of the unpenalized cost function plus the penalty term, which can be understood as adding bias and preferring a simpler model to reduce the variance in the absence of sufficient training data to fit the model.

- The ridge penalty shrinks the coefficients of correlated predictors towards each other (L2)
- The lasso tends to pick one of them and discard the others (L1)
- The elastic-net penalty mixes these two.

If predictors are correlated in groups, an

α

α

α

is for numerical stability; for example, the elastic net with

α = 1 - ϵ

for some small

ϵ > 0

performs much like the lasso, but removes any degeneracies and wild behavior caused by extreme correlations.

## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html#log
install.packages("glmnet", repos = "http://cran.us.r-project.org")
require(glmnet)

load("BinomialExample.RData")
fit = glmnet(x, y, family = "binomial")
summary(fit)
plot(fit, xvar = "dev", label = TRUE)

predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
plot(cvfit)
cvfit$lambda.min
cvfit$lambda.1se
coef(cvfit, s = "lambda.min")

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")

Thursday, March 27, 2014

Logistic Regression in R

Logistic regression relates a binary outcome such as purchase to predictors that may include continuous and factor variables, by modeling the variables' association with the probability of outcomes.

A logistic regression model, also known as a logit model, is a member of the generalized linear models family.

Coefficients in a logit model can be interpreted in terms of odds ratios, the degree to which they are associated with the increased or decrease likelihood of an outcome.

A statistically significant result does not always mean that the model is appropriate. It is important to explore data thoroughly and to construct models on the basis of careful consideration.

#########################################################
Fit with ratio and weights
#########################################################
data$network=as.factor(data$network)
data$Type=as.factor(data$Type)
model <- glm(Avg...Viewed ~Type, weights = Total.Hrs.Viewed, family=binomial, data=data)
summary(model)

## odds ratios only
exp(coef(model))
## CIs using profiled log-likelihood
confint(model)
## CIs using standard errors
confint.default(model)

wald.test(b = coef(model), Sigma = vcov(model), Terms = 2)

#########################################################
Fit with logical vertor of a two-level factor
#########################################################
hive -e 'SELECT freq1d, count(*), count(distinct user_id), sum(click), sum(action) from tmp5 group by freq1d order by freq1d;' > /homes/result1d.csv;

data <- read.table("result1d.csv", head =F);

names(data) <- c("freq", "imp", "uuser", "click", "action");

data$CTR <- data$click/data$imp;

data$CVR <- data$action/data$imp;

data$freq[data$action==6]

data1 <- data[1:49,];

data1$nonaction <- data1$imp-data1$action;

data1$freq1d <- as.factor(data1$freq);

lrfit <- glm(cbind(data1$action, data1$nonaction) ~ + data1$freq1d , family = binomial, data=data1);

summary(lrfit);

#########################################################

Fit downsampling data

#########################################################

train <- fread("train.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

test <- fread("test.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

setnames(train, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

setnames(test, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

train1 <- (train[ ,6:13, with=F]);

test1 <- (test[,6:13, with=F]);

train1$action <- factor(train1$action);

test1$action <- factor(test1$action);

# simple model;

lrfit <- glm(train1$action ~ ., family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values);

auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

# top 3 variables

#works by sorting coefficient terms in equation and selecting top 3 in sort for each loan scored

g<-predict(lrfit,type='terms',test1)

ftopk<- function(x,top=3){

res=names(x)[order(x, decreasing = TRUE)][1:top]

paste(res,collapse=";",sep="")

}

topk=apply(g,1,ftopk,top=3)

test1<-cbind(test1, topk)

# add rect interaction;

lrfit <- glm(train1$action ~ freq1d+freq3d+freq5d+freq1w+freq2w+freq3w+rect+freq1d:rect+freq3d:rect+freq5d:rect+freq1w:rect+freq2w:rect+freq3w:rect, family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values); auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

Tuesday, March 11, 2014

Basic Markov model in R

library("msm")
result4$user_cumcnt <- as.integer(result4$user_cumcnt)
result4$user_cnt <- as.integer(result4$user_cnt)
basic <- data.frame(result4[result4$user_cnt<=60, c(1,5,15,18)])

raw <- statetable.msm(category, userid, data = basic)
write.csv(raw, file= "raw_state.csv")

tmp <- melt(raw)
tmp$to <- as.integer(tmp$to)

tmp2<-cbind(1:23,levels(basic$category))
write.csv(tmp, "data.csv")
write.csv(tmp2, "junk.csv")

#add censor state;
tmp1 <- ddply(basic, .(userid), function(x) data.frame(user_cumcnt = max(x$user_cumcnt)+1))
tmp2 <- unique(basic[,c(1,4)])
tmp3 <- merge(tmp1,tmp2, by = "userid")
tmp4 <- data.frame(tmp3, rep("End", length(tmp1[,1])))
names(tmp4)[4] <- "category"
tmp5 <- data.frame(rbind(basic, tmp4))
basic <- tmp5[order(tmp5$userid, tmp5$user_cumcnt), ]

raw <- ifelse(raw>0, .09, 0)
diag(raw) <- .8
raw <- cbind(raw, rep(0,23))
raw <- rbind(raw, rep(0,24))
dimnames(raw)[[1]][24] <- "24"
dimnames(raw)[[2]][24] <- "End"
twoway4.q <- raw
twoway4.q <- matrix(rep(1, 24*24), nrow=24)
diag(twoway4.q) <- 0
twoway4.q[,24] <- 1
twoway4.q[24,] <- 0

basic$category <- as.integer(basic$category)

Sys.time()
visits.msm <- msm(category ~ user_cumcnt, subject = userid, data = basic, qmatrix = twoway4.q, death = 24)
Sys.time()

Thursday, March 6, 2014

data.table in R

require(plyr)
require(data.table)

# Making a data.table from vectors
dt.data <- data.table(a=1:5,b=1:5)

rm(list=ls(all=T))
setwd("DSP data");
data <- fread("out1000.csv", colClasses=c("character", "integer", "integer", "integer", "integer") );
setnames(data, c("user_id", "date", "hour", "click", "action"));

# regular expression
regexp <- "([[:digit:]]+)"
month <- as.integer(str_extract(data$date, regexp))

# automatically sort by key
setkey(data, user_id, date, hour)

# add cols
data[,id:=1:length(data$user_id)];
data[,bcook:=as.factor(data$user_id)];
hflights_dt <- data.table(hflights)
hflights_dt[, DistanceKMs := Distance / 0.62137]
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
hflights_dt[, c('DistanceKMs', 'DistanceFeets') := list(Distance/0.62137, Distance * 5280)];

# filter rows
users <- unique(data$user_id[data$date %in% c(16103, 16104, 16105)]);
rows = sample(1:n,n*.0001);
results[,rowid:=1:n];
samples=results[results$rowid %in% rows];
test <- data[user_id == users[1],];

# filter cols
test <- data[,c(2,3,4,5,6), with=FALSE]; #filter cols
train3 <- train3[,ScoreLevel:=NULL] #drop cols

# Basic Stat
system.time(
stats <- ddply(data, ~ date,
                 summarize, imp=length(imp), click=sum(click), action=sum(action))
)
system.time(
stats <- data[,list(imps=.N,
                    clicks=sum(click),
                    actions=sum(action)), by='date'];
)

#Drop outliers
data2 <- read.csv(file="data_extracted_filtered.txt",head=TRUE,sep=",",row.names=NULL,as.is=TRUE);
for (i in c(2:7)){
print('----------------------------------------------------------');
print(i);
data2 <- data2[data2[,i]<=quantile[i-1],]
}
quantile = rep(0, 6);
for (i in c(2:7)){
print('----------------------------------------------------------');
print(i);
quantile[i-1] <- max(data2[,i], .95)
}

# Apply same function to all cols
func1 <- function(x){
return(log(x+1)/log(max(x)))
}
data3 <- data.table(data2)
data3 <- data3[,id:=NULL]
data3[, 1:6 := lapply(.SD, func1), .SDcols = 1:6]

# Basic Merge
keycols= c("start_dt","variable")
setkeyv(top10thisyearlong, keycols)
setkeyv(top10lastyearlong, keycols)
top10all = merge(top10thisyearlong, top10lastyearlong, by=c("start_dt", "variable"))

# compute freq1, and add a new column with name of 'freq1' to the original table
users <- unique(data$user_id[data$date %in% c(16103, 16104, 16105)]);
setkey(data,bcook);
f_freq <- function(dd) {
setkey(dd,bcook);
user.merge <- merge(data, dd, by=c('bcook'));
user.merge <- user.merge[id.x>id.y, ];
user.merge$gap <- (user.merge$date.x - user.merge$date.y)*24 + (user.merge$hour.x - user.merge$hour.y);
freq<- user.merge[, list(freq1d=sum(gap<24)), by=id.x];
return(c(0, freq$freq1d));
}

sink("myfile", append=FALSE, split=FALSE);
train1[HasClick==-1, HasClick:=0];
idx <- names(train1)
for (i in c(1:36)){
print('----------------------------------------------------------');
print(i);
print(train1[,list(Impressions=.N, Clicks=sum(HasClick), CTR=sum(HasClick)/.N), by=eval(idx[i])][order(-CTR)]);
}
# return output to the terminal
sink()

## Merge multiple data frames
list.of.data.frames = list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
merged.data.frame = Reduce(function(...) merge(..., all=T), list.of.data.frames)
tail(merged.data.frame)

Friday, February 28, 2014

Show Images in R

setwd("~/Desktop/Trend Reports/TrendAnalytics Reports/TrendPulse/10_16_2016/")
data=fread("keyword socialstream.csv",header = FALSE)
names(data) = c('seq','keywords', 'brands', 'categories')

# Outerwear
list = c(
10,
11,
27,
29,
30,
31,
32,
33,
34,
35
)
pdf("TopKeyword.pdf", width=15, height=8.5)
par(mfrow=c(2,3))
j=1
for(i in list){
img <- readPNG(paste(i, ".PNG", sep=""))

plot.new()
plot.window(xlim=c(0,1), ylim=c(0,1), asp=1)
rasterImage(img,0,0,1,1)
label=paste(j, data$categories[data$seq==i],"\n",
data$brands[data$seq==i], "\n",
data$keyword[data$seq==i])
title(main=label, font.main=2, line=0)

print(paste("img", j, ".jpg", sep=""))
j=j+1
}
dev.off()

Saturday, October 26, 2013

Visual of Tree by D3.js