Thursday, March 27, 2014

Logistic Regression in R

Logistic regression relates a binary outcome such as purchase to predictors that may include continuous and factor variables, by modeling the variables' association with the probability of outcomes. 

A logistic regression model, also known as a logit model, is a member of the generalized linear models family. 

Coefficients in a logit model can be interpreted in terms of odds ratios, the degree to which they are associated with the increased or decrease likelihood of an outcome. 


A statistically significant result does not always mean that the model is appropriate. It is important to explore data thoroughly and to construct models on the basis of careful consideration. 

#########################################################
Fit with ratio and weights
#########################################################
data$network=as.factor(data$network)
data$Type=as.factor(data$Type)
model <- glm(Avg...Viewed ~Type, weights = Total.Hrs.Viewed, family=binomial, data=data)
summary(model)

## odds ratios only
exp(coef(model))
## CIs using profiled log-likelihood
confint(model)
## CIs using standard errors
confint.default(model)

wald.test(b = coef(model), Sigma = vcov(model), Terms = 2)


#########################################################
Fit with logical vertor of a two-level factor
#########################################################
hive -e 'SELECT freq1d, count(*), count(distinct user_id), sum(click), sum(action) from tmp5 group by freq1d order by freq1d;' > /homes/result1d.csv;

data <- read.table("result1d.csv", head =F);
names(data) <- c("freq", "imp", "uuser", "click", "action");
data$CTR <- data$click/data$imp;
data$CVR <- data$action/data$imp;
data$freq[data$action==6]

data1 <- data[1:49,];
data1$nonaction <- data1$imp-data1$action;
data1$freq1d <- as.factor(data1$freq);
lrfit <- glm(cbind(data1$action, data1$nonaction) ~ + data1$freq1d , family = binomial, data=data1);
summary(lrfit);


#########################################################
Fit downsampling data
#########################################################
train <- fread("train.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));
test <- fread("test.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));
setnames(train, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));
setnames(test, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

train1 <- (train[ ,6:13, with=F]);
test1 <- (test[,6:13, with=F]);
train1$action <- factor(train1$action);
test1$action <- factor(test1$action);

# simple model;
lrfit <- glm(train1$action ~ ., family = binomial, data=train1);
summary(lrfit)
test1$score<-predict(lrfit,type='response',test1)
#calculate auc;
pred<-prediction(test1$score,test1$action);
auc.tmp <- performance(pred,"auc"); 
auc <- as.numeric(auc.tmp@y.values);
auc;
perf <- performance(pred,"tpr","fpr");
plot(perf);
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

# top 3 variables
#works by sorting coefficient terms in equation and selecting top 3 in sort for each loan scored
g<-predict(lrfit,type='terms',test1)
ftopk<- function(x,top=3){
  res=names(x)[order(x, decreasing = TRUE)][1:top]
  paste(res,collapse=";",sep="")
}
topk=apply(g,1,ftopk,top=3)
test1<-cbind(test1, topk)

# add rect interaction;
lrfit <- glm(train1$action ~ freq1d+freq3d+freq5d+freq1w+freq2w+freq3w+rect+freq1d:rect+freq3d:rect+freq5d:rect+freq1w:rect+freq2w:rect+freq3w:rect, family = binomial, data=train1);
summary(lrfit)
test1$score<-predict(lrfit,type='response',test1)
#calculate auc;
pred<-prediction(test1$score,test1$action);
auc.tmp <- performance(pred,"auc"); 
auc <- as.numeric(auc.tmp@y.values); auc;
perf <- performance(pred,"tpr","fpr");
plot(perf);
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

No comments:

Post a Comment