My DS Coding Bolg: Logistic Regression in R

Logistic regression relates a binary outcome such as purchase to predictors that may include continuous and factor variables, by modeling the variables' association with the probability of outcomes.

A logistic regression model, also known as a logit model, is a member of the generalized linear models family.

Coefficients in a logit model can be interpreted in terms of odds ratios, the degree to which they are associated with the increased or decrease likelihood of an outcome.

A statistically significant result does not always mean that the model is appropriate. It is important to explore data thoroughly and to construct models on the basis of careful consideration.

#########################################################
Fit with ratio and weights
#########################################################
data$network=as.factor(data$network)
data$Type=as.factor(data$Type)
model <- glm(Avg...Viewed ~Type, weights = Total.Hrs.Viewed, family=binomial, data=data)
summary(model)

## odds ratios only
exp(coef(model))
## CIs using profiled log-likelihood
confint(model)
## CIs using standard errors
confint.default(model)

wald.test(b = coef(model), Sigma = vcov(model), Terms = 2)

#########################################################
Fit with logical vertor of a two-level factor
#########################################################
hive -e 'SELECT freq1d, count(*), count(distinct user_id), sum(click), sum(action) from tmp5 group by freq1d order by freq1d;' > /homes/result1d.csv;

data <- read.table("result1d.csv", head =F);

names(data) <- c("freq", "imp", "uuser", "click", "action");

data$CTR <- data$click/data$imp;

data$CVR <- data$action/data$imp;

data$freq[data$action==6]

data1 <- data[1:49,];

data1$nonaction <- data1$imp-data1$action;

data1$freq1d <- as.factor(data1$freq);

lrfit <- glm(cbind(data1$action, data1$nonaction) ~ + data1$freq1d , family = binomial, data=data1);

summary(lrfit);

#########################################################

Fit downsampling data

#########################################################

train <- fread("train.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

test <- fread("test.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

setnames(train, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

setnames(test, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

train1 <- (train[ ,6:13, with=F]);

test1 <- (test[,6:13, with=F]);

train1$action <- factor(train1$action);

test1$action <- factor(test1$action);

# simple model;

lrfit <- glm(train1$action ~ ., family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values);

auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

# top 3 variables

#works by sorting coefficient terms in equation and selecting top 3 in sort for each loan scored

g<-predict(lrfit,type='terms',test1)

ftopk<- function(x,top=3){

res=names(x)[order(x, decreasing = TRUE)][1:top]

paste(res,collapse=";",sep="")

}

topk=apply(g,1,ftopk,top=3)

test1<-cbind(test1, topk)

# add rect interaction;

lrfit <- glm(train1$action ~ freq1d+freq3d+freq5d+freq1w+freq2w+freq3w+rect+freq1d:rect+freq3d:rect+freq5d:rect+freq1w:rect+freq2w:rect+freq3w:rect, family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values); auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

My DS Coding Bolg

Thursday, March 27, 2014

Logistic Regression in R

No comments:

Post a Comment

Blog Archive