My DS Coding Bolg: March 2014

Thursday, March 27, 2014

Logistic Regression in R

Logistic regression relates a binary outcome such as purchase to predictors that may include continuous and factor variables, by modeling the variables' association with the probability of outcomes.

A logistic regression model, also known as a logit model, is a member of the generalized linear models family.

Coefficients in a logit model can be interpreted in terms of odds ratios, the degree to which they are associated with the increased or decrease likelihood of an outcome.

A statistically significant result does not always mean that the model is appropriate. It is important to explore data thoroughly and to construct models on the basis of careful consideration.

#########################################################
Fit with ratio and weights
#########################################################
data$network=as.factor(data$network)
data$Type=as.factor(data$Type)
model <- glm(Avg...Viewed ~Type, weights = Total.Hrs.Viewed, family=binomial, data=data)
summary(model)

## odds ratios only
exp(coef(model))
## CIs using profiled log-likelihood
confint(model)
## CIs using standard errors
confint.default(model)

wald.test(b = coef(model), Sigma = vcov(model), Terms = 2)

#########################################################
Fit with logical vertor of a two-level factor
#########################################################
hive -e 'SELECT freq1d, count(*), count(distinct user_id), sum(click), sum(action) from tmp5 group by freq1d order by freq1d;' > /homes/result1d.csv;

data <- read.table("result1d.csv", head =F);

names(data) <- c("freq", "imp", "uuser", "click", "action");

data$CTR <- data$click/data$imp;

data$CVR <- data$action/data$imp;

data$freq[data$action==6]

data1 <- data[1:49,];

data1$nonaction <- data1$imp-data1$action;

data1$freq1d <- as.factor(data1$freq);

lrfit <- glm(cbind(data1$action, data1$nonaction) ~ + data1$freq1d , family = binomial, data=data1);

summary(lrfit);

#########################################################

Fit downsampling data

#########################################################

train <- fread("train.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

test <- fread("test.csv", head=F, sep=',', colClasses=c("character", "character", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer", "integer"));

setnames(train, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

setnames(test, c("rowid", "userid", "impdate", "imphour", "click", "action", "freq1d", "freq3d", "freq5d", "freq1w", "freq2w", "freq3w", "rect"));

train1 <- (train[ ,6:13, with=F]);

test1 <- (test[,6:13, with=F]);

train1$action <- factor(train1$action);

test1$action <- factor(test1$action);

# simple model;

lrfit <- glm(train1$action ~ ., family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values);

auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

# top 3 variables

#works by sorting coefficient terms in equation and selecting top 3 in sort for each loan scored

g<-predict(lrfit,type='terms',test1)

ftopk<- function(x,top=3){

res=names(x)[order(x, decreasing = TRUE)][1:top]

paste(res,collapse=";",sep="")

}

topk=apply(g,1,ftopk,top=3)

test1<-cbind(test1, topk)

# add rect interaction;

lrfit <- glm(train1$action ~ freq1d+freq3d+freq5d+freq1w+freq2w+freq3w+rect+freq1d:rect+freq3d:rect+freq5d:rect+freq1w:rect+freq2w:rect+freq3w:rect, family = binomial, data=train1);

summary(lrfit)

test1$score<-predict(lrfit,type='response',test1)

#calculate auc;

pred<-prediction(test1$score,test1$action);

auc.tmp <- performance(pred,"auc");

auc <- as.numeric(auc.tmp@y.values); auc;

perf <- performance(pred,"tpr","fpr");

plot(perf);

# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

Tuesday, March 11, 2014

Basic Markov model in R

library("msm")
result4$user_cumcnt <- as.integer(result4$user_cumcnt)
result4$user_cnt <- as.integer(result4$user_cnt)
basic <- data.frame(result4[result4$user_cnt<=60, c(1,5,15,18)])

raw <- statetable.msm(category, userid, data = basic)
write.csv(raw, file= "raw_state.csv")

tmp <- melt(raw)
tmp$to <- as.integer(tmp$to)

tmp2<-cbind(1:23,levels(basic$category))
write.csv(tmp, "data.csv")
write.csv(tmp2, "junk.csv")

#add censor state;
tmp1 <- ddply(basic, .(userid), function(x) data.frame(user_cumcnt = max(x$user_cumcnt)+1))
tmp2 <- unique(basic[,c(1,4)])
tmp3 <- merge(tmp1,tmp2, by = "userid")
tmp4 <- data.frame(tmp3, rep("End", length(tmp1[,1])))
names(tmp4)[4] <- "category"
tmp5 <- data.frame(rbind(basic, tmp4))
basic <- tmp5[order(tmp5$userid, tmp5$user_cumcnt), ]

raw <- ifelse(raw>0, .09, 0)
diag(raw) <- .8
raw <- cbind(raw, rep(0,23))
raw <- rbind(raw, rep(0,24))
dimnames(raw)[[1]][24] <- "24"
dimnames(raw)[[2]][24] <- "End"
twoway4.q <- raw
twoway4.q <- matrix(rep(1, 24*24), nrow=24)
diag(twoway4.q) <- 0
twoway4.q[,24] <- 1
twoway4.q[24,] <- 0

basic$category <- as.integer(basic$category)

Sys.time()
visits.msm <- msm(category ~ user_cumcnt, subject = userid, data = basic, qmatrix = twoway4.q, death = 24)
Sys.time()

Thursday, March 6, 2014

data.table in R

require(plyr)
require(data.table)

# Making a data.table from vectors
dt.data <- data.table(a=1:5,b=1:5)

rm(list=ls(all=T))
setwd("DSP data");
data <- fread("out1000.csv", colClasses=c("character", "integer", "integer", "integer", "integer") );
setnames(data, c("user_id", "date", "hour", "click", "action"));

# regular expression
regexp <- "([[:digit:]]+)"
month <- as.integer(str_extract(data$date, regexp))

# automatically sort by key
setkey(data, user_id, date, hour)

# add cols
data[,id:=1:length(data$user_id)];
data[,bcook:=as.factor(data$user_id)];
hflights_dt <- data.table(hflights)
hflights_dt[, DistanceKMs := Distance / 0.62137]
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
hflights_dt[, c('DistanceKMs', 'DistanceFeets') := list(Distance/0.62137, Distance * 5280)];

# filter rows
users <- unique(data$user_id[data$date %in% c(16103, 16104, 16105)]);
rows = sample(1:n,n*.0001);
results[,rowid:=1:n];
samples=results[results$rowid %in% rows];
test <- data[user_id == users[1],];

# filter cols
test <- data[,c(2,3,4,5,6), with=FALSE]; #filter cols
train3 <- train3[,ScoreLevel:=NULL] #drop cols

# Basic Stat
system.time(
stats <- ddply(data, ~ date,
                 summarize, imp=length(imp), click=sum(click), action=sum(action))
)
system.time(
stats <- data[,list(imps=.N,
                    clicks=sum(click),
                    actions=sum(action)), by='date'];
)

#Drop outliers
data2 <- read.csv(file="data_extracted_filtered.txt",head=TRUE,sep=",",row.names=NULL,as.is=TRUE);
for (i in c(2:7)){
print('----------------------------------------------------------');
print(i);
data2 <- data2[data2[,i]<=quantile[i-1],]
}
quantile = rep(0, 6);
for (i in c(2:7)){
print('----------------------------------------------------------');
print(i);
quantile[i-1] <- max(data2[,i], .95)
}

# Apply same function to all cols
func1 <- function(x){
return(log(x+1)/log(max(x)))
}
data3 <- data.table(data2)
data3 <- data3[,id:=NULL]
data3[, 1:6 := lapply(.SD, func1), .SDcols = 1:6]

# Basic Merge
keycols= c("start_dt","variable")
setkeyv(top10thisyearlong, keycols)
setkeyv(top10lastyearlong, keycols)
top10all = merge(top10thisyearlong, top10lastyearlong, by=c("start_dt", "variable"))

# compute freq1, and add a new column with name of 'freq1' to the original table
users <- unique(data$user_id[data$date %in% c(16103, 16104, 16105)]);
setkey(data,bcook);
f_freq <- function(dd) {
setkey(dd,bcook);
user.merge <- merge(data, dd, by=c('bcook'));
user.merge <- user.merge[id.x>id.y, ];
user.merge$gap <- (user.merge$date.x - user.merge$date.y)*24 + (user.merge$hour.x - user.merge$hour.y);
freq<- user.merge[, list(freq1d=sum(gap<24)), by=id.x];
return(c(0, freq$freq1d));
}

sink("myfile", append=FALSE, split=FALSE);
train1[HasClick==-1, HasClick:=0];
idx <- names(train1)
for (i in c(1:36)){
print('----------------------------------------------------------');
print(i);
print(train1[,list(Impressions=.N, Clicks=sum(HasClick), CTR=sum(HasClick)/.N), by=eval(idx[i])][order(-CTR)]);
}
# return output to the terminal
sink()

## Merge multiple data frames
list.of.data.frames = list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
merged.data.frame = Reduce(function(...) merge(..., all=T), list.of.data.frames)
tail(merged.data.frame)