Wednesday, July 10, 2013

Check User Retention and User Churn Using R


## Session setup -----------------------------------------------------------
# Record the session for reproducibility, then load the packages used below.
# NOTE(review): the original began with rm(list = ls(all = TRUE)) -- wiping
# the caller's global environment is a side effect a script should not have;
# restart R for a clean session instead.
sessionInfo()

library(data.table)  # fread(); library() errors on failure, require() does not
library(plyr)        # ddply() is called in the Active Customers section
library(ggplot2)     # plotting throughout
# NOTE(review): later sections also call knn (class), svm (e1071), rpart,
# NaiveBayes (klaR), neuralnet, randomForest, and AUC()/KS() -- confirm where
# those are meant to be loaded from.


#########################################################
## Active Customers
#########################################################
# Load hourly user-activity events; `ts` is epoch seconds, decoded in UTC.
user_act <- fread("~/Desktop/Jobs/Tumblr/user_act.csv", header = TRUE)
str(user_act)
user_act$datetime <- as.POSIXct(as.numeric(as.character(user_act$ts)),
                                origin = "1970-01-01", tz = "UTC")
user_act$date <- format(user_act$datetime, format = "%m/%d/%Y")
user_act$time <- format(user_act$datetime, format = "%H %p")
user_act$label <- paste(user_act$date, user_act$time, sep = "-")
dim(user_act)

# Active-user count per (date, hour-label). data.table aggregation replaces
# the original plyr::ddply call, which depended on a package the script never
# loaded; .N is the per-group row count, equivalent to length(user_id).
data <- user_act[, .(tot = .N), by = .(date, label)]
summary(data$tot)

# Horizontal bar chart of hourly active users, with the median as a reference.
ggplot(data, aes(y = tot, x = label, fill = date)) +
  geom_bar(stat = "identity") +
  ggtitle("Active Users by Time") +
  # map `tot` from the plotted data itself, not data$tot, so the text labels
  # stay aligned with their bars under any reordering
  geom_text(aes(label = tot), hjust = -.05, size = 4, fontface = "bold") +
  geom_hline(yintercept = median(data$tot), color = "blue", linetype = 2) +
  ylab("Hourly Active Users") +
  xlab("Time") +
  coord_flip() +
  theme(plot.title = element_text(face = "bold", size = 20),
        axis.text.x = element_text(face = "bold", size = 14),
        axis.text.y = element_text(face = "bold", size = 14),
        axis.title.x = element_text(face = "bold", size = 16),
        strip.text.x = element_text(face = "bold", size = 16),
        axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  guides(fill = FALSE)


#########################################################
## Registered Customers
#########################################################
# Load registrations; `regi_ts` is epoch seconds, decoded in UTC so the
# timestamps line up with the activity table above (the original used
# tz = "GMT", which is an alias of UTC -- unified for consistency).
user_regi <- fread("~/Desktop/Jobs/Tumblr/user_regi.csv", header = TRUE)
str(user_regi)
user_regi$datetime <- as.POSIXct(as.numeric(as.character(user_regi$regi_ts)),
                                 origin = "1970-01-01", tz = "UTC")
user_regi$date <- format(user_regi$datetime, format = "%m/%d/%Y")
user_regi$time <- format(user_regi$datetime, format = "%H:%M")
dim(user_regi)


Customer retention refers to the ability of a company or product to retain its customers over some specified period. High customer retention means that customers tend to return to the product or business, continue to buy from it, or in some other way refrain from defecting to a competitor or lapsing into non-use entirely.

# Day-N retention: for each gap jj in 0..30 days, compute for each of 30
# cohort dates the fraction of that day's distinct users who appear again
# after dt[ii + jj], and keep the best (max) ratio per gap.
# NOTE(review): the two queries read different tables (vdw.dev.lh_tmp vs
# dev.tmp) -- confirm that is intentional.
dt <- c(20130409:20130430, 20130501:20130531, 20130601:20130607)
rate <- numeric(31)                      # preallocated: one entry per gap 0..30
for (jj in 0:30) {
  cat("\n----------------- gap = ", jj, "--------------------\n")
  data <- numeric(30)                    # per-cohort retention ratios
  for (ii in seq_len(30)) {
    # distinct users active on cohort date dt[ii]
    cnt1 <- unlist(sqlQuery(con, paste0("select count(*) from
                                     (select distinct userhashkey from
                                     (select userhashkey from vdw.dev.lh_tmp where datekey = ", dt[ii], ") a) b")))

    # of those, users also seen strictly after dt[ii + jj]
    # NOTE(review): datekey comparison is on yyyymmdd integers and assumes
    # dt is a contiguous calendar, so dt[ii + jj] is dt[ii] shifted jj days;
    # ii + jj peaks at 60, exactly length(dt) -- confirm.
    cnt2 <- unlist(sqlQuery(con, paste0("select count(*) from
                                   (select distinct a.userhashkey from
                                   (select userhashkey from dev.tmp where datekey = ", dt[ii], ") a
                                   join (select userhashkey from dev.tmp where datekey > ", dt[ii + jj], ") b using(userhashkey)) a")))
    data[ii] <- cnt2 / cnt1
    cat(ii, cnt2 / cnt1, " ", cnt2, " ", cnt1, "\n")
  }
  rate[jj + 1] <- max(data)
  cat(jj, max(data))
}

# Day 0 is pinned to 1; sr pairs day d with rate[d] for d = 1..30.
# seq_along(rate) - 1 replaces the original `1:length(rate)-1`, which only
# worked because `:` binds tighter than `-`.
data <- data.frame(days = seq_along(rate) - 1, sr = c(1, rate[1:30]))
write.csv(data, "sr3.csv")


#########################################################
## User Retention Modeling
#########################################################
# `all` (defined elsewhere) holds user-level features plus a response column y.
# NOTE(review): the name shadows base::all(); renaming would be safer but the
# object is created outside this script, so the name is kept.
head(all)
names(all)

# Reproducible 70/30 train/test split.
n <- nrow(all)
ntrain <- round(n * 0.7)
set.seed(333)
tindex <- sample(n, ntrain)
all$y <- all$y == 1          # coerce the 0/1 response to logical
train <- all[tindex, ]
test <- all[-tindex, ]

# Sanity check: engagement-feature distributions should look similar across
# the full data, train, and test splits.
summary(cbind(all$pageviews1, all$follows1, all$likes1, all$reblogs1, all$original_posts1, all$searches1, all$unfollows1, all$received_engagments1))
summary(cbind(train$pageviews1, train$follows1, train$likes1, train$reblogs1, train$original_posts1, train$searches1, train$unfollows1, train$received_engagments1))
summary(cbind(test$pageviews1, test$follows1, test$likes1, test$reblogs1, test$original_posts1, test$searches1, test$unfollows1, test$received_engagments1))


# "y"
# "is_verified"         "pageviews1"           "follows1"             "likes1"            
# "reblogs1"             "original_posts1"  
# "searches1"            "unfollows1"           "received_engagments1"


#########################################################
## Modeling -- KNN: only for continuous values;
#########################################################
# The identical all-numeric feature frame is needed for train and test, so
# build it with one helper instead of two hand-duplicated data.frame() calls.
# $-access keeps this working whether the input is a data.frame or data.table.
numeric_features <- function(df) {
  data.frame(is_verified = as.numeric(df$is_verified),
             pageviews1 = as.numeric(df$pageviews1),
             follows1 = as.numeric(df$follows1),
             likes1 = as.numeric(df$likes1),
             reblogs1 = as.numeric(df$reblogs1),
             original_posts1 = as.numeric(df$original_posts1),
             searches1 = as.numeric(df$searches1),
             unfollows1 = as.numeric(df$unfollows1),
             received_engagments1 = as.numeric(df$received_engagments1),
             time1 = as.numeric(df$time1),
             regi_geo1 = as.numeric(df$regi_geo1))
}

train_x <- numeric_features(train)
train_y <- train$y
test_x <- numeric_features(test)

# 5-nearest-neighbour classification; knn() returns one predicted class per
# test row.
tmp <- knn(train_x, test_x, train_y, k = 5)
test$score <- factor(tmp, levels = c(FALSE, TRUE))
# calculate AUC
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and
# cumulative false positive rate.
KS(test$score, test$y)

# Visual check of two features against the true label; map `y` from the
# plotted data itself rather than reaching outside it with test$y.
ggplot(test, aes(pageviews1, follows1)) +
  geom_point(aes(color = y))


#########################################################
## Modeling --  Support Vector Machines
#########################################################
# Radial-kernel SVM. The fitted model is stored as `svm_fit` so the name of
# e1071::svm() is not rebound (the original `svm <- svm(...)` only kept
# working because R skips non-function bindings when resolving a call).
# NOTE(review): e1071::svm() has no `method` argument -- the documented one is
# `type = "C-classification"`. As written the argument falls into `...`, and
# with a logical y the fit presumably runs in regression mode; confirm intent
# before changing it, since a change would alter the reported AUC/KS.
## 0.6148956 0.2297911
svm_fit <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
               + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data = train,
               method = "C-classification", kernel = "radial", cost = 100, gamma = 1)
test$score <- predict(svm_fit, test)
AUC(test$score, test$y)
KS(test$score, test$y)

# Same model with a much weaker cost penalty (cost = 1).
## 0.621205 0.2424101
svm_fit <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
               + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data = train,
               method = "C-classification", kernel = "radial", cost = 1, gamma = 1)
test$score <- predict(svm_fit, test)
AUC(test$score, test$y)
KS(test$score, test$y)


#########################################################
## Modeling -- Classification Trees
#########################################################
# CART model via rpart; minsplit = 5 is forwarded to rpart.control() and
# allows splits on small nodes. (A commented-out tree::tree() variant of the
# same model was removed as dead code.)
ct <- rpart(as.factor(y) ~ is_verified + pageviews1 + follows1 + likes1
            + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1
            + time1 + regi_geo1,
            data = train, minsplit = 5)
summary(ct)
plot(ct)
text(ct)

# Column 2 of the class-probability matrix corresponds to the second factor
# level -- TRUE, since factor() on a logical orders FALSE before TRUE.
test$score <- predict(ct, newdata = test, type = "prob")[, 2]
# calculate AUC
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and
# cumulative false positive rate.
KS(test$score, test$y)


#########################################################
## Modeling -- Naive Bayes
#########################################################
# klaR::NaiveBayes needs a factor response; listing TRUE first makes it the
# first class, so posterior[, 1] below is the TRUE-class probability.
# NOTE(review): this converts train$y to a factor in place, which changes what
# as.integer(train$y) yields in the neural-network section if run afterwards.
train$y <- factor(train$y, levels = c(TRUE, FALSE))

# Explicit scoring frame restricted to the modeling columns.
newdata <- data.frame(y = test$y,
                      is_verified = test$is_verified,
                      pageviews1 = test$pageviews1,
                      follows1 = test$follows1,
                      likes1 = test$likes1,
                      reblogs1 = test$reblogs1,
                      original_posts1 = test$original_posts1,
                      searches1 = test$searches1,
                      unfollows1 = test$unfollows1,
                      received_engagments1 = test$received_engagments1,
                      time1 = test$time1,
                      regi_geo1 = test$regi_geo1)

# usekernel = TRUE requests kernel density estimates for the numeric features
# (the original spelled it `userkernel`, a typo for the documented argument);
# kernel = "gaussian" is forwarded on to stats::density(). A commented-out
# ROCR-based AUC/KS computation was removed; AUC()/KS() below cover it.
nb1 <- NaiveBayes(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                  + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data = train,
                  kernel = "gaussian", usekernel = TRUE)
summary(nb1)
pred <- predict(nb1, newdata = newdata)
table(pred$class, test$y)
test$score <- pred$posterior[, 1]
# calculate AUC
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and
# cumulative false positive rate.
KS(test$score, test$y)


#########################################################
## Modeling -- Logistic Regression
#########################################################
# Baseline model with a small feature set.
glm1 <- glm(y ~ is_verified + pageviews1 + follows1 + likes1,
            data = train, family = binomial)
summary(glm1)

# Full model with every engagement feature.
glm2 <- glm(y ~ is_verified + pageviews1 + follows1 + likes1
            + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1
            + regi_geo1 + time1, data = train, family = binomial)
summary(glm2)

# Score the held-out set. newdata is now named explicitly -- the original
# passed `test` positionally after `type`, relying on argument-matching order.
# (A commented-out ROCR-based AUC/KS computation was removed as dead code.)
test$score <- predict(glm2, newdata = test, type = "response")
# calculate AUC
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and
# cumulative false positive rate.
KS(test$score, test$y)

# Stepwise selection by AIC starting from the full model.
slm1 <- step(glm2)
summary(slm1)
slm1$anova


#########################################################
## Modeling -- Neural Networks
#########################################################
# neuralnet() needs an all-numeric design matrix, so coerce the non-numeric
# columns up front.
# NOTE(review): if the Naive Bayes section ran first, train$y is a factor with
# levels (TRUE, FALSE) and as.integer() yields 1/2 rather than 0/1 -- confirm
# section ordering before trusting these scores.
train$y <- as.integer(train$y)
train$y1 <- 1 - train$y      # complement of y; not referenced by the formula below
train$is_verified <- as.integer(train$is_verified)
train$time1 <- as.integer(train$time1)
train$regi_geo1 <- as.integer(train$regi_geo1)

# Single hidden layer with 4 units.
nn1 <- neuralnet(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                 + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1,
                 data = train, hidden = c(4))
plot(nn1)

# Test design matrix with the same columns and coercions as training.
test_x <- data.frame(is_verified = as.integer(test$is_verified),
                     pageviews1 = as.integer(test$pageviews1),
                     follows1 = as.integer(test$follows1),
                     likes1 = as.integer(test$likes1),
                     reblogs1 = as.integer(test$reblogs1),
                     original_posts1 = as.integer(test$original_posts1),
                     searches1 = as.integer(test$searches1),
                     unfollows1 = as.integer(test$unfollows1),
                     received_engagments1 = as.integer(test$received_engagments1),
                     time1 = as.integer(test$time1),
                     regi_geo1 = as.integer(test$regi_geo1))
prediction <- compute(nn1, test_x)
prediction <- prediction$net.result

# First output column is used as the score.
test$score <- prediction[, 1]
AUC(test$score, test$y)
KS(test$score, test$y)


#########################################################
## Modeling -- Random Forest
#########################################################
# 500-tree forest on the engagement features; mtry = 2 candidate variables
# per split.
rf <- randomForest(factor(y) ~ is_verified + pageviews1 + follows1 + likes1
                   + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1
                   + time1 + regi_geo1, data = train, ntree = 500, mtry = 2, importance = TRUE)
plot(rf)
pred <- predict(rf, newdata = test, type = "class")
prop.table(table(pred, test$y))

# Smaller forest for the variable-importance plot. The original response here
# was factor(gear) -- a leftover from an mtcars example; train has no `gear`
# column, so that call could not have run. Refit on y with the same predictors.
rf <- randomForest(factor(y) ~ is_verified + pageviews1 + follows1 + likes1
                   + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1
                   + time1 + regi_geo1, data = train, ntree = 100, importance = TRUE)
varImpPlot(rf, main = "Importance of Variables")

test$score <- predict(rf, newdata = test, type = "class")
table(test$score, test$y)
AUC(test$score, test$y)
KS(test$score, test$y)

No comments:

Post a Comment