Thursday, December 13, 2012

SVM2 - Using LIBLINEAR SVM in R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

require(RODBC)
require(ggplot2)
require(plyr)
require(e1071)
require(LiblineaR)

data(iris)
attach(iris)
x=iris[,1:4]
y=factor(iris[,5])
train=sample(1:dim(iris)[1],100)
xTrain=x[train,]
xTest=x[-train,]
yTrain=y[train]
yTest=y[-train]

# Center and scale data
s=scale(xTrain,center=TRUE,scale=TRUE)
# Sparse Logistic Regression
t=6
# Tune the cost parameter of a logistic regression according to Joachims' heuristic
co=heuristicC(s)
m=LiblineaR(data=s,labels=yTrain,type=t,cost=co,bias=TRUE,verbose=FALSE)
# Scale the test data
s2=scale(xTest,attr(s,"scaled:center"),attr(s,"scaled:scale"))
# Make prediction
p=predict(m,s2)
# Display confusion matrix
res=table(p$predictions,yTest)
print(res)

# Compute Balanced Classification Rate
BCR=mean(c(res[1,1]/sum(res[,1]),res[2,2]/sum(res[,2]),res[3,3]/sum(res[,3])))
print(BCR)

# Logistic Regression
t=0
# Find the best model with the best cost parameter via 10-fold cross-validations
# 0 – L2-regularized logistic regression
# 1 – L2-regularized L2-loss support vector classification (dual)
# 2 – L2-regularized L2-loss support vector classification (primal)
# 3 – L2-regularized L1-loss support vector classification (dual)
# 4 – multi-class support vector classification by Crammer and Singer
# 5 – L1-regularized L2-loss support vector classification
# 6 – L1-regularized logistic regression
# 7 – L2-regularized logistic regression (dual)
tryTypes=c(0:7)
tryCosts=c(1000,100,10,1,0.1,0.01,0.001)
bestCost=NA
bestAcc=0
bestType=NA
for(ty in tryTypes){
  for(co in tryCosts){
    acc=LiblineaR(data=s,labels=yTrain,type=ty,cost=co,bias=TRUE,cross=10,verbose=FALSE)
    cat("Results for C=",co," : ",acc," accuracy.\n",sep="")
    if(acc>bestAcc){
      bestCost=co
      bestAcc=acc
      bestType=ty
    }
  }
}
cat("Best model type is:",bestType,"\n")
cat("Best cost is:",bestCost,"\n")
cat("Best accuracy is:",bestAcc,"\n")

# Re-train best model with best cost value.
m=LiblineaR(data=s,labels=yTrain,type=bestType,cost=bestCost,bias=TRUE,verbose=FALSE)

# Scale the test data
s2=scale(xTest,attr(s,"scaled:center"),attr(s,"scaled:scale"))
# Make prediction
pr=FALSE
if(bestType==0 | bestType==6 | bestType==7) pr=TRUE # probability estimates are only available for the logistic regression types (0, 6, 7)
p=predict(m,s2,proba=pr,decisionValues=TRUE)
# Display confusion matrix
res=table(p$predictions,yTest)
print(res)
# Compute Balanced Classification Rate
BCR=mean(c(res[1,1]/sum(res[,1]),res[2,2]/sum(res[,2]),res[3,3]/sum(res[,3])))
print(BCR)

# Loop over candidate cost values (template; assumes the class labels are in column 1 of x);
cost <- c(100, 10, 1, .1, .01);
ty <- 3
kk <- 1
s <- scale(x[,-1],center=TRUE,scale=TRUE)
# Type 3: L2-regularized L1-loss support vector classification (dual)
fit <- LiblineaR(data=s,labels=x[,1],type=ty,cost=cost[kk],bias=TRUE,verbose=FALSE)
pred <- predict(fit, s)
result <- table(pred$predictions, x[,1]);
cat("\n");
cat(kk, cost[kk], result[1:4], "\n");
err <- sum(result[2:3]);
co <- cost[kk];

for(kk in 2 : length(cost)){
  fit <- LiblineaR(data=s,labels=x[,1],type=ty,cost=cost[kk],bias=TRUE,verbose=FALSE)
  pred <- predict(fit, s)
  result <- table(pred$predictions, x[,1]);
  cat(kk, cost[kk], result[1:4], "\n")
  if (sum(result[2:3]) <= err){
    co <- cost[kk]
    err <- sum(result[2:3])
  }
}

# Try binary
iris.part = subset(iris, Species != 'setosa')
iris.part$Species = factor(iris.part$Species)
iris.part = iris.part[, c(1,2,5)]
head(iris.part)

xTrain = iris.part[, 1:2]
yTrain = iris.part[, 3]
s=scale(xTrain,center=TRUE,scale=TRUE)
# L2-regularized L1-loss support vector classification (dual)
t=3
# Tune the cost parameter according to Joachims' heuristic
co=heuristicC(s)
m=LiblineaR(data=s,labels=yTrain,type=t,cost=co,bias=TRUE,verbose=FALSE)
# Re-scale the (training) data with the stored center and scale; no separate test set is used here
s2=scale(xTrain,attr(s,"scaled:center"),attr(s,"scaled:scale"))
# Make prediction
p=predict(m,s2, decisionValues = T)
# Display confusion matrix
res=table(p$predictions,yTrain)
print(res)

# Calculate parameters manually;
w <- m$W
iris.scaled = as.matrix(data.frame(s2, rep(1, length(s2[,1]))))
rho <-iris.scaled %*% t(w)
# should equal
rho - p$decisionValues[,1]

Thursday, November 29, 2012

SVM 1 - Using LIBSVM in R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

require(RODBC)
require(ggplot2)
require(plyr)
require(e1071)

# http://planatscher.net/svmtut/svmtut.html
# 1) Task: linear separable data
data1 <- seq(1,10,by=2)
classes1 <- c('a','a','a','b','b')
test1 <- seq(1,10,by=2) + 1
model1 <- svm(data1,classes1,type='C',kernel='linear')
print(model1)
summary(model1)
predict(model1,data1)
pred <- fitted(model1)
table(pred, classes1)
predict(model1,test1)

#2) Task: not-linear-separable data
data2 <- seq(1,10)
classes2 <- c('b','b','b','a','a','a','a','b','b','b')
model2 <- svm(data2,classes2,type='C',kernel='linear')
predict(model2,data2)
table(predict(model2,data2), classes2)

#3) Task: Example from the manual (iris-data)
attach(iris)
model <- svm(Species ~ ., data = iris)
x <- subset(iris, select = -Species)
y <- Species
model <- svm(x, y)
print(model)
summary(model)
pred <- predict(model, x)
pred <- fitted(model)
table(pred, y)

pred <- predict(model, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
plot(cmdscale(dist(iris[,-5])),col = as.integer(iris[,5]),pch = c("o","+")[1:150 %in% model$index + 1])

i2 <- iris
levels(i2$Species)[3] <- "versicolor"
summary(i2$Species)
wts <- 100 / table(i2$Species)
wts
m <- svm(Species ~ ., data = i2, class.weights = wts)
detach(iris)

x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)
m <- svm(x, y)
new <- predict(m, x)

plot(x, y)
points(x, log(x), col = 2)
points(x, new, col = 4)
X <- data.frame(a = rnorm(1000), b = rnorm(1000))
attach(X)

m <- svm(X, gamma = 0.1)
m <- svm(~., data = X, gamma = 0.1)
newdata <- data.frame(a = c(0, 4), b = c(0, 4))
predict (m, newdata)

plot(X, col = 1:1000 %in% m$index + 1, xlim = c(-5,5), ylim=c(-5,5))
points(newdata, pch = "+", col = 2, cex = 5)

#4) Task: Breast-cancer data
bcdata <- read.csv('SVM\\breast-cancer-wisconsin.data',header=TRUE)
names(bcdata)
databcall <- subset(bcdata,select=c(-Samplecodenumber,-Class))
classesbcall <- subset(bcdata,select=Class)
databctrain <- databcall[1:400,]
classesbctrain <- classesbcall[1:400,]
databctest <- databcall[401:699,]
classesbctest <- classesbcall[401:699,]
model <- svm(databctrain, classesbctrain)
pred <- predict(model, databctest)
table(pred,t(classesbctest))
tune(svm, train.x=databctrain, train.y=classesbctrain, validation.x=databctest, validation.y=classesbctest, ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)), control = tune.control(sampling = "fix"))


# Try binary
iris.part = subset(iris, Species != 'setosa')
iris.part$Species = factor(iris.part$Species)
iris.part = iris.part[, c(1,2,5)]
head(iris.part)

# Fit svm model
fit = svm(Species ~ ., data=iris.part, type='C-classification', kernel='linear')
summary(fit)

# Make a plot of the model
dev.new(width=5, height=5)
plot(fit, iris.part)

# Tabulate actual labels vs. fitted labels
pred = predict(fit, iris.part)
table(Actual=iris.part$Species, Fitted=pred)

# Obtain feature weights
w = t(t(fit$coefs) %*% fit$SV)
coef = data.frame(dimnames(w)[[1]],w)

# Calculate decision values manually
iris.scaled = scale(iris.part[,-3], fit$x.scale[[1]], fit$x.scale[[2]])
t(w) %*% t(as.matrix(iris.scaled)) - fit$rho

tmp1 <- (iris.part[,1] - fit$x.scale[[1]][1])/fit$x.scale[[2]][1];
tmp2 <- (iris.part[,2] - fit$x.scale[[1]][2])/fit$x.scale[[2]][2];
tmp <- data.frame(tmp1, tmp2);
scores <- t(w) %*% t(as.matrix(tmp)) - fit$rho

# Should equal...
fit$decision.values

# tune parameters;
fit = svm(Species ~ ., data=iris.part, type='C-classification', kernel='linear')
obj <- tune(svm, Species~., data=iris.part, tunecontrol = tune.control(sampling = "fix"))
summary(obj)
plot(obj, iris.part)
obj = best.tune(svm, Species ~ ., data=iris.part, type='C-classification', kernel = 'linear')

# Journal of Stat software;
data(iris)
n = length(iris[,1])
msk = sample(1:n, n*.8)
iris_train = iris[msk,]
iris_test = iris[-msk,]

model <- svm(Species ~ ., data = iris_train, type = "C-classification", kernel = "radial", cost = 10, gamma = 0.1)
summary(model)
plot(model, iris_train, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))
pred <- predict(model, head(iris), decision.values = TRUE)
attr(pred, "decision.values")
prediction <- predict(model, iris_test)
tab <- table(pred = prediction, true = iris_test[,5]); tab

#tune framework;
tuned <- tune.svm(Species ~ ., data = iris_train, gamma = 10^(-6:-1), cost = 10^(-1:1))
summary(tuned)
model  <- svm(Species ~ ., data = iris_train, kernel="radial", gamma=0.01, cost=10)
summary(model)
plot(model, iris_train, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))
prediction <- predict(model, iris_test)
tab <- table(pred = prediction, true = iris_test[,5]); tab

#tune binary classifier;
iris.part = subset(iris, Species != 'setosa')
iris.part$Species = factor(iris.part$Species)
iris.part = iris.part[, c(1,2,5)]
head(iris.part)

n = length(iris.part[,1])
msk = sample(1:n, n*.8)
iris_train = iris.part[msk,]
iris_test = iris.part[-msk,]
tobj <- tune.svm(Species ~ ., data = iris_train, gamma = 10^(-6:-3), cost = 10^(-1:1))
summary(tobj)

plot(tobj, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")
bestGamma <- tobj$best.parameters[[1]]
bestC <- tobj$best.parameters[[2]]
model <- svm(Species ~ ., data = iris_train, cost = bestC, gamma = bestGamma, cross = 10)
summary(model)

pred <- predict(model, iris_test)
(acc <- table(pred, iris_test$Species))
classAgreement(acc)

Tuesday, November 6, 2012

Generate Multipage pdf

fnames <- c('filename')

#each .RData file contains two objects (icr, icr.new)
multipage <- function(fnames, result.table, outputfnames){
  pdf(outputfnames)
  for (i in 1:length(fnames)){
#    for (i in 1:10){
    data <- load(file = paste(fnames[i], ".RData", sep = ""))
    if (data[1] != "icr") { icr <- rate1;
                            icr.new <- rate9;}
 
    icr$rank <- icr$rank/1000000
    icr.new$rank <- icr.new$rank/1000000
    rate10$rank <- rate10$rank/1000000;
 
    points <- result.table[result.table$campaign == fnames[i],3:6]
    points$size <- points$size/1000000
    p <- ggplot(data=icr, aes(y=rank.1, x=rank)) +
      scale_x_continuous(limits=c(0,25)) +
      geom_line(aes(x=rank, y=rank.1),size = 1.5, color = "black") +
      geom_line(data=icr.new, aes(x=rank, y=rank.1),size = 1.5, color = "red") +
      geom_line(data=rate10, aes(x=rank, y=rank.1),size = 1.5, color = "darkgreen") +
      geom_point(data=points, aes(x=size, y=pred.lift), size = 3, color = "blue") +
      geom_point(data=points, aes(x=size, y=nb.lift), size = 3, color = "purple") +
      geom_point(data=points, aes(x=size, y=em.lift), size = 3, color = "green") +
      xlab("Audience Size (X 1,000,000)") + ylab("Est. Lift (%)") +
      opts(axis.text.x = theme_text(face = "bold", size = 12)) +
      opts(axis.text.y = theme_text(face = "bold", size = 12)) +
      opts(axis.title.x  = theme_text(face = "bold", size = 12)) +
      opts(axis.title.y  = theme_text(face = "bold", size = 12, angle = 90)) +
      opts(title = paste("Lift Plot of ", fnames[i],sep="")) +
      opts(plot.title = theme_text(face = "bold", size=14))
 
    print(p)
  }
  dev.off()
}

multipage (fnames, result.table, "results.pdf")

Thursday, September 13, 2012

Compute AUC and KS Statistics

An ROC curve is used to represent the relationship of the true-positive rate (TPR) to the false-positive rate (FPR) of a test as the cutoff score varies.

Changing the cutoff score changes the proportion of the two types of error. For a response model where a higher score indicates a higher likelihood to respond, choosing a high score as the cutoff means fewer false positives (people labeled as responders who do not respond) and more false negatives (people labeled as non-responders who would respond).

Sensitivity is the proportion of actual positives that receive a positive result on the test (the TPR). Sensitivity measures the likelihood that a diagnosis based on the test is correct. Recall measures the proportion of the articles on the correct topic that are returned by a Web search or other text query. Recall is the same as TPR.

Specificity is the proportion of actual negatives that receive a negative result on the test (the TNR).

Precision (TP/(TP+FP)) measures the percentage of the returned articles that are on the correct topic. 
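
As a quick, made-up illustration (not from the original post), these rates fall directly out of a 2x2 confusion matrix in R; the counts below are invented.

# Invented counts for a 2x2 confusion matrix
tp <- 40; fn <- 10    # actual positives
fp <- 5;  tn <- 45    # actual negatives
sensitivity <- tp / (tp + fn)   # TPR = recall
specificity <- tn / (tn + fp)   # TNR
precision   <- tp / (tp + fp)   # share of positive calls that are correct
fpr         <- fp / (fp + tn)   # 1 - specificity
c(sensitivity = sensitivity, specificity = specificity, precision = precision, fpr = fpr)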

The maximum benefit is proportional to the maximum distance between the cumulative distribution functions of the probabilities in each class. 


What this means is that the model score that cuts the prospect list at the penetration where the benefit is greatest is also the score that maximizes the Kolmogorov-Smirnov (KS) statistic. The KS test was developed as a test of whether two distributions are different. Splitting the list at the point of maximum benefit results in a 'good list' and a 'bad list' whose distributions of responders are maximally separate from each other and from the population. In this case, the 'good list' has the maximum proportion of responders and the 'bad list' has the minimum proportion.
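
A minimal sketch of that claim (made-up scores, not the data used later in this post): the KS statistic is just the largest vertical gap between the score CDFs of responders and non-responders.

set.seed(1)
score_resp    <- rbeta(500, 4, 2)   # model scores of responders
score_nonresp <- rbeta(500, 2, 4)   # model scores of non-responders
grid <- sort(c(score_resp, score_nonresp))
ks <- max(abs(ecdf(score_resp)(grid) - ecdf(score_nonresp)(grid)))
ks   # maximum separation between the two cumulative distributions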

#########################################################
## R Calculation
#########################################################
install.packages('verification')
library('verification')
library('zoo')

# Data used from Mason and Graham (2002).
a<- c(1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
      1991, 1992, 1993, 1994, 1995)
b<- c(0,0,0,1,1,1,0,1,1,0,0,0,0,1,1)
c<- c(.8, .8, 0, 1,1,.6, .4, .8, 0, 0, .2, 0, 0, 1,1)
d<- c(.928,.576, .008, .944, .832, .816, .136, .584, .032, .016, .28, .024, 0, .984, .952)

A<- data.frame(a,b,c,d)
names(A)<- c("year", "event", "p1", "p2")

## for model with ties
roc.area(A$event, A$p1)
##0.8392857

## for model without ties
roc.area(A$event, A$p2)
##0.875

ROC<- function(dat, score_col,class_col){
  as <- dat[,score_col];
  las <- length(as);
  aslab <- dat[, class_col];
  numpos <- sum(aslab == 1);
  numneg <- sum(aslab == 0);
  tmp <- sort.int(as, index.return = T);
  sas <- tmp$x;
  saslab <- aslab[tmp$ix];
 
  selpn <- saslab %in% c(1, 0);

  saspn <- sas[selpn];
  lsaspn <- length(saspn);
  saslabpn <- saslab[selpn];
  tmp <- sort.int(saspn, decreasing = T, index.return = T);
  saspnrev <- tmp$x;
  saslabpnnrev <- saslabpn[tmp$ix];
  cumsaslabpnnrev <- cumsum(saslabpnnrev);
 
  end <- length(saspnrev);
  difsaspnrev <- saspnrev[2:end] - saspnrev[1:(end-1)];
  difsaspnrev <- c(0, difsaspnrev);
  steps <- (difsaspnrev != 0) ;
  tmp <- 1:length(steps);
  steplocs <- c(1, tmp[steps], lsaspn+1);

  end <- length(steplocs);
  rl <- steplocs[2:end] - steplocs[1:(end-1)];
  csr <- cumsaslabpnnrev[steplocs[2:end]-1];
 
  end <- length(csr);
  npr <- c(csr[1], (csr[2:end] - csr[1:(end-1)]));
  nnr <- rl - npr
  npr <- c(0, npr);
  nnr <- c(0, nnr);
 
  fpr <- cumsum(nnr)/numneg;
  recall <- cumsum(npr)/numpos;
  roc <- data.frame(fpr, recall);
 
  a2 = sum(nnr * recall) / numneg;
  adjust = sum(nnr * npr) / (2*numpos*numneg);
  a3 = a2 - adjust;
  a1 = a3 - adjust;
  area =data.frame(a3, a1, a2);
 
  result <- list(area, roc);
  return(result);
}
dat <- A
score_col <- 3
class_col <-2
result <- ROC(A, 3,2);
result[[1]][1]
##0.8392857

result <- ROC(A, 4,2);
result[[1]][1]
##0.875

AUC_yahoo <- function(dat, score_col,class_col){
  as <- dat[,score_col];
  las <- length(as);
  aslab <- dat[, class_col];
  numpos <- sum(aslab == 1);
  numneg <- sum(aslab == 0);
  possc <- as[aslab == 1];
  negsc <- as[aslab == 0];
  pgtn <- 0;
  peqn <- 0;
  for (inn in  1:numneg){
    for (ip in 1:numpos) {
      pgtn = pgtn + (possc[ip] > negsc[inn]);
      peqn = peqn + (possc[ip] == negsc[inn]);
    }
  }
  num = numpos * numneg;
  area1 = pgtn/num;
  area2 = (pgtn + peqn)/num;
  area3 = (pgtn + peqn/2)/num;
  area = data.frame(area3, area1, area2);
return(area);
}
result <- AUC_yahoo(A, 3,2);
result[1]
##0.8392857

result <- AUC_yahoo(A, 4,2);
result[1]
##0.875

n<- length(A[,1])
conv <- length(A[A$event==1,1])
non_conv <- n-conv

#estimating ROC by p1;
A1 <- A[sort.list(A$p1, decreasing = T),]
A1$tp <- cumsum(A1$event)
A1$numberc <- 1:n
A1$fp <- A1$numberc - A1$tp
A1$percy <- A1$tp / conv
A1$percx <- A1$fp /non_conv
roc <- sum(diff(A1$percx[A1$numberc])*rollmean(A1$percy[A1$numberc],2)); roc
#0.8392857


#estimating ROC by p2;
A2 <- A[sort.list(A$p2, decreasing = T),]
A2$tp <- cumsum(A2$event)
A2$numberc <- 1:n
A2$fp <- A2$numberc - A2$tp
A2$percy <- A2$tp / conv
A2$percx <- A2$fp /non_conv
roc <- sum(diff(A2$percx[A2$numberc])*rollmean(A2$percy[A2$numberc],2)); roc
#0.875
icr <- rate1;
icr.new <- rate9;

icr$percx <- (icr$rank.1-icr$numberc)/ (max(icr$rank.1)-max(icr$numberc))
icr$percy <- icr$numberc/max(icr$numberc);

icr.new$percx <- (icr.new$rank.1-icr.new$numberc)/ (max(icr.new$rank.1)-max(icr.new$numberc))
icr.new$percy <- icr.new$numberc/max(icr.new$numberc);

ggplot(data=icr, aes(y=percy, x=percx)) +
  scale_x_continuous(limits=c(0,1)) +
  geom_line(aes(x=percx, y=percy),size = 1.5, color = "black") +
  geom_line(data=icr.new, aes(x=percx, y=percy),size = 1.5, color = "red") +
  geom_line(aes(x=percx, y=percx),size = 1.5, color = "black", linetype="dashed") +
  ylab("True Positive Rate") + xlab("False Positive Rate") +
  opts(axis.text.x = theme_text(face = "bold", size = 12)) +
  opts(axis.text.y = theme_text(face = "bold", size = 12)) +
  opts(axis.title.x  = theme_text(face = "bold", size = 12)) +
  opts(axis.title.y  = theme_text(face = "bold", size = 12, angle = 90)) +
  opts(title = paste("ROC Chart")) +
  opts(plot.title = theme_text(face = "bold", size=14))

# 0.7548072
roc <- sum(diff(icr$percx[icr$numberc])*rollmean(icr$percy[icr$numberc],2)); roc
# 0.7455777
roc.new <- sum(diff(icr.new$percx[icr.new$numberc])*rollmean(icr.new$percy[icr.new$numberc],2)); roc.new

## use ROCR
require(ROCR)
AUC <- function(pred,depvar){
  require("ROCR")
  p   <- ROCR::prediction(as.numeric(pred),depvar)
  auc <- ROCR::performance(p,"auc")
  auc <- unlist(slot(auc, "y.values"))
 
  perf <- ROCR::performance(p,"tpr","fpr");
  plot(perf);
  abline(0, 1, lty = 2);
 
  return(auc)
}

KS <- function(pred,depvar){
  require("ROCR")
  p   <- ROCR::prediction(as.numeric(pred),depvar)
  perf <- ROCR::performance(p, "tpr", "fpr")
  ks <- max(attr(perf, "y.values")[[1]] - (attr(perf, "x.values")[[1]]))
  return(ks)
}

test$score=factor(tmp, levels=c(FALSE, TRUE))
#calculate auc;
AUC(test$score, test$y)
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
KS(test$score, test$y)


#########################################################
## Python Calculation
#########################################################
#!/usr/bin/python3
import os
                           
aslab=[-1,-1,-1,1,1,1,-1,1,1,-1,-1,-1,-1,1,1]
ascore=[.928,.576, .008, .944, .832, .816, .136, .584, .032, .016, .28, .024, 0, .984, .952]

las =len(aslab)
numpos = 0
numneg = 0
possc=[]
negsc=[]
for i in range(0,len(aslab)):
    if aslab[i]==1:
        possc.append(ascore[i])
        numpos+=1
    elif aslab[i]==-1:
        negsc.append(ascore[i])
        numneg+=1
         
pgtn = 0
peqn = 0
for inn in range(0,numneg):
    for ip in range(0,numpos):
        pgtn = pgtn + (possc[ip] > negsc[inn])
        peqn = peqn + (possc[ip] == negsc[inn])
num = numpos * numneg
#area1 = pgtn/num
#area2 = (pgtn + peqn)/num
AUC = (pgtn + peqn/2)/num

Saturday, July 7, 2012

Simpson's Paradox

Simpson's Paradox occurs when the estimate of an aggregate effect is misleading and markedly different from the effect seen in the underlying categories. A famous example occurred in graduate admissions at the University of California at Berkeley, where an apparent bias in admissions was due instead to the fact that different departments had different overall admission rates and numbers of applicants.
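
A minimal made-up illustration in R (the numbers are invented, not the actual Berkeley figures): each department admits women at a slightly higher rate, yet the aggregate rate favors men because women apply mostly to the department with the lower admission rate.

adm <- data.frame(
  dept     = c("A", "A", "B", "B"),
  gender   = c("M", "F", "M", "F"),
  applied  = c(800, 100, 100, 800),
  admitted = c(500,  65,  20, 170)
)
# Within each department the admission rate is higher for women...
with(adm, tapply(admitted, list(dept, gender), sum) / tapply(applied, list(dept, gender), sum))
# ...but the aggregate rate is higher for men.
with(adm, tapply(admitted, gender, sum) / tapply(applied, gender, sum))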

Discrete Distribution and Continuous Distribution

Moments of Truth

https://www.paypal-engineering.com/2016/04/11/statistics-for-software/



Discrete Distribution

Bernoulli distribution / Binomial distribution

Multinoulli distribution (categorical distribution) / Multinomial distribution

Poisson distribution

Uniform distribution

Continuous Distribution

Normal distribution
1 it has two parameters which are easy to interpret, and which capture some of the most basic properties of a distribution, namely its mean and variance.
2 the central limit theorem tells us that sums of independent random variables have an approximately normal distribution, making it a good choice for modeling residual errors or noise.
3 the normal distribution makes the least number of assumptions (it has maximum entropy), subject to the constraint of having a specified mean and variance. This makes it a good default choice in many cases.
4 it has a simple mathematical form, which results in easy-to-implement, but often highly effective, methods.

Student t distribution
Student t is more robust because it has heavier tails, at least for small v.
If v=1, this distribution is known as the Cauchy or Lorentz distribution. This is notable for having such heavy tails that the integral that defines the mean does not converge.
To ensure finite variance, require v>2. 
It is common to use v=4, which gives good performance in a range of problems. 
The smaller v is, the fatter the tails become. For v>>5, the Student distribution rapidly approaches a normal and loses its robustness properties.
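
A quick check of the heavier tails with base R (a sketch): the two-sided tail probability P(|X| > 4) is orders of magnitude larger for the t with small v than for the standard normal, and shrinks toward the normal value as v grows.

df <- c(1, 2, 4, 10, 30)
tail_t <- 2 * pt(-4, df = df)    # two-sided tail probability of Student t with v = df
tail_z <- 2 * pnorm(-4)          # same quantity for N(0, 1)
data.frame(df = df, tail_t = signif(tail_t, 3), tail_normal = signif(tail_z, 3))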

Laplace distribution
It is robust to outliers, and also puts more probability density at 0 than the normal. It is a useful way to encourage sparsity in a model.
Note that the Student distribution is not log-concave for any parameter value, unlike the Laplace distribution, which is always log-concave (and log-convex...). Nevertheless, both are unimodal.

The Laplace distribution is similar to the Gaussian distribution, but its first derivative is undefined at zero because it has a very sharp peak at zero (see the following figure). The Laplace distribution concentrates its probability mass much closer to zero compared to the normal distribution; the resulting effect when we use it as a prior is that it has the tendency to make some parameters zero. Thus it is said that the lasso (or its Bayesian analog) can be used to regularize and also to perform variable selection (removing some terms or variables from a model).
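
Base R has no Laplace density, so a hand-rolled one (a sketch) makes the sharp peak at zero visible next to a normal with the same variance:

dlaplace <- function(x, mu = 0, b = 1) exp(-abs(x - mu) / b) / (2 * b)  # variance = 2*b^2
x <- seq(-4, 4, by = 0.01)
plot(x, dlaplace(x, b = 1 / sqrt(2)), type = "l", ylab = "density")  # sharp peak at 0, heavier tails
lines(x, dnorm(x), lty = 2)                                          # smooth at 0
legend("topright", c("Laplace", "Normal"), lty = 1:2)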

Gamma distribution
1 a is the shape parameter, b is the rate parameter.
2 if a<=1, the mode is at 0; otherwise the mode is > 0. As the rate b increases, the horizontal scale shrinks, squeezing everything leftwards and upwards.
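
A small dgamma() sketch of both points:

x <- seq(0.01, 10, by = 0.01)
plot(x, dgamma(x, shape = 0.9, rate = 1), type = "l", ylab = "density")  # a <= 1: mode at 0
lines(x, dgamma(x, shape = 3, rate = 1), lty = 2)                        # a > 1: mode at (a-1)/b = 2
lines(x, dgamma(x, shape = 3, rate = 3), lty = 3)                        # larger rate b: squeezed leftwards/upwards
legend("topright", c("a=0.9, b=1", "a=3, b=1", "a=3, b=3"), lty = 1:3)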

Beta distribution
1 if a=b=1, the beta becomes the uniform distribution.
2 if a and b are both less than 1, we get a bimodal distribution with spikes at 0 and 1.
3 if a and b are both greater than 1, the distribution is unimodal.
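
The same three cases with dbeta() (a sketch):

x <- seq(0.001, 0.999, by = 0.001)
plot(x, dbeta(x, 1, 1), type = "l", ylim = c(0, 3), ylab = "density")  # a=b=1: uniform
lines(x, dbeta(x, 0.5, 0.5), lty = 2)                                  # a, b < 1: spikes at 0 and 1
lines(x, dbeta(x, 3, 3), lty = 3)                                      # a, b > 1: unimodal
legend("top", c("a=b=1", "a=b=0.5", "a=b=3"), lty = 1:3)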

Pareto distribution is used to model the distribution of quantities that exhibit long tails, i.e., heavy tails.
1 if we plot the distribution on a log-log scale, it forms a straight line of the form log p(x) = a log x + c. This is known as a power law.
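
A sketch of the log-log linearity (base R has no Pareto functions, so the density is written out):

dpareto <- function(x, k = 2, m = 1) ifelse(x >= m, k * m^k / x^(k + 1), 0)  # p(x) for x >= m
x <- seq(1, 100, length.out = 200)
plot(x, dpareto(x), log = "xy", type = "l",
     xlab = "x (log scale)", ylab = "p(x) (log scale)")  # straight line with slope -(k+1)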

Joint Probability Distribution

Uncorrelated does not imply independent.
Note that the correlation reflects the noisiness and direction of a linear relationship, but not the slope of that relationship, nor many aspects of nonlinear relationships.
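
The standard counterexample in R (a sketch): Y = X^2 is completely determined by X, yet for X symmetric about zero the linear correlation is essentially zero.

set.seed(2)
x <- rnorm(10000)
y <- x^2            # perfectly dependent on x
cor(x, y)           # near 0: no linear relationship to detect
cor(abs(x), y)      # strongly positive once the nonlinearity is removed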

Multivariate Gaussian distribution

Multivariate Student's t distribution

Dirichlet distribution
natural generalization of the beta distribution

Transformation of random variables

Linear transformation

General transformation - Jacobian matrix J

Central limit theorem

Monte Carlo approximation

Generate S samples from the distribution, e.g., by MCMC (Markov chain Monte Carlo), and use Monte Carlo to approximate the expected value of any function of a random variable: simply draw samples, and then compute the mean of the function applied to the samples.
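
A minimal sketch: approximate E[X^2] for X ~ N(0,1) by drawing samples and averaging; the true value is 1.

set.seed(3)
S  <- 100000
xs <- rnorm(S)   # S samples from the distribution of X
mean(xs^2)       # Monte Carlo estimate of E[X^2]; true value is Var(X) + E(X)^2 = 1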

Information Theory

Entropy

The entropy of a random variable is a measure of its uncertainty.

H(X) = - sum_{k=1}^K p(X=k) log_2 p(X=k)

The cross entropy is the average number of bits (binary digits, log base 2) needed to encode data coming from a source with distribution p when model q is used to encode the distribution.
The maximum-entropy distribution over a finite set of outcomes is the uniform distribution.
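
A small numeric check (a sketch): the entropy of a biased coin versus a fair coin, in bits; the uniform (fair) case attains the maximum of 1 bit.

entropy <- function(p) -sum(p[p > 0] * log2(p[p > 0]))  # H(X) in bits
entropy(c(0.5, 0.5))   # 1 bit: the maximum for two outcomes
entropy(c(0.9, 0.1))   # about 0.47 bits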

KL divergence

One way to measure the dissimilarity of two probability distributions, p and q, is known as the Kullback-Leibler divergence (KL divergence) or relative entropy.

KL(P(X) || Q(X)) = sum_x p(X=x) log_2(p(X=x)/q(X=x))
= sum_{k=1}^K p(X=k) log_2 p(X=k) - sum_{k=1}^K p(X=k) log_2 q(X=k)
= H(p,q) - H(p)

H(p,q) is called the cross entropy, which is the average number of bits needed to encode data coming from a source with distribution p when we use model q to define the code. Hence the regular entropy is the expected number of bits if we use the true model, and the KL divergence is the difference between the two. In other words, the KL divergence is the average number of extra bits needed to encode the data, due to the fact that we use distribution q to encode the data instead of the true distribution p.
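
Continuing the numeric sketch: with two small discrete distributions p and q, the identity KL(p||q) = H(p,q) - H(p) can be checked directly.

p <- c(0.5, 0.3, 0.2)
q <- c(0.4, 0.4, 0.2)
H  <- function(p)    -sum(p * log2(p))      # entropy H(p)
CE <- function(p, q) -sum(p * log2(q))      # cross entropy H(p, q)
KL <- function(p, q)  sum(p * log2(p / q))  # KL divergence
KL(p, q)
CE(p, q) - H(p)   # same value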

Mutual Information

Mutual information of X and Y as the reduction in uncertainty about X after observing Y, or by symmetry, the reduction in uncertainty about Y after observing X.

I(X, Y) = KL(P(X,Y) || P(X)P(Y)) = sum_x sum_y p(x,y) log(p(x,y)/(p(x)p(y)))
=H(X) + H(Y) - H(X, Y)
= H(X, Y) - H(X|Y) - H(Y|X)

H(X) and H(Y) are the marginal entropies, H(X|Y) and H(Y|X) are the conditional entropies, and H(X, Y) is the joint entropy of X and Y.
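
And mutual information from a small made-up joint table (a sketch), checked against H(X) + H(Y) - H(X, Y):

pxy <- matrix(c(0.3, 0.1,
                0.1, 0.5), nrow = 2, byrow = TRUE)  # joint distribution p(x, y)
px <- rowSums(pxy); py <- colSums(pxy)              # marginals
H  <- function(p) -sum(p * log2(p))
sum(pxy * log2(pxy / outer(px, py)))                # I(X, Y) from the definition
H(px) + H(py) - H(pxy)                              # same value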

Thursday, July 5, 2012

Collinearity and VIF


Collinearity occurs when two or more variables are highly associated. Including them in a linear model can result in confusing, nonsensical, or misleading results, because the model cannot differentiate the contribution from each of them.

Because visits and transactions are so highly related, and also because a linear model assumes that effects are additive, an effect attributed to one variable (such as transactions) is not available in the model to be attributed jointly to another that is highly correlated (visits). This will cause the standard errors of the predictors to increase, which means that the coefficient estimates will be highly uncertain or unstable. As a practical consequence, this may cause coefficient estimates to differ dramatically from sample to sample due to minor variations in the data, even when the underlying relationships are the same.

The degree of collinearity in data can be assessed as the variance inflation factor (VIF). This estimates how much the standard error (variance) of a coefficient in the linear model is increased because of shared variance with other variables, compared to the situation where the variables are uncorrelated or a simple single-predictor regression is performed.

The VIF provides a measure of shared variance among variables in a model. A common rule of thumb is that VIF > 5.0 indicates the need to mitigate collinearity.
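
A sketch of the calculation behind the VIF: regress each predictor on the others and take 1/(1 - R^2); car::vif() reports the same quantity per coefficient if that package is installed. The data here are simulated.

set.seed(4)
n <- 500
visits <- rnorm(n)
transactions <- visits + rnorm(n, sd = 0.3)            # highly correlated with visits
spend <- 2 * visits + 3 * transactions + rnorm(n)
r2 <- summary(lm(transactions ~ visits))$r.squared     # shared variance with the other predictor
1 / (1 - r2)                                           # VIF for transactions; well above the 5.0 rule of thumb
# library(car); vif(lm(spend ~ visits + transactions)) # same idea, per coefficient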

There are three general strategies for mitigating collinearity:
- Omit variables that are highly correlated.
- Eliminate correlation by extracting principal components or factors for sets of highly correlated predictors.
- Use a method that is robust to collinearity, i.e., something other than traditional linear modeling, e.g., random forests, which only use a subset of variables at a time. Or, use PCA to extract the first component from the variables.
In all, common approaches to fixing collinearity include omitting highly correlated variables, and using principal components or factor scores instead of individual items.







SEM to estimate Structural Equation Modeling in R

Structural models are helpful when your modeling needs meet any of the conditions:
- To evaluate interconnection of multiple data points that do not map neatly to the division between predictors and an outcome variable
- To include unobserved latent variables such as attitudes and estimate their relationships to one another or to observed data
- To estimate the overall fit between observed data and a proposed model with latent variables or complex connections.

Structural models are closely related to both linear modeling because they estimate associations and model fit, and to factor analysis because they use latent variables.

With regard to latent variables, the models can be used to estimate the association between outcomes such as purchase behavior and underlying attitudes that influence those, such as brand perception, brand preference, likelihood to purchase, and satisfaction.

Create a graphical path diagram of influences and then estimate the strength of relationship for each path in the model. Such paths often concern two kinds of variables: manifest variables that are observed, i.e., that have data points, and latent variables that are conceived to underlie the observed data.

With SEM, it is feasible to do several things that improve our models: to include multiple influences, to posit unobserved concepts that underlie the observed indicators (i.e., constructs such as brand preference, likelihood to purchase, and satisfaction), to specify how those concepts influence one another, to assess the model's overall congruence to the data, and to determine whether the model fits the data better than alternative models.

SEM creates a graphical path diagram of influences and then estimates the strength of relationship for each path in the model. Such paths often concern two kinds of variables: manifest variables that are observed, i.e., that have data points, and latent variables that are conceived to underlie the observed data. The set of relationships among the latent variables is called the structural model, while the linkage between those elements and the observed, manifest variables is the measurement model.

Structural equation models are similar to linear regression models, but differ in three regards.
First, they assess the relationships among many variables, with models that may be more complex than simply predictors and outcomes.
Second, those relationships allow for latent variables that represent underlying constructs that are thought to be manifested imperfectly in the observed data.
Third, the models allow relationships to have multiple 'downstream' effects.

Two general approaches to SEM are the covariance-based approach (CB-SEM), which attempts to model the relationships among the variables at once and thus is a strong test of the model, and the partial least squares approach (PLS-SEM), which fits parts of the data sequentially and has less stringent requirements. 

After specifying a CB-SEM model, simulate a data set using simulateData() from lavaan with reasonable guesses as to variable loadings. Use the simulated data to determine whether your model is likely to converge for the sample size you expect.
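
A sketch of that workflow with lavaan (the three-indicator model and its loadings are hypothetical, not a model from this post): simulate data at the planned sample size, then check that the intended model converges and recovers the structure.

library(lavaan)
popModel <- "Brand =~ 0.8*perc1 + 0.7*perc2 + 0.6*perc3"   # guessed population loadings
simDat   <- simulateData(popModel, sample.nobs = 200)      # planned sample size
fitModel <- "Brand =~ perc1 + perc2 + perc3"               # the model we intend to estimate
fit <- cfa(fitModel, data = simDat)
summary(fit, fit.measures = TRUE)   # did it converge? are the loadings recovered?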

Plot your specified model graphically and inspect it carefully to check that it is the model you intended to estimate. 

Whenever possible, specify one or two alternative models and check those in addition to your model. Before accepting a CB-SEM model, use compareFit() to demonstrate that your model fits the data better than alternatives.


If you have data of varying quality, nominal categories, small sample, or problems converging a CB-SEM model, consider partial least squares SEM (PLS-SEM). 


##################################################################
## SEM
##################################################################
## install.packages("sem")
require(sem)

R.DHP <- readMoments(diag=FALSE, names=c("ROccAsp", "REdAsp", "FOccAsp",
                                         "FEdAsp", "RParAsp", "RIQ", "RSES", "FSES", "FIQ", "FParAsp"),
                     text="
                     .6247
                     .3269 .3669
                     .4216 .3275 .6404
                     .2137 .2742 .1124 .0839
                     .4105 .4043 .2903 .2598 .1839
                     .3240 .4047 .3054 .2786 .0489 .2220
                     .2930 .2407 .4105 .3607 .0186 .1861 .2707
                     .2995 .2863 .5191 .5007 .0782 .3355 .2302 .2950
                     .0760 .0702 .2784 .1988 .1147 .1021 .0931 -.0438 .2087
                     ")
model.dhp.1 <- specifyEquations(covs="RGenAsp, FGenAsp", text="
                                RGenAsp = gam11*RParAsp + gam12*RIQ + gam13*RSES + gam14*FSES + beta12*FGenAsp
                                FGenAsp = gam23*RSES + gam24*FSES + gam25*FIQ + gam26*FParAsp + beta21*RGenAsp
                                ROccAsp = 1*RGenAsp
                                REdAsp = lam21(1)*RGenAsp # to illustrate setting start values
                                FOccAsp = 1*FGenAsp
                                FEdAsp = lam42(1)*FGenAsp
                                ")
sem.dhp.1 <- sem(model.dhp.1, R.DHP, 329,
                 fixed.x=c('RParAsp', 'RIQ', 'RSES', 'FSES', 'FIQ', 'FParAsp'))
summary(sem.dhp.1)


Wednesday, February 8, 2012

LME to estimate Mixed Effect Models in R

In common marketing discussion, a hierarchical model estimates both group-level effects and individual differences in effects. Such models are popular in marketing because they provide insight into differences among customers (heterogeneity) and the distribution of preference. HLMs are exemplified when we estimate the importance of effects for individuals as well as for an overall population.

Effects that are associated with all observations are known as fixed effects, and those that differ across various grouping levels are known as random effects. 

These models are also known as mixed effect models, because the total effect for each person is composed of the effect for the overall population ( the fixed effect) plus the per-individual (random) effect. 

The difference between estimating hierarchical effects, as opposed to including the grouping variable as a factor in a standard linear model, is that a hierarchical model estimates every specified effect for each individual or group, not only a single adjustment term. 

The formula for a mixed effect model includes a grouping term + (... |group). Common models have a different intercept by group using (1|group) or different intercepts and slopes for predictors within each group using (predictor|group). To estimate an individual level model, the grouping term is typically the respondent identifier. 
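
A sketch with lme4, whose formula syntax matches the (1|group) notation above, using its built-in sleepstudy data as a stand-in for respondent-level data:

library(lme4)
# Random intercept only, by subject
m1 <- lmer(Reaction ~ Days + (1 | Subject), data = sleepstudy)
# Random intercept and slope for the predictor within each subject
m2 <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
fixef(m2)                 # overall (fixed) effects
head(ranef(m2)$Subject)   # per-subject (random) deviations from the fixed effects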


Hierarchical models can be used to group observations at levels other than the individual level. For example, we might wish to group by store, advertising campaign, salesperson, or some other factor, if we want to estimate effects that are specific to such a grouping.


Hierarchical models in marketing are often estimated with Bayesian methods that are able to pool information and produce best estimates of both group and individual effects using potentially sparse data. 

Model coefficients from a hierarchical model are inspected using summaries of the many estimates that are collected in an mcmc object.

library(nlme)
model1<-lme(mathach ~ 1, random = ~ 1 | id, data=hsb)
summary(model1)

Saturday, February 4, 2012

SQL to Compute User Age


drop table tmptbl;
create table tmptbl as
select aa.userhashkey, min(aa.datekey) as firstdt
from factevent aa
where aa.userhashkey % 100 = 0 and aa.eventtypeid = 1 and aa.uac = 1 and aa.datekey <= 20130728
group by 1
having min(aa.datekey)>=20130522 and min(aa.datekey)<=20130628;

drop table tmp2;
create table tmp2 as
select  tbl1.firstdt,
        TO_DATE(cast(tbl1.datekey as VARCHAR), 'YYYYMMDD')-  TO_DATE(cast(tbl1.firstdt as VARCHAR), 'YYYYMMDD') as age,
        count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1,2
;

drop table tmp3;
create table tmp3 as
select tbl1.firstdt,
count(distinct userhashkey) as uusers,
count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1;

-- R for triangle output;
sql <- paste("select a.*, b.uusers, b.imps as totimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             order by a.firstdt, a.age;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
data$age <- as.integer(data$age)
head(data)
x <- dcast(data, firstdt+uusers+totimp ~ age, value.var = "imps", fill = 0);
head(x)
write.csv(x, "totimp.csv")

#break down by weeks;
sql <- paste("select a.firstdt, b.uusers,
             (case when a.age = 0 then 1
                  when a.age >=1 and a.age <=7 then 2
                  when a.age >=8 and a.age <=30 then 3
                  when a.age >=31 then 4 end) as agegrp,
             sum(a.imps) as grpimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             and a.firstdt < 20130729
             group by 1,2,3
             order by 1,2,3;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
head(data)
x <- dcast(data, firstdt+uusers ~ agegrp, value.var = "grpimp", fill = 0);
head(x)
write.csv(x, "totimp2.csv")

-- 30-30-30 rules
create table userfiltering as
select userhashkey as userid, sectionid, datekey
from database1 join database2 using(sectionkey)
where userhashkey % 1000 = 0 and userhashkey <> 0;

create table userfiltering2 as
select *,
row_number() over(partition by userid order by userid, datekey, sectionid) user_cumcnt,
row_number() over(partition by userid, sectionid order by userid, datekey, sectionid) user_fact_cumcnt,
row_number() over(partition by userid, datekey order by userid, datekey, sectionid) user_dt_cumcnt,
row_number() over(partition by userid, datekey, sectionid order by userid, datekey, sectionid) user_fact_dt_cumcnt
from userfiltering
order by userid, datekey, sectionid;
select min(datekey), max(datekey)
from userfiltering2;

--cutoff1=20111128, cutoff2=20111229, gaps=31;
create table userfiltering3 as
select userid,
min(datekey) as first_dt,
max(datekey) as last_dt,
to_date(max(datekey),'YYYYMMDD') - to_date(min(datekey),'YYYYMMDD')as gaps,
(case when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=0  then 1
 when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>0 and (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=31 then 2
 when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>31  then 3 end) as index1,
(case when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=0  then 1
when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>0 and (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=31 then 2
when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>31  then 3 end) as index2
from userfiltering2
group by userid
order by userid;

--before 30-30-30 rules;
select count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3;

--apply 30-30-30 rules;
select count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3
where index1=2 and index2=2;

--break down by first-dt cohort;
select
first_dt,
count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3
where index1=2 and index2=2
group by 1
order by 1;

Monday, January 16, 2012

SQL to Self Join

-- Mail Solicitation;
-- create features;
drop table ANALYTICS_STG..CAMPAIGN_ADDRESS_3;
-- 2,038,831,646 rows affected
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_3 as
select a.*,
row_number() over(partition by a.EMAIL_ADDRESS_ID order by a.EMAIL_ADDRESS_ID, a.SEND_TS) seq_freq,
extract(DAY from a.SEND_TS - LAG(a.SEND_TS, 1) OVER (partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEND_TS)) as recency,
b.reach_freq as tot_freq
from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 a
join (select EMAIL_ADDRESS_ID, count(*) as reach_freq from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 group by 1) b
on a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID;


-- Mail Response;
-- Dedupe response;
create table ANALYTICS_STG..RESPONSE_ADDRESS_2 as
select a.*,
case when upper(a.FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end FEEDBACK_EVENT_CD_1,
case when upper(a.FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end FEEDBACK_EVENT_CD_2,
case when upper(a.FEEDBACK_EVENT_CD)='UNDEL' then 1 else 0 end FEEDBACK_EVENT_CD_3,
case when upper(a.FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end FEEDBACK_EVENT_CD_4,
case when upper(a.FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end FEEDBACK_EVENT_CD_5,
case when upper(a.FEEDBACK_EVENT_CD)='SPAM' then 1 else 0 end FEEDBACK_EVENT_CD_6
from (
select *, row_number()
over (partition by CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS
order by  CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS, FEEDBACK_EVENT_TS
) row_id
from ANALYTICS_STG..RESPONSE_ADDRESS_1
) a
where a.row_id = 1;

-- create response features
create table ANALYTICS_STG..RESPONSE_ADDRESS_3 as
select a.*,
row_number() over (partition by a.ELECTRONIC_ADDRESS_ID order by a.ELECTRONIC_ADDRESS_ID, a.MAIL_TS, a.FEEDBACK_EVENT_TS) seq_engagement,
b.tot_engagement,
b.tot_open,
b.tot_click,
b.tot_unsub,
extract(DAY from (a.FEEDBACK_EVENT_TS - a.MAIL_TS)) as feedback_day
from ANALYTICS_STG..RESPONSE_ADDRESS_2 a
inner join (select ELECTRONIC_ADDRESS_ID,
count(*) as tot_engagement,
sum(FEEDBACK_EVENT_CD_1) as tot_open,
sum(FEEDBACK_EVENT_CD_2) as tot_click,
sum(FEEDBACK_EVENT_CD_5) as tot_unsub
from ANALYTICS_STG..RESPONSE_ADDRESS_2
where FEEDBACK_EVENT_CD_1 = 1 or FEEDBACK_EVENT_CD_2 = 1 or FEEDBACK_EVENT_CD_5 = 1
group by 1
) b
on a.ELECTRONIC_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
where a.FEEDBACK_EVENT_CD_1 = 1 or a.FEEDBACK_EVENT_CD_2 = 1 or a.FEEDBACK_EVENT_CD_5 = 1;

-- Join solicitations and responses
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 as
select a.*,
b.FEEDBACK_EVENT_TS, 
b.FEEDBACK_EVENT_CD, 
b.FEEDBACK_EVENT_CD_1,
b.FEEDBACK_EVENT_CD_2,
b.FEEDBACK_EVENT_CD_5,
b.SEQ_ENGAGEMENT,
b.TOT_CLICK,
b.TOT_OPEN,
b.TOT_ENGAGEMENT,
b.TOT_UNSUB,
b.FEEDBACK_DAY
from  ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
inner join ANALYTICS_STG..RESPONSE_ADDRESS_3 b
on a.CAMPAIGN_ID = b.CAMPAIGN_ID
and a.CONCEPT_ID = b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
and a.SEND_TS = b.MAIL_TS
;

-- Last attributions;
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_2 as
select a.*,
b.FEEDBACK_EVENT_TS, 
b.FEEDBACK_EVENT_CD, 
b.FEEDBACK_EVENT_CD_1, 
b.FEEDBACK_EVENT_CD_2, 
b.FEEDBACK_EVENT_CD_5, 
b.TOT_ENGAGEMENT, 
b.SEQ_ENGAGEMENT, 
b.TOT_CLICK, 
b.TOT_OPEN, 
b.TOT_UNSUB,
b.FEEDBACK_DAY,
SUM(case when b.FEEDBACK_EVENT_CD_1=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_OPEN_PERC,
SUM(case when b.FEEDBACK_EVENT_CD_2=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_CLICK_PERC,
SUM(case when b.FEEDBACK_EVENT_CD_5=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_UNSUB_PERC
from ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
left join ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 b
on a.CAMPAIGN_ID=b.CAMPAIGN_ID
and a.CONCEPT_ID=b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID
and a.SEQ_FREQ = b.SEQ_FREQ

;

SQL to Batch Load

dsn <- "*********"
sql.con <- odbcConnect(dsn)
sql.db <- "*********"
advertiserid <- "4009"

rtg.facts <- unlist(sqlQuery(sql.con, paste("select distinct t.FactId
      from ", sql.db, ".dbo.Segment s with (nolock)
                                            inner join ", sql.db, ".dbo.SegmentTaxonomy st with (nolock)
                                            on s.SegmentId = st.SegmentId
                                            inner join ", sql.db, ".dbo.Taxonomy t with (nolock)
                                            on st.TaxonomyId = t.TaxonomyId
                                            where s.AdvertiserId = ", advertiserid, sep = "")))
str <- paste(rtg.facts, collapse= ",")
pfacts <- sqlQuery(sql.con, paste("exec ", sql.db, ".dbo**** ", advertiserid, sep = ""))
odbcClose(sql.con)

#save(str, file="str.rda",ascii=FALSE)
load(file = "str.rda")
load(file = "pfacts.rda")

pfacts <- data.frame(factid = as.integer(pfacts[,1]))
dsn <- "********"
con <- odbcConnect(dsn)
#sqlSave(con, pfacts1, tablename="Emily", rownames=FALSE, colnames= FALSE, append=TRUE, varTypes=varTypes)
odbcClose(con)

write.table(coef, file = "ling_coef.csv", row.names = F, col.names = F, sep = ",")

system("cd .\\SVM");
system("winscp.bat");

sqlQuery(con, "drop table lh_coef")
sqlQuery(con, "CREATE TABLE lh_coef (
                    Distance double precision,
                    center double precision,
                    scale double precision,
                    FactId int distkey
                    )
                    sortkey(factid)")
sqlQuery(con, "copy lh_coef from \'/home/coef.csv\' with delimiter ','")

SQL to Merge Imps and Convs


select count(*) from lh_imps;
select * from lh_imps limit 10;
--drop table lh_imps;
create table lh_imps as
select a.userhashkey, a.datekey,  a.revenue,
       c.adid, d.sectionid,
       to_timestamp(a.datekey * 1000000.0 + a.timekey - 1000000, 'YYYYMMDDHHMISS') as impdatetime
from databasefactevent a
join databasedimcampaign b using(campaignkey)
join databasedimad c using(adkey)
join databasedimsection d using(sectionkey)
where a.eventtypeid = 1
and b.campaignid = 57464
and a.datekey between 20120901 and 20120930
order by a.userhashkey, impdatetime;


select count(*) from lh_imps2;
select * from lh_imps2 limit 10;
--drop table lh_imps2;
create table lh_imps2 as
select a.*,
       row_number() over(partition by userhashkey order by userhashkey, impdatetime, adid, sectionid) frequency,
       sum(revenue) over(partition by userhashkey order by userhashkey, impdatetime, adid, sectionid) cum_revenue
from lh_imps a;

select count(*) from lh_conv;
select * from lh_conv limit 10;
--drop table lh_conv;
create table lh_conv as
select a.eventkey, a.datekey as convdate, a.userhashkey, c.adid, d.sectionid,
       to_timestamp(a.datekey * 1000000.0 + a.timekey - 1000000, 'YYYYMMDDHHMISS') as convdatetime
from databasefactconversion a
join databasedimcampaign b using(campaignkey)
join databasedimad c using(adkey)
join databasedimsection d using(sectionkey)
where b.campaignid = 57464
and a.datekey between 20120901 and 20120930;

select count(*) from lh_conv2;
select * from lh_conv2 limit 10;
--drop table lh_conv2;
create table lh_conv2 as
select b.eventkey, b.userhashkey, b.adid, b.sectionid, b.convdate,  b.convdatetime,
       max(a.impdatetime) as lastImpTime,
       max(a.datekey) as lastImpdatekey,
       max(a.frequency) as lastImpfrequency
from lh_imps2 a
join lh_conv b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid
where a.impdatetime < b.convdatetime
group by 1, 2, 3, 4, 5, 6;

select count(*) from lh_conv3;
select count(*) from lh_conv3 where converted = 1;
select * from lh_conv3 where converted = 1 limit 10;
--drop table lh_conv3;
create table lh_conv3 as
select a.*,
       (case when b.userhashkey is not null then 1 else 0 end ) converted,
       (case when b.userhashkey is not null then b.convdate - a.datekey  else -1 end ) days2conv
from lh_imps2 a
left join lh_conv2 b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid and a.impdatetime = b.lastImpTime;

select userhashkey, adid, sectionid, impdatetime, count(*)
from lh_conv3
where converted = 1
group by 1,2,3,4
having count(*)>2;

select count(*) from lh_conv4;
select count(*) from lh_conv4 where converted = 1;
select * from lh_conv4 where converted = 1 limit 10;
select * from lh_conv4 where converted = 0 limit 10;
--drop table lh_conv4;
create table lh_conv4 as
select a.*,
       (case when b.userhashkey is not null then 1 else 0 end ) converted,
       (case when b.userhashkey is not null then b.convdate - a.datekey  else -1 end ) days2conv
from lh_imps2 a
left join (
select userhashkey, adid, sectionid, max(convdate) as convdate,
       max(convdatetime) as convdatetime,
       max(lastImpTime) as lastImpTime,
       max(lastImpdatekey) as lastImpdatekey
from lh_conv2
group by 1,2,3
) b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid and a.impdatetime = b.lastImpTime;