Friday, October 17, 2014

NLS model in R

names(data) <- c("freq", "imp", "uuser", "click", "action");
data$CTR <- data$click/data$imp;
data$CVR <- data$action/data$imp;

data <- data[1:41,];
beta0 <- sum(data$action)/sum(data$imp);
beta1 <- max(data$action/data$imp);
beta2 <- -5/log(1/2);

decay <- nls(formula = CVR ~ b0 + b1 *exp(-freq/b2),
                 data=data,
                 start=c(b0=beta0, b1=beta1, b2=beta2),
                 trace = T)
summary(decay)
##Residual standard error: 5.08e-06 on 137648 degrees of freedom
## the coefficients only:
coef(decay)
## including their SE, etc:
coef(summary(decay))
plot(data$freq,data$CVR, main = "nls(*), data, true function and fit, n=100")
#curve(beta0 + beta1 * exp(-x / beta2), col = 4, add = TRUE)
lines(data$freq, predict(decay), col = 2)

result <- data.frame(data$freq,data$CVR,predict(decay));

ggplot(data, aes(y=data$CVR, x=data$freq)) +
  geom_point(stat = "identity", fill = "red", size = 3) +
#  stat_smooth() +
  geom_line(aes(y=predict(decay), x=data$freq), color = 'red', size = 1, lty = 2) +
  ggtitle("CVR by Frequency") +
  ylab("CVR") + xlab("Frequency in one day") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank()) +
  scale_x_continuous(breaks = seq(0, 30, by = 2))

## starting values via a quadratic approximation
negexp.sv <- function(x, y){
  mx <- mean(x)
  x1 <- x - mx
  x2 <- ((x - mx)^2)/2
  b  <- as.vector(lm(y ~ x1 + x2)$coef)
  b2 <- b[2]/b[3]
  b  <- as.vector(lm(y ~ exp(-x/b2))$coef)
  parms <- cbind(b[1], b[2], b2)
  parms
}
b.start <- negexp.sv(data$freq, data$CVR)
##specify gradient
expn <- function(b0,b1,b2,x){
  temp <- exp(-x/b2)
  model.func <- b0+b1*temp
  D <- cbind(1, temp, (b1*x*temp)/b2^2)
  dimnames(D) <- list(NULL, c("b0","b1","b2"))
  attr(model.func, "gradient") <- D
  model.func
}

decay <- nls(formula = CVR ~ expn(b0, b1, b2, freq),
             data=data,
##             start=c(b0=b.start[1], b1=b.start[2], b2=b.start[3]),
             start=c(b0=beta0, b1=beta1, b2=beta2),
             trace = T)
summary(decay)
##Residual standard error: 5.08e-06 on 137648 degrees of freedom


for (i in seq(300,1000,100)){
  print(i);
  data1 <- data[1:i,];
  decay <- nls(formula = CVR ~ expn(b0, b1, b2, freq),
               data=data1,
##               start=c(b0=b.start[1], b1=b.start[2], b2=b.start[3]),
               start=c(b0=beta0, b1=beta1, b2=beta2),
               trace = F)
  print(coef(summary(decay)))


}

Thursday, October 9, 2014

Text Mining - ngram in R

##Install the needed packages
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", "cluster", "igraph", "fpc")
install.packages(Needed, dependencies=TRUE)

install.packages("Rcampdf", repos = "http://datacube.wu.ac.at/", type = "source")
#slam needs to be installed
install.packages("/Users/tkmahll/Downloads/slam_0.1-37.tgz", repos = NULL, type="source")


##load the library
library(xlsx)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)


#read in the dataset
sellouts <- read.xlsx("sellouts.xlsx", sheetName = "Products")

head(sellouts$Name)
names(sellouts)
sellouts$Name[1:100]

##select only the Product Name column for analysis
sellout.prod <- as.list(sellouts[,1])


##Preprocess the dataset
#Remove punctuation
product <- Corpus(VectorSource(sellout.prod))
product <- tm_map(product, removePunctuation)
product <- tm_map(product, removeNumbers)
product <- tm_map(product, tolower)
product <- tm_map(product, removeWords, stopwords("english"))
#product <- tm_map(product, stemDocument)
product <- tm_map(product, stripWhitespace)

writeLines(as.character(product[[333]])) #333, 1227, 1228, 1229, 337

#keep it as plaintextdocument
product <- tm_map(product, PlainTextDocument)
#change to the character type
product.1 <- lapply(product, as.character)


#create the object
prod.docs <- Corpus(VectorSource(product.1))
prod.docs <- tm_map(prod.docs, PlainTextDocument)

# Bigram tokenizer for building the term-document / document-term matrices
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

tdm <- TermDocumentMatrix(prod.docs, control = list(tokenize = BigramTokenizer))
class(tdm)
dtm <- DocumentTermMatrix(prod.docs, control = list(tokenize = BigramTokenizer))


##Create a WordCloud to Visualize the Text Data
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)

wf <- data.frame(word=names(freq), freq=freq)
head(wf)

# Create the word cloud
set.seed(123)
pal = brewer.pal(9,"BuPu")
wordcloud(words = wf$word,
          freq = wf$freq,
          scale = c(3,.8),
          random.order = F,
          colors = pal,
          max.words = 30)



##Draw the plot of the frequency
library(ggplot2)
p <- ggplot(subset(wf, freq>10), aes(word, freq))  
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p


####################################
##Tri-gram analysis
# Trigram tokenizer for building the term-document / document-term matrices
TrigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

tdm.tri <- TermDocumentMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))
class(tdm)
dtm.tri <- DocumentTermMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))


##Create a WordCloud to Visualize the Text Data
freq.tri <- sort(colSums(as.matrix(dtm.tri)), decreasing=TRUE)
head(freq.tri, 20)

wf.tri <- data.frame(word=names(freq.tri), freq=freq.tri)
head(wf.tri)


##Draw the plot of the frequency
p.tri <- ggplot(subset(wf.tri, freq>3), aes(word, freq))  
p.tri <- p.tri + geom_bar(stat="identity")
p.tri <- p.tri + theme(axis.text.x=element_text(angle=45, hjust=1))
p.tri

Friday, October 3, 2014

Text Mining - word cloud in R

##### Read-in File

```
memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))
sessionInfo()

library(data.table)
library(NLP)
library(tm) # Framework for text mining.
library(SnowballC) # Provides wordStem() for stemming.
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2) # Plot word frequencies.
library(Rgraphviz) # Correlation plots.

setwd("seach index");

search = read.csv("search201408.txt", row.names = NULL, header = F, sep = "\t", quote = "", stringsAsFactors = FALSE)
search = data.table(search)
search <- search[,V2:=NULL]
setnames(search, c("query", "count"))

> dim(search)
[1] 669546      2
> head(search)
               query  count
1:            toyota 303077
2:     toyota trucks 223888
3:        toyota.com 115706
4:     toyota tacoma  62982
5: toyota highlander  48905
6: 2015 toyota camry  46997

search1 <- search[count>1000, ];
dim(search1)

search1$idx <- 1:nrow(search1)
tmp = dcast.data.table(search1, idx ~ query, value.var = "count", fun = sum)
tmp = tmp[,idx:=NULL]
dtm <- tm::as.DocumentTermMatrix(tmp, weighting=weightTf)
dtm

library(wordcloud)
set.seed(123)
# `freq` is computed below under "Exploring the Document Term Matrix"
wordcloud(names(freq), freq, min.freq=5000, colors=brewer.pal(6, "Dark2"))

```

##### Loading a Corpus
```
doc.frame <- DataframeSource(search)
doc.corpus <- Corpus(doc.frame)
class(doc.corpus)
class(doc.corpus[[1]])

```

##### Exploring a Corpus

```

inspect(doc.corpus[1])

```

#### Preparing the Corpus
```
# conversion to lower case
doc.corpus <- tm_map(doc.corpus, tolower)

# remove numbers
doc.corpus <- tm_map(doc.corpus, removeNumbers)
inspect(doc.corpus[1])

# remove punctuation
doc.corpus <- tm_map(doc.corpus, removePunctuation)
inspect(doc.corpus[11])

# remove English stop words
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
inspect(doc.corpus[1111])

# strip whitespace
doc.corpus <- tm_map(doc.corpus, stripWhitespace)

# specific transformations
for (j in seq(doc.corpus))
{
doc.corpus[[j]] <- gsub("\t\t", " ", doc.corpus[[j]])
doc.corpus[[j]] <- gsub("/", " ", doc.corpus[[j]])
doc.corpus[[j]] <- gsub("@", " ", doc.corpus[[j]])
}
inspect(doc.corpus[16])

doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
```

#### Stemming
```
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
```

#### Creating a Document term Matrix
```
dtm <- DocumentTermMatrix(doc.corpus)
inspect(dtm[1:5, 1000:1005])
inspect(dtm[1:5, 100:105])

tdm <- TermDocumentMatrix(doc.corpus)
tdm
```

#### Exploring the Document Term Matrix
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)
tail(freq, 20)

```

#### Distribution of Term Frequencies

```
dim(dtm)
dtms <- removeSparseTerms(dtm, 0.5)
dim(dtms)
inspect(dtms)

freq <- colSums(as.matrix(dtms))
freq
table(freq)

```

#### Identifying Frequent Items and Associations

```
findFreqTerms(dtm, lowfreq=1000)

findFreqTerms(dtm, lowfreq=100)

findAssocs(dtm, "think", corlimit=0.6)

```

#### Correlations Plots

```
# plot(dtm, terms=findFreqTerms(dtm, lowfreq=100)[1:50], corThreshold=0.5)
```

#### Plotting Word Frequencies
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)

wf <- data.frame(word=names(freq), freq=freq)
head(wf)

p <- ggplot(subset(wf, freq>500), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
```

#### Plotting Word Cloud

```
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=500)
```

Friday, September 26, 2014

MCMC in R

Random Variable Generation

Monte Carlo methods rely on the possibility of producing a supposedly endless flow of random variables from well-known or new distributions. Such simulation is, in turn, based on the production of uniform random variables on the interval (0, 1).

# check the generator
Nsim = 10^4
x=runif(Nsim)
x1=x[-Nsim]
x2=x[-1]
par(mfrow=c(1,3))
hist(x)
plot(x1,x2)
acf(x)

# Inverse Transform
The probability integral transform says that a random variable from any continuous distribution can be converted to a standard uniform random variable by applying its CDF; conversely, applying the inverse CDF (quantile function) to uniform draws produces draws from the target distribution, which is what the examples below exploit.

# generate exponential
Nsim = 10^4
U=runif(Nsim)
X=-log(U)
Y=rexp(Nsim)
par(mfrow=c(1,2))
hist(X,freq=F)
hist(Y,freq=F)

# generate Chisq(6)
U=runif(3*10^4)
U=matrix(data=U, nrow=3)
X=-log(U)
X=2*apply(X,2,sum)
Y=rchisq(Nsim,6)
par(mfrow=c(1,2))
hist(X,freq=F)
hist(Y,freq=F)

# generate normal by Box-Muller
U=runif(2*10^4)
U=matrix(data=U, ncol=2)
X1=sqrt(-2*log(U[,1]))*cos(2*pi*U[,2])
X2=sqrt(-2*log(U[,1]))*sin(2*pi*U[,2])
Y=rnorm(10^4)
par(mfrow=c(1,3))
hist(X1,freq=F)
hist(X2,freq=F)
hist(Y)

# Poisson for large lambda
Nsim=10^4
lambda=100
spread=3*sqrt(lambda)
t=round(seq(max(0,lambda-spread),lambda+spread,1))
prob=ppois(t,lambda)
X=rep(0,Nsim)
for (i in 1:Nsim){
u=runif(1)
X[i]=t[1]+sum(prob<u)-1
}
hist(X,freq=F)

# Accept-Reject Algorithm to generate beta with Uniform
# 1. generate a candidate random variable
# 2. accept it only if it passes a test

Nsim=2500
a=2.7
b=6.3
M=2.67
u=runif(Nsim, max=M)
y=runif(Nsim)
x=y[u<dbeta(y,a,b)]
hist(x,freq=F,breaks=100)

U = runif(100000,0,1)
accept = c()
for(i in 1:length(U)){
U1 = runif(1, 0, 1)
if(dunif(U[i], 0, 1)*3*U1 <= dbeta(U[i], 6, 3)) {
accept[i] = 'Yes'
}
else if(dunif(U[i],0,1)*3*U1 > dbeta(U[i], 6, 3)) {
accept[i] = 'No'
}
}
T = data.frame(U, accept = factor(accept, levels= c('Yes','No')))
hist(T[,1][T$accept=='Yes'], breaks = seq(0,1,0.01), freq = FALSE, main = 'Histogram of X', xlab = 'X')

Monte Carlo Integration

The validity of Monte Carlo approximations relies on the Law of Large Numbers, and their versatility comes from the ability to represent an integral as an expectation. The Law of Large Numbers justifies the convergence of the sample mean of simulated values to the integral, and the Central Limit Theorem lets us assess the error of the approximation.
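
As a minimal sketch (the integrand h and the sample size are arbitrary choices for illustration):

# Monte Carlo estimate of the integral of h(x) = exp(-x^2) over (0, 1)
h <- function(x) exp(-x^2)
Nsim <- 10^4
U <- runif(Nsim)
est <- mean(h(U))            # LLN: the sample mean converges to the integral
se  <- sd(h(U))/sqrt(Nsim)   # CLT: approximate standard error of the estimate
est; est + c(-2, 2)*se       # estimate with rough 95% bounds
integrate(h, 0, 1)           # deterministic benchmark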


Friday, September 19, 2014

View the source code for a function in R

The S3 method dispatch system
> methods(t)
[1] t.data.frame t.default    t.ts*    

   Non-visible functions are asterisked
> methods(class="ts")
 [1] aggregate.ts     as.data.frame.ts cbind.ts*        cycle.ts*    
 [5] diffinv.ts*      diff.ts          kernapply.ts*    lines.ts      
 [9] monthplot.ts*    na.omit.ts*      Ops.ts*          plot.ts      
[13] print.ts         time.ts*         [<-.ts*          [.ts*        
[17] t.ts*            window<-.ts*     window.ts*    

   Non-visible functions are asterisked
"Non-visible functions are asterisked" means the function is not exported from its package's namespace. You can still view its source code via the ::: function, or by using getAnywhere(). getAnywhere() is useful because you don't have to know which package the function came from.
> getAnywhere(t.ts)
A single object matching ‘t.ts’ was found
It was found in the following places
  registered S3 method for t from namespace stats
  namespace:stats
with value

function (x)
{
    cl <- oldClass(x)
    other <- !(cl %in% c("ts", "mts"))
    class(x) <- if (any(other))
        cl[other]
    attr(x, "tsp") <- NULL
    t(x)
}
<bytecode: 0x294e410>
<environment: namespace:stats>

The S4 method dispatch system

The S4 system is a newer method dispatch system and is an alternative to the S3 system. Here is an example of an S4 function:
> library(Matrix)
Loading required package: lattice
> chol2inv
standardGeneric for "chol2inv" defined from package "base"

function (x, ...)
standardGeneric("chol2inv")
<bytecode: 0x000000000eafd790>
<environment: 0x000000000eb06f10>
Methods may be defined for arguments: x
Use  showMethods("chol2inv")  for currently available ones.

The output already offers a lot of information: standardGeneric is the indicator of an S4 generic function, and the output helpfully tells us how to list the defined S4 methods:

> showMethods(chol2inv)
Function: chol2inv (package base)
x="ANY"
x="CHMfactor"
x="denseMatrix"
x="diagonalMatrix"
x="dtrMatrix"
x="sparseMatrix"

getMethod can be used to see the source code of one of the methods:

> getMethod("chol2inv", "diagonalMatrix")
Method Definition:

function (x, ...)
{
    chk.s(...)
    tcrossprod(solve(x))
}
<bytecode: 0x000000000ea2cc70>
<environment: namespace:Matrix>

Signatures:
        x            
target  "diagonalMatrix"
defined "diagonalMatrix"

Some generics have methods with more complex signatures, for example

require(raster)
showMethods(extract)
Function: extract (package raster)
x="Raster", y="data.frame"
x="Raster", y="Extent"
x="Raster", y="matrix"
x="Raster", y="SpatialLines"
x="Raster", y="SpatialPoints"
x="Raster", y="SpatialPolygons"
x="Raster", y="vector"

To see the source code for one of these methods the entire signature must be supplied, e.g.
getMethod("extract" , signature = c( x = "Raster" , y = "SpatialPolygons") )

Functions that call compiled code

Note that "compiled" does not refer to byte-compiled R code as created by the compiler package. The <bytecode: 0x294e410> line in the above output indicates that the function is byte-compiled, and you can still view the source from the R command line.

Functions that call .C, .Call, .Fortran, .External, .Internal, or .Primitive are calling entry points in compiled code, so you will have to look at sources of the compiled code if you want to fully understand the function. Packages may use .C, .Call, .Fortran, and .External; but not .Internal or .Primitive, because these are used to call functions built into the R interpreter.
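
For example, printing a .Primitive and the body of an .Internal wrapper shows the entry points directly (output may vary slightly by R version):

> sum
function (..., na.rm = FALSE)  .Primitive("sum")
> body(unlist)
.Internal(unlist(x, recursive, use.names))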

Calls to some of the above functions may use an object instead of a character string to reference the compiled function. In those cases, the object is of class "NativeSymbolInfo", "RegisteredNativeSymbol", or "NativeSymbol"; and printing the object yields useful information. For example, optim calls .External2(C_optimhess, res$par, fn1, gr1, con) (note that's C_optimhess, not "C_optimhess"). optim is in the stats package, so you can type stats:::C_optimhess to see information about the compiled function being called.

Compiled code in a package

If you want to view compiled code in a package, you will need to download/unpack the package source. The installed binaries are not sufficient. A package's source code is available from the same CRAN (or CRAN compatible) repository that the package was originally installed from. The download.packages() function can get the package source for you.

download.packages(pkgs = "Matrix",
                  destdir = ".",
                  type = "source")

This will download the source version of the Matrix package and save the corresponding .tar.gz file in the current directory. Source code for compiled functions can be found in the src directory of the uncompressed and untarred file. The uncompressing and untarring step can be done outside of R, or from within R using the untar() function. It is possible to combine the download and expansion steps into a single call (note that only one package at a time can be downloaded and unpacked in this way):

untar(download.packages(pkgs = "Matrix",
                        destdir = ".",
                        type = "source")[,2])

Alternatively, if the package development is hosted publicly (e.g. via GitHub, R-Forge, or RForge.net), you can probably browse the source code online.

Compiled code in a base package

Certain packages are considered "base" packages. These packages ship with R and their version is locked to the version of R. Examples include base, compiler, stats, and utils. As such, they are not available as separate downloadable packages on CRAN as described above. Rather, they are part of the R source tree in individual package directories under /src/library/. How to access the R source is described in the next section.

Compiled code built into the R interpreter

If you want to view the code built-in to the R interpreter, you will need to download/unpack the R sources; or you can view the sources online via the R Subversion repository or Winston Chang's github mirror.

Uwe Ligges's R news article (PDF) (p. 43) is a good general reference of how to view the source code for .Internal and .Primitive functions. The basic steps are to first look for the function name in src/main/names.c and then search for the "C-entry" name in the files in src/main/*.

Monday, September 15, 2014

Cluster Analysis in R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

library(MARSS)
library(ggplot2)
library(reshape2)
library(data.table)

#############################
## Hierarchical Clustering
#############################
## could generate more features;
### growth <- function(input)
### momentum <- function(input)
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))

## compute distance
d <- dist(data[,c(5,6,7)])
m <- nrow(data)
as.matrix(d)[1:5, 1:5]

## cluster methods
hc_cluster <- hclust(d, method="complete")
plot(hc_cluster)
rect.hclust(hc_cluster, k=5, border="red")
data$segments <- cutree(hc_cluster, k=5)

## examine results (ddply() is from the plyr package)
ddply(data, c('segments'), summarise,
      n=length(growth),
      growthmean=mean(growth),
      momentummean=mean(momentum))

rect <- data.table(ymin=round(min(data$growth),2),
                   ymax=round(max(data$growth),2),
                   xmin=round(min(data$momentum),2),
                   xmax=round(max(data$momentum),2))

pdf("~/Desktop/plot3.pdf", width=15, height=8.5)
ggplot(data, aes(y=growth, x=momentum, size=scalecnt, label=style), guide=FALSE) +
  geom_point(position = "jitter", colour= "red", fill= "red", shape=21) +
  geom_text_repel(aes(y = growth, x = momentum, color=factor(segments)), size=5, angle = -15) +   # geom_text_repel() is from the ggrepel package
  theme_bw() +
  ggtitle("Cluster Analysis") +
  xlab("Growth") + ylab("Momemtum") +
  scale_y_continuous(name="Growth", labels=scales::percent, limits=c(-max(abs(rect$ymin),abs(rect$ymax))*1.4, max(abs(rect$ymin),abs(rect$ymax))*1.4)) +
  scale_x_continuous(name="Momentum", labels=scales::percent, limits=c(-max(abs(rect$xmin),abs(rect$xmax))*1.4, max(abs(rect$xmin),abs(rect$xmax))*1.4)) +
  theme(plot.title = element_text(face = "bold", size = 16)) +
  theme(axis.text.x = element_text(face = "bold", size = 10)) +
  theme(axis.text.y = element_text(face = "bold", size = 10)) +
  theme(axis.title.x = element_text(face = "bold", size = 10)) +
  theme(axis.title.y = element_text(face = "bold", size = 10, angle = 90)) +
  theme(legend.position = 'bottom') +
  theme(legend.text = element_text(family = "sans", face = "bold", size = 10)) +
  theme(legend.title = element_text(family = "sans", face = "bold", size = 10)) +
  guides(size=guide_legend("Search Volume Index")) +
  geom_vline(xintercept=c(0), linetype="dotted") +
  geom_hline(yintercept=c(0), linetype="dotted")
dev.off()


#############################
## k_mean Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
kmean_cluster <- kmeans(data[,c(5,6,7)], centers=5)
summary(kmean_cluster)
data$segments = kmean_cluster$cluster
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")

##Example2
data <-fread("shopper.txt")
setnames(data, c("id","lable","ageBucket","gender","country","dvs","pvs","adviews","adclicks",
"allnumevents","wk1numevents","ratio_wk1numevents_all","alldv","wk1dv","ratio_wk1dv_all",
"itemcnt","itemtot","item_cnt_30day","item_tot_30day","item_cnt_30dayPlus","item_tot_30dayPlus"));

> dim(data)
[1] 1083701      21

data$rowid <- 1:nrow(data)

## Model One - Item Count
## Drop Outliers

```
data2 <- data[, c(9,20,22), with = F]

> summary(data2)

> quantile(data2$adclicks, c(.99,.999,.9999))
   99%  99.9% 99.99%
  6.00  45.00 159.89
> sum(data$adclicks>160)
[1] 109

data2 <- data2[data2$adclicks<160]
> summary(data2)

> quantile(data2$item_cnt_30dayPlus, c(.99,.999,.9999))
   99%  99.9% 99.99%
    46    129    335
> sum(data2$item_cnt_30dayPlus>335)
[1] 108

data2 <- data2[data2$item_cnt_30dayPlus<335]

```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data
```
func1 <- function(x){
return(log(x+1)/log(max(x)))
}

data3 <- data2[, c(1,2,3), with = F]
data3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]


```

## Scale Transformed data

```
func2 <- function(x){
return( (x-mean(x))/sqrt(sum((x-mean(x))^2)/(length(x)-1)))
}

 data4 <- data3[, c(1,2,3), with = F]
 data4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data
```
data5 <- data4[,c(1,2), with = F]
results <- data5
for (i in c(2:2)){
  print('----------------------------------------------------------');
  print(i);
  set.seed(33);
  model <- kmeans(data5, i);
  print("model size");
  print(cbind(1:i,model$size));
  print("model centers")
  print(model$centers);
  results<- cbind(results, model$cluster);
}

[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
     [,1]   [,2]
[1,]    1 667941
[2,]    2 415540
[1] "model centers"
     adclicks item_cnt_30dayPlus
1 -0.09521215         -0.6450849
2  0.15304447          1.0369126


data2$cluster = results$V2
data2[,list(users = .N,
                    group_adclicks=sum(adclicks)/.N,
                    group_item_cnt_30dayPlus=sum(item_cnt_30dayPlus)/.N), by='cluster'];
```


## Model Two - Item Total
## Drop Outliers

```
data_2 <- data[, c(9,21,22), with = F]
> head(data_2)
> summary(data_2)
    adclicks         item_tot_30dayPlus
 Min.   :   0.0000   Min.   :     0.00
 1st Qu.:   0.0000   1st Qu.:    23.94
 Median :   0.0000   Median :    65.70
 Mean   :   0.5131   Mean   :   172.36
 3rd Qu.:   0.0000   3rd Qu.:   176.10
 Max.   :1638.0000   Max.   :225247.28

> quantile(data_2$adclicks, c(.99,.999,.9999))
   99%  99.9% 99.99%
  6.00  45.00 159.89
> sum(data_2$adclicks>160)

data_2 <- data_2[data_2$adclicks<160]

> summary(data_2)

> quantile(data_2$item_tot_30dayPlus, c(.99,.999,.9999))
      99%     99.9%    99.99%
 1510.772  4389.105 16171.602

> sum(data_2$item_tot_30dayPlus>16171)
[1] 109

data_2 <- data_2[data_2$item_tot_30dayPlus<16171]

> summary(data_2)
```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data
```

data_3 <- data_2[, c(1,2,3), with = F]
data_3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]

```

## Scale Transformed data

```
 data_4 <- data_3[, c(1,2,3), with = F]
 data_4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data
```

data_5 <- data_4[,c(1,2), with = F]
results_ <- data_5

for (i in c(2:2)){
  print('----------------------------------------------------------');
  print(i);
  set.seed(33);
  model <- kmeans(data_5, i);
  print("model size");
  print(cbind(1:i,model$size));
  print("model centers")
  print(model$centers);
  results_<- cbind(results_, model$cluster);
}

[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
     [,1]   [,2]
[1,]    1 545531
[2,]    2 537952
[1] "model centers"
    adclicks item_tot_30dayPlus
1  0.1732808          0.7845828
2 -0.1757221         -0.7956365


data_2$cluster = 3-results_$V2
data_2[,list(users = .N,
                    group_adclicks=sum(adclicks)/.N,
                    group_item_tot_30dayPlus=sum(item_tot_30dayPlus)/.N), by='cluster'];
                 
```

## Merge Two Results

```
setkey(data2, rowid);
setkey(data_2, rowid);

comb = merge(data2, data_2, by ="rowid", all = TRUE)
comb1 = comb[,c(4,7),with=F]
> m <- table(comb1,useNA = "always"); m
         cluster.y
cluster.x      1      2   <NA>
     1    464209 203732      0
     2     73742 341718     80
     <NA>      1     81      0
> round(prop.table(m),5)
         cluster.y
cluster.x       1       2    <NA>
     1    0.42841 0.18802 0.00000
     2    0.06806 0.31537 0.00007
     <NA> 0.00000 0.00007 0.00000
```

Number of clusters
# Important packages (libraries)  

library(cluster) # Data set votes.repub, and cluster technique 
library(mclust) # Clustering by mixtures of Gaussians 
library(MASS) # various statistics utilities (kmeans itself is in the stats package)
library(kernlab) # Kernel kMeans, spectral clustering 

# Function for finding the within-cluster sum of squares when K=1

WCSS1 <- function(x)
  return(ifelse(ncol(x) > 1,
                sum(rowSums((x - matrix(rep(apply(x, 2, mean), times = nrow(x)),
                                        byrow = TRUE, ncol = ncol(x)))^2)),
                (nrow(x) - 1)*var(x)))

# Function for finding and plotting the number of clusters

nb.clust <- function(x){
  K <- 7                # the maximum number of clusters
  n <- nrow(x)          # sample size
  wcss <- matrix(WCSS1(x), ncol=1, nrow=K)
  for (k in 2:K)
    wcss[k] <- sum(kmeans(x, centers=k)$withinss)

  plot(1:K, wcss, type="b", xlab="Number of Clusters",
       ylab="Within cluster sum of squares")

  dw <- abs(diff(wcss[-1]))
  return(min(which(cumsum(dw/sum(dw)) > 0.8)))
}

> nb.clust(faithful) 
[1] 3 
> kmeans.faithful <- kmeans(faithful,3) 
> kmeans.faithful$centers 

> x11() 
> plot(faithful, col = kmeans.faithful$cluster) 
> points(kmeans.faithful$centers, col = 1:3, pch = 8, cex=6) 

# Imputing 

library(DMwR) 
xvotes <- centralImputation(votes.repub) 
kmeans(xvotes, 2) 
# Note that n remains unchanged 

# Exclusion

votesx <- na.exclude(votes.repub)
kmeans(votesx, 2)
# Note that the sample size is hugely reduced
# This can be very bad

# What difference does it make?

# kmeans can handle large q (many variables) with small n


#############################
##Gaussian Mixture Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
X <- data[,c(5,6,7)]
BIC = mclustBIC(X)   # mclustBIC() and Mclust() are from the mclust package
plot(BIC)
m_cluster <- Mclust(X, x = BIC)
plot(m_cluster, what = "classification")
data$segments = m_cluster$classification
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="Gaussian Mixture")   # clusplot() is from the cluster package


Classification Models in R

n=nrow(all)
ntrain <- round(n*0.7)
set.seed(333)
tindex <- sample(n, ntrain)
all$y <- all$y==1
train <- all[tindex, ]
test <- all[-tindex, ]


##################################################################
## Naive Bayes
##################################################################
train$y=factor(train$y, levels=c(TRUE, FALSE))
newdata <- data.frame(y=test$y,
                      is_verified=test$is_verified,
                      pageviews1=test$pageviews1,
                      follows1=test$follows1,
                      likes1=test$likes1,
                      reblogs1=test$reblogs1,
                      original_posts1=test$original_posts1,
                      searches1=test$searches1,
                      unfollows1=test$unfollows1,
                      received_engagments1=test$received_engagments1,
                      time1=test$time1,
                      regi_geo1=test$regi_geo1)
# NaiveBayes() is from the klaR package; prediction() and performance() below are from ROCR
nb1 <- NaiveBayes(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                  + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data=train)
tmp <- predict(nb1, newdata = newdata)
table(tmp$class, test$y)
test$score = tmp$posterior[, "TRUE"]   # use the posterior probability of the positive class as the score
#calculate auc;
pred = prediction(test$score, test$y);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values); auc;
perf <- performance(pred,"tpr","fpr");
plot(perf);
abline(0, 1, lty = 2);
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])


##################################################################
## Decision Tree
##################################################################
require(rpart)
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)
plot(ct)
text(ct)
pred <- predict(ct, newdata = test, type = 'class')
prop.table(table(pred, test$gear))

ct <- rpart(as.factor(y) ~ is_verified + pageviews1 + follows1 + likes1
            + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1,
            data = train, minsplit = 5)
summary(ct)
plot(ct)
text(ct)

test$score = predict(ct, newdata = test, type = "prob")[,2]
#calculate auc;
pred = prediction(test$score, test$y);
plot(performance(pred, "tpr", "fpr"))
abline(0, 1, lty = 2)
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values); auc;
perf <- performance(pred,"tpr","fpr");
plot(perf);
abline(0, 1, lty = 2);
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])


##################################################################
## KNN;
##################################################################
train_x <- data.frame(is_verified=as.numeric(train$is_verified),
                      pageviews1=as.numeric(train$pageviews1),
                      follows1=as.numeric(train$follows1),
                      likes1=as.numeric(train$likes1),
                      reblogs1=as.numeric(train$reblogs1),
                      original_posts1=as.numeric(train$original_posts1),
                      searches1=as.numeric(train$searches1),
                      unfollows1=as.numeric(train$unfollows1),
                      received_engagments1=as.numeric(train$received_engagments1),
                      time1=as.numeric(train$time1),
                      regi_geo1=as.numeric(train$regi_geo1))
train_y=train$y
test_x <- data.frame(is_verified=as.numeric(test$is_verified),
                     pageviews1=as.numeric(test$pageviews1),
                     follows1=as.numeric(test$follows1),
                     likes1=as.numeric(test$likes1),
                     reblogs1=as.numeric(test$reblogs1),
                     original_posts1=as.numeric(test$original_posts1),
                     searches1=as.numeric(test$searches1),
                     unfollows1=as.numeric(test$unfollows1),
                     received_engagments1=as.numeric(test$received_engagments1),
                     time1=as.numeric(test$time1),
                     regi_geo1=as.numeric(test$regi_geo1))

knn_pred <- knn(train_x, test_x, train_y, k=5)   # knn() is from the class package; avoid the name `prediction`, which ROCR uses below
table(knn_pred, test$y)

test$score = as.integer(knn_pred)
#calculate auc;
pred = prediction(test$score, test$y);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values); auc;
perf <- performance(pred,"tpr","fpr");
plot(perf);
abline(0, 1, lty = 2);
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

p <- ggplot(test, aes(pageviews1, follows1))
p + geom_point(aes(color=test$y))


##################################################################
## Support Vector Machines
##################################################################
## 0.6148956 0.2297911
## svm() is from the e1071 package
svm <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
           + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data=train,
           method = "C-classification", kernel = "radial", cost = 100, gamma = 1)
test$score <- predict(svm, test)
AUC(test$score, test$y)
KS(test$score, test$y)

## 0.621205 0.2424101
svm <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
           + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data=train,
           method = "C-classification", kernel = "radial", cost = 1, gamma = 1)
test$score <- predict(svm, test)
AUC(test$score, test$y)
KS(test$score, test$y)


##################################################################
## Artificial Neural Networks
##################################################################
train$y=as.integer(train$y)
train$y1=1-train$y
train$is_verified=as.integer(train$is_verified)
train$time1=as.integer(train$time1)
train$regi_geo1=as.integer(train$regi_geo1)

# neuralnet() and compute() are from the neuralnet package
nn1 <- neuralnet(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                 + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1,
                 data=train, hidden=c(4))
plot(nn1)

test_x <- data.frame(is_verified=as.integer(test$is_verified),
                     pageviews1=as.integer(test$pageviews1),
                     follows1=as.integer(test$follows1),
                     likes1=as.integer(test$likes1),
                     reblogs1=as.integer(test$reblogs1),
                     original_posts1=as.integer(test$original_posts1),
                     searches1=as.integer(test$searches1),
                     unfollows1=as.integer(test$unfollows1),
                     received_engagments1=as.integer(test$received_engagments1),
                     time1=as.integer(test$time1),
                     regi_geo1=as.integer(test$regi_geo1))
prediction <- compute(nn1, test_x)
prediction <- prediction$net.result

test$score <- prediction[,1]
AUC(test$score, test$y)
KS(test$score, test$y)


##################################################################
## Random Forest
##################################################################
Random forest consists of a number of decision trees. Every node in a decision tree is a condition on a single feature, chosen to split the dataset into two so that similar response values end up in the same set. The measure by which the (locally) optimal condition is chosen is called impurity: for classification it is typically Gini impurity or information gain/entropy, and for regression trees it is variance. Thus, when training a tree, one can compute how much each feature decreases the weighted impurity. For a forest, the impurity decrease from each feature can be averaged, and the features ranked according to this measure.

In particular, trees that are grown very deep tend to learn highly irregular patterns: they overfit their training sets, i.e. they have low bias but very high variance. Random forests are a way of averaging multiple deep decision trees, trained on different parts of the same training set, with the goal of reducing the variance. This comes at the expense of a small increase in bias and some loss of interpretability, but generally greatly boosts the performance of the final model.

install.packages('randomForest')
require(randomForest)
(rf <- randomForest(factor(gear) ~ ., data = train, ntree = 10))
plot(rf)
pred <- predict(rf, newdata = test, type = 'class')
prop.table(table(pred, test$gear))  

## importance of variables
rf <- randomForest(factor(gear) ~ ., data = train, ntree = 100, importance=T)
varImpPlot(rf, main="Importance of Variables")        


##################################################################
## Gradient Boosted Decision Tree
##################################################################
require("gbm")
gbm_perf <- gbm.perf(gbm_model, method = "cv")
predictions_gbm <- predict(gbm_model, newdata = testdf[, -response_column],
                           n.trees = gbm_perf, type = "response")


Tuesday, August 19, 2014

Time Series Analysis in R

##################################################################
## Second Order Summaries
##################################################################
The theory for time series is based on the assumption of second-order stationarity after removing any trends (including seasonal trends). A time series is said to be stationary if its mean, variance, and lag covariances are constant over time. Three common ways of dealing with a non-stationary series are differencing, log transformations, and removing sinusoidal (seasonal) components, sketched briefly below.

Check the acf function to explore the auto-covariance and auto-correlation, and make sure the series is constructed with the correct (possibly non-unit) frequency.
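
Assuming x is a weekly ts object (all names below are illustrative):

x_log  <- log(1 + x)                    # log transform to stabilize the variance
x_diff <- diff(x_log)                   # first difference to remove trend
t <- seq_along(x_log)
fit_season <- lm(as.numeric(x_log) ~ sin(2*pi*t/52) + cos(2*pi*t/52))   # sinusoidal (seasonal) component
x_deseason <- x_log - fitted(fit_season)                                # de-seasonalized series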

##################################################################
## Checking acf and pacf
##################################################################
par(mfrow = c(1, 2), pty="s")
acf(Sample$embroidered_top)
pacf(Sample$embroidered_top)


The plots of the above series show the pattern typical of seasonal series, and the auto-correlations do not damp down at large lags.

How to identify an AR(p) process
- The ACF tails off exponentially
- The PACF drops to 0 after lag p, where p is the order of the AR process
- If the ACF tails off very slowly, the time series may not be stationary

How to identify an MA(q) process
- The ACF drops to 0 after lag q, where q is the order of the MA process
- The PACF tails off exponentially

A quick simulated check of these signatures is sketched below.
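
The AR(2) and MA(1) coefficients here are arbitrary illustrative values:

set.seed(123)
ar2 <- arima.sim(model = list(ar = c(0.6, 0.3)), n = 500)
ma1 <- arima.sim(model = list(ma = 0.8), n = 500)
par(mfrow = c(2, 2))
acf(ar2); pacf(ar2)   # ACF tails off; PACF cuts off after lag 2
acf(ma1); pacf(ma1)   # ACF cuts off after lag 1; PACF tails off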

##################################################################
## Spectral analysis
##################################################################

The basic tool for estimating the spectral density is the periodogram: for each frequency we effectively compute the squared correlation between the series and sine/cosine waves of that frequency. The raw periodogram is then smoothed by repeated use of modified Daniell smoothers, which are moving averages giving half weight to the end values of the span. The bandwidth is a measure of the size of the smoothing window. Trial and error is needed to choose the spans; they should be odd integers, and it helps to produce a smooth plot if they differ and at least two are used.

spectrum(Sample$embroidered_top)
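
The spans argument applies the modified Daniell smoothers discussed above; the values below are just a starting point to be tuned by trial and error:

spectrum(Sample$embroidered_top, spans = c(3, 5))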

## Cumulative periodogram
par(mfrow = c(1, 1), pty="s")
cpgram(logtraindt)


##################################################################
## Fit ARIMA models
##################################################################
ARIMA models are best for short-term forecasting because long-term forecasting is no better than using the mean of the time series.

## split train and test data set
train <- Sample[1:(nrow(Sample)-4)]
test <- Sample[(nrow(Sample)-3):nrow(Sample)]

## transformation
logtraindt <- log(1+train$embroidered_top)/log(100)
logtestdt <- log(1+test$embroidered_top)/log(100)

## AR model investigation
## All likelihoods assume a Gaussian distribution. The model with the smallest AIC is chosen.
ar1 <- ar(logtraindt, aic=T, order.max = 10); summary(ar1)
cpgram(ar1$resid, main="AR Max Model")

## case 1
fit <- arima(ts, order=c(1,0,0), list(order=c(2,1,0), period=5))
fore <- predict(fit, n.ahead=15)
# error bounds at 95% confidence level
U <- fore$pred + 2*fore$se
L <- fore$pred - 2*fore$se
ts.plot(ts, fore$pred, U, L, col=c(1,2,4,4), lty = c(1,1,2,2))
legend("topleft", c("Actual", "Forecast", "Error Bounds (95% Confidence)"), col=c(1,2,4), lty=c(1,1,2))

## case 2
## grid search for p,d,q
#for (i in 1:4){
#for (j in 1:4){
for (k in 1:4){
  arima1 <- arima(logtraindt, order=c(3,1,k))
  print(arima1$aic)
}
arima1 <- arima(logtraindt, order=c(4,4,3))
tsdiag(arima1)
arima1.fore <- predict(arima1, 8)
ts.plot(c(train$embroidered_top,arima1.fore$pred))
ts.plot(arima1.fore$pred,
        arima1.fore$pred - 2*arima1.fore$se,
        arima1.fore$pred + 2*arima1.fore$se)


##################################################################
## Plot ARIMA model Prediction
##################################################################
train$Date = as.Date(train$Date, "%Y-%m-%d")
pred.dt <- train$Date[length(train$Date)] + seq(1:8)*7
plot_dt=c(train$Date, pred.dt)
plot_pd=c(train$embroidered_top, test$embroidered_top, rep(NA,4))
plot_pd2=c(rep(NA, length(train$embroidered_top)),exp(arima1.fore$pred*log(100))-1)
plot_data=data.table(plot_dt, plot_pd, plot_pd2)
n=nrow(plot_data)
plot_data=plot_data[(n-52):n]

ggplot(plot_data, aes(x=plot_dt, y=plot_pd)) +
  geom_point(aes(x=plot_dt, y=plot_pd), col = "blue", size = 1) +
  geom_line(aes(x=plot_dt, y=plot_pd), size=1, linetype=1, color="blue") +
  geom_line(aes(x=plot_dt, y=plot_pd2), size=1, linetype=4, color="darkgreen") +
  geom_vline(xintercept = as.integer(as.Date("2016-10-23", "%Y-%m-%d")), linetype=4) +
  scale_x_date(labels=date_format("%b %y"), breaks=date_breaks("12 week")) +
  stat_smooth(color="red") +
  xlab("Search Index") +
  ylab("Date") +
  ggtitle("Forcast of 8 Weeks from ARIMA for Embroidered Top") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 14)) +
  theme(axis.text.y = element_text(face = "bold", size = 14)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(strip.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle=90)) +
  guides(fill=FALSE)


##################################################################
### Exponential smoothing
#### Fit in to exponential model
##################################################################
ts.mail <- ts(data$count)
ts.mail.forecast <- HoltWinters(ts.mail , beta=FALSE, gamma=FALSE)
ts.mail.forecast
plot(ts.mail.forecast)
ts.mail.forecast2 <- forecast.HoltWinters(ts.mail.forecast, h=100)
plot(ts.mail.forecast2)

##################################################################
## Double Exponential Smoothing with Linear Trend
##################################################################
  mHolt <- holt(tslogcnt)
  summary(mHolt)
  plot(mHolt)
  attributes(mHolt)
  mHolt$fitted

  fit1 <- ses(tslogcnt, h=5)
  fit2 <- holt(tslogcnt, h=5)
  fit3 <- holt(tslogcnt, exponential=TRUE, h=5)
  fit4 <- holt(tslogcnt, damped=TRUE, h=5)
  fit5 <- holt(tslogcnt, exponential=TRUE, damped=TRUE, h=5)

  plot(fit3, plot.conf=FALSE)
  lines(fit1$mean,col=2)
  lines(fit2$mean,col=3)
  lines(fit4$mean,col=5)
  lines(fit5$mean,col=6)
  legend("topleft", lty=1, pch=1, col=1:6,
         c("Data", "SES", "Holt's", "Exponential", "Additive Damped", "Multiplicative Damped"))


##################################################################
## Stationary Test
##################################################################
## stl function to extract components
stl_dau <- stl(ts_dau, s.window="periodic", robust=T)
names(stl_dau)
ts.plot(stl_dau$time.series)

## stationary test
## adf.test() is from the tseries package; a p-value below 0.05 rejects the null hypothesis of a unit root, suggesting a stationary series
adf.test(ts_dau)

## de-trend, de-seasonaly
stl_dau <- stl(ts_dau, s.window="periodic", robust=T)
head(stl_dau$time.series)
ts_dau_dt <- ts_dau - stl_dau$time.series[,2]
adf.test(ts_dau_dt)
ts_dau_sa <- ts_dau - stl_dau$time.series[,1]- stl_dau$time.series[,2]
adf.test(ts_dau_sa)

## lags, leads, ACF, PACF and CCF
ts_dau_lag1 <- dplyr::lag(ts_dau_sa, 1)
ts_dau_lead1 <- dplyr::lead(ts_dau_sa, 1)
acf_result <- Acf(ts_dau_sa)
pacf_result <- Pacf(ts_dau_sa)
## ccf(ts1, ts2)

##################################################################
## State Space Model
##################################################################
## tbats
ts_dau <- ts(log(data[,amount]), frequency=365.25, start=c(2015,1,17))
daily_data <- tbats(ts_dau) 
summary(daily_data)
t.forecast <- forecast(daily_data, h=14)
plot(t.forecast)
t.forecast
summary(t.forecast)
Box.test(t.forecast$residuals)

Thursday, August 7, 2014

Time Series Analysis - Multivariate Autoregressive State-Space Models

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=TRUE))
sessionInfo()

require(data.table)
require(stringr)
require(lubridate)
require(scales)

require(ggplot2)
require(gridExtra)
require(ggthemes)

require(MARSS)

## Read in individual data;
setwd("~/Desktop/R/KLAB/11_18_2016/individual")

## Copy results from one directory to current directory
dir.create(file.path("/Users/tkmaemd/Desktop/R/KLAB/11_18_2016/individual", "results"))
file.copy(from="/Users/tkmaemd/Desktop/R/KLAB/11_18_2016/batch/results",
          to="/Users/tkmaemd/Desktop/R/KLAB/11_18_2016/individual",
          recursive = TRUE, copy.mode = TRUE)

## Read-in raw data
files <- list.files(pattern = ".csv")
files
temp <- lapply(files, fread, sep=",")
data <- rbindlist( temp )
dim(data)


tmp=data[theme=='Dresses' & category=='Details']
Sample = dcast.data.table(tmp, Date + theme + category ~ keyword,value.var="ankle jeans")
names(Sample)=str_replace_all(names(Sample), " ", "_")
Sample <- Sample[, theme:=NULL]
Sample <- Sample[, category:=NULL]
Sample <- Sample[-1]

n=nrow(Sample)
dat = Sample[(n-51):n]
dat = dat[, Date:=NULL]

## model 1: one hidden state and i.i.d obs - R is diagonal and equal
## Fit the single state model, where the time series are assumed to be from the same population.
model1=list()
model1$R="diagonal and equal"
model1$Z=matrix(1,8,1) #matrix of 8 rows and 1 column (all 8 series load on one hidden state)
model1$A="scaling" #the default

## example with a separate 2-signal data set `bb` (Z has 2 rows because that example has 2 signals):
# kemfit = MARSS(bb, model=list(Z=matrix(1,2,1), R="diagonal and equal"))
# print(kemfit$model)
# summary(kemfit$model)
kem1 = MARSS(t(dat), model=model1)
kem1$AIC
kem1$AICc

# model 2: Fit the single state model, where the time series are assumed to be from the same population with different variance
model2=list()
model2$R="diagonal and unequal"
model2$Z=matrix(1,8,1) #matrix of 8 rows and 1 column
model2$A="scaling" #the default

kem2 = MARSS(t(dat), model=model2)
kem2$AIC
kem2$AICc

output <- Sample[(n-51):n]
output$Date <- as.Date(output$Date,"%Y-%m-%d")
b <- t(kem2$states)####b is the combined index
output$COMBINED <- round(100*b/max(b))
colors=rainbow(8)
ggplot(output, aes(x=Date, y=COMBINED, group = 1)) +
  geom_line(aes(x=Date, y=COMBINED), col = 'red', size = 2, linetype=1) +
  geom_line(aes(x=Date, y=bandana_print_dress), col = colors[1], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=bohemian_dress), col =  colors[2], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=embroidered_dress), col =  colors[3], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=floral_dress), col =  colors[4], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=fringe_dress), col =  colors[5] , size = 1, linetype=4) +
  geom_line(aes(x=Date, y=paisley_print_dress), col = colors[6], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=ruffle_dress), col =  colors[7], size = 1, linetype=4) +
  geom_line(aes(x=Date, y=tie_dye_dress), size=1, linetype=4, color= colors[8]) +
  scale_x_date(labels=date_format("%b %y"), breaks=date_breaks("4 week")) +
  xlab("Search Index") +
  ylab("Date") +
  ggtitle("Combined 8 Index of 52 Weeks from STATE SPACE Model for Dress Detail Category") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 14)) +
  theme(axis.text.y = element_text(face = "bold", size = 14)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(strip.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle=90)) +
  guides(fill=FALSE)


# model 3:  Fit a model with different population process with the same process parameters
model3=list()
model3$Q="diagonal and equal"
model3$R="diagonal and equal"
model3$U="equal"
model3$Z="identity"
model3$A="zero"
kem3 = MARSS(t(dat), model=model3)
kem3$AIC
kem3$AICc


Wednesday, July 30, 2014

Shiny-for-Data-Exploration

Data Preparation.R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

library(MARSS)
library(ggplot2)
library(reshape2)
library(data.table)

data <- rbind(data1, data2, data3, data4, data5, data6, data7, data8, data9, data10, data11, data12)
data$group <- as.factor(data$group)

data<- transform(data, group = factor(group))
data <- transform(data, group = reorder(group, rank(group)))

saveRDS(data, file="data.Rds")

Shiny.R

library(shiny)

setwd("consumer journey");
runApp("Shiny")


ui.R

library(shiny)

# Define UI for dataset viewer application
shinyUI(fluidPage(
  titlePanel("Summary Statistics"),

  sidebarLayout(
    sidebarPanel(
      # Copy the line below to make a text input box
      textInput("text", label = h3("Please enter column id"), value = "1"),
      hr(),
      dataTableOutput('mytable')),

    mainPanel(
      h3(textOutput("caption")),
      h3("Positive:"),
      verbatimTextOutput("summary1"),
      h3("Users:"),
      verbatimTextOutput("summary2"),
      h3("Response Rates:"),
      verbatimTextOutput("summary3"),
      plotOutput("plot")
    )
  )
))

server.R

library(shiny)
library(ggplot2)

setwd("consumer journey");
data <- readRDS("shiny/data/data.Rds")


shinyServer(function(input, output) {
  # You can access the value of the widget with input$text, e.g.
  # output$value <- renderPrint({ input$text })

  # Create a reactive text
  text <- reactive({
    paste(input$text)
  })

  output$caption <- renderText({
    dat1 <- data[data$id==as.integer(text()),]
    dat1$col[1]
  })

  output$mytable = renderDataTable({
    dat1 <- data[data$id==as.integer(text()),]
  })

  output$summary1 <- renderPrint({
    dat1 <- data[data$id==as.integer(text()),]
    summary(dat1$positives)
  })

  output$summary2 <- renderPrint({
    dat1 <- data[data$id==as.integer(text()),]
    summary(dat1$users)
  })

  output$summary3 <- renderPrint({
    dat1 <- data[data$id==as.integer(text()),]
    summary(dat1$rr)
  })

  # Generate a plot of the requested variables
  output$plot <- renderPlot({
    dat1 <- data[data$id==as.integer(text()),]
    dat1 <- dat1[1:min(50,nrow(dat1)),]
    dat2<-dat1
    dat2$group <- factor(dat2$group, levels=dat1$group)
    p <- ggplot(dat2,aes(x=group, y=rr),environment=environment()) +
      geom_bar(stat = "identity", fill = "blue") +
      ggtitle("Distribution") +
      ylab("First 50 Groups") + xlab("Group") +
      theme(plot.title = element_text(face = "bold", size = 20)) +
      theme(axis.text.x = element_text(face = "bold", size = 16)) +
      theme(axis.text.y = element_text(face = "bold", size = 16)) +
      theme(axis.title.x = element_text(face = "bold", size = 16)) +
      theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90))
    print(p)
  })

})

Wednesday, June 18, 2014

Logistic Regression Model Calibration in R

all <- fread("all.txt", head=F, sep='\t')
names=c("HasClick", "PredScore", "Score");
setnames(all, names);
setkey(all, Score);
all[HasClick==-1, HasClick:=0];
all[,list(Impressions=.N, CTR=sum(HasClick)/.N), by=HasClick][order(-CTR)];
summary(all$HasClick)

#PredScore
all[PredScore==-1, PredScore:=0];
summary(all$PredScore)
pred<-prediction(all$PredScore,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6567984
#[1] /homes/lingh/all.txt
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.244848

#p Score
all$Score <- all$Score
all$p <- 1/(1+exp(-all$Score))
summary(all$p)
pred<-prediction(all$p,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
#[1] 0.6992571
#[1] 0.7176237
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
#[1] 0.3129876
#[1] 0.3337304


# prior for bias correction
totimp <- nrow(all)
totclk <- sum(all$HasClick==1)
t <- totclk/totimp; t
p <-mean(all$p); p
all$p2 <- 1/(1+exp(-all$Score+log((1-t)*p/(t*(1-p)))));
mean(all$p2)
summary(all$p2)
pred<-prediction(all$p2,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
#[1] 0.6919405
#[1] 0.6992571
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409
#[1] 0.3065308
#[1] 0.3129876


# ensemble
all$p3 <- (all$p2+all$PredScore)/2
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.6996736
#[1] 0.6784338
#[1] 0.6973308
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3098282
#[1] 0.2729899
#[1] 0.3105527

clkinbin <- round(totclk/100); clkinbin
clkinbin <- 5
grp <- 1
tmp <- rep(1, totimp)
cnt <- 0
for (i in 1:totimp){
#for (i in 1:100000){
  if (all$HasClick[i]==1){
    cnt=cnt+1;
  }
  if (cnt==clkinbin){
    grp=grp+1;
    cnt=0;
  }
  tmp[i]=grp
}
all$grp=tmp
table(all$grp)
#all[grp==108, grp:=107];

groups=all[,list(Impressions=.N, Clicks=sum(HasClick), CTR=sum(HasClick)/.N,
                 AvgScore1=mean(PredScore),
                 AvgScore2=mean(p),
                 AvgScore3=mean(p2),
                 AvgScore4=mean(p3)),        
by=grp][order(grp)];
#write.csv(groups, "junk.csv")
with(groups, cor(CTR, AvgScore1))
#[1] 0.8853131
with(groups, cor(CTR, AvgScore2))
#[1] 0.87801
with(groups, cor(CTR, AvgScore3))
#[1] 0.8791485
with(groups, cor(CTR, AvgScore4))
# [1] 0.8803879

data <- groups
ggpairs(data[, c("CTR", "AvgScore1", "AvgScore2", "AvgScore3", "AvgScore4"), with=F])   # ggpairs() is from the GGally package

sum((groups$AvgScore1-groups$CTR)^2)/length(groups$AvgScore1)
#[1] 2.920109e-06
sum((groups$AvgScore2-groups$CTR)^2)/length(groups$AvgScore2)
#[1] 0.0001156954
sum((groups$AvgScore3-groups$CTR)^2)/length(groups$AvgScore3)
#[1] 0.0001410945
#[1] 0.0001240449
sum((groups$AvgScore4-groups$CTR)^2)/length(groups$AvgScore4)
#[1] 4.574389e-05
#[1] 4.09054e-05


data<-groups
ggplot(data, aes(y=data$CTR, x=data$grp)) +
  geom_line(data=data, aes(y=data$CTR, x=data$grp), color='black', linetype=1, size = 1)  +
  geom_line(data=data, aes(y=data$AvgScore1, x=data$grp), color='red', linetype=1, size = 1)  +
#  geom_line(data=data, aes(y=data$AvgScore2, x=data$grp), color='purple', linetype=1, size = 1)  +
  geom_line(data=data, aes(y=data$AvgScore3, x=data$grp), color='darkgreen', linetype=1, size = 1)  +
  geom_line(data=data, aes(y=data$AvgScore4, x=data$grp), color='darkblue', linetype=1, size = 1)  +
  scale_x_continuous(limits=c(1, 100)) +
  scale_y_continuous(limits=c(0, 0.005)) +
  ggtitle("Binning Avg of CTR, HModel and VWModel") +
  xlab("Bin") + ylab("CTR") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank())


# fit linear model
data<-groups[1:107]
with(data, cor(CTR, AvgScore1))
with(data, cor(CTR, AvgScore3))
m <- lm(CTR ~ AvgScore3, data = groups)
data$tmp <- 0.0005360169 + 0.0717973293*data$AvgScore3
sum((data$tmp-data$CTR)^2)/length(data$tmp)
# [1] 2.400827e-07

# fit tobit model (vglm() and tobit() are from the VGAM package)
summary(m <- vglm(CTR ~ AvgScore3, tobit(Upper = 0.05), data = groups))
ctable <- coef(summary(m))
pvals <- 2 * pt(abs(ctable[, "z value"]), df.residual(m), lower.tail = FALSE)
cbind(ctable, pvals)
data$tmp <- 0.0005360131 + 0.0717974011*data$AvgScore3  
sum((data$tmp-data$CTR)^2)/length(data$tmp)


# fit loess smoother
m <- loess(CTR ~ AvgScore3, data = groups)
groups$tmp <- m$fitted
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
# [1] 1.765797e-07


# fit Iso
install.packages("Iso")
require(Iso)
z = ufit(groups$CTR, groups$AvgScore3, type='b')
z = pava(groups$AvgScore3)
groups$tmp <- z
sum((groups$tmp-groups$CTR)^2)/length(groups$tmp)
#0.0001410945

data <- groups[1:105]
ggplot(data, aes(y=data$CTR, x=data$AvgScore3)) +
  geom_point(data=data, aes(y=data$CTR, x=data$AvgScore3), color='blue', shape=3, size = 3)  +
  geom_line(data=data, aes(y=data$AvgScore3, x=data$AvgScore3), color='black', linetype=3, size = 1)  +
  scale_x_continuous(limits=c(0, 0.0015)) +
  scale_y_continuous(limits=c(0, 0.0015)) +
  ggtitle("Distribution of CTR Vs LRScore") +
  xlab("LRScore") + ylab("CTR") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank())

groups$nonClicks <-  groups$Impressions-groups$Clicks
zfit <- glm(cbind(Clicks, nonClicks) ~ AvgScore3, family = binomial, data = groups);
summary(zfit);

glm.out = glm(cbind(Clicks, nonClicks) ~ AvgScore2, family=binomial(logit), data=data)
plot(Clicks/Impressions ~ AvgScore2, data=data)
lines(data$AvgScore2, glm.out$fitted, type="l", col="red")
title(main="Group Data with Fitted Logistic Regression Line")

all$p3<-1/(1+exp(-all$Score*7.16219+8.82040))
summary(all$p3)
pred<-prediction(all$p3,all$HasClick);
auc.tmp <- performance(pred,"auc");
auc <- as.numeric(auc.tmp@y.values);
auc
#[1] 0.7015954
perf <- performance(pred,"tpr","fpr");
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]]);
#[1] 0.3135409

Logistic Regression with Penalty Factor in R

The main goal of a penalty is to minimize the sum of the unpenalized cost function and a penalty term; this can be understood as adding bias and preferring a simpler model in order to reduce variance when there is not enough training data to fit the model well.

- The ridge penalty shrinks the coefficients of correlated predictors towards each other (L2)
- The lasso tends to pick one of them and discard the others (L1)
- The elastic-net penalty mixes these two.

If predictors are correlated in groups, an alpha around 0.5 tends to select (or drop) the whole group together. Mixing a small amount of ridge into the lasso also helps numerical stability; for example, the elastic net with alpha = 1 - epsilon for some small epsilon > 0 performs much like the lasso, but removes any degeneracies and wild behavior caused by extreme correlations.

## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html#log
install.packages("glmnet", repos = "http://cran.us.r-project.org")
require(glmnet)

load("BinomialExample.RData")
fit = glmnet(x, y, family = "binomial")
summary(fit)
plot(fit, xvar = "dev", label = TRUE)

predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
plot(cvfit)
cvfit$lambda.min
cvfit$lambda.1se
coef(cvfit, s = "lambda.min")

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")