memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))
library(MARSS)
library(ggplot2)
library(reshape2)
library(data.table)
#############################
## Hierarchical Clustering
#############################
## could generate more features;
### growth <- function(input)
### momentum <- function(input)
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style
## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
## compute distance
d <- dist(data[,c(5,6,7)])
m <- nrow(data)
as.matrix(d)[1:5, 1:5]
## cluster methods
hc_cluster <- hclust(d, method="complete")
plot(hc_cluster)
rect.hclust(hc_cluster, k=5, border="red")
data$segments <- cutree(hc_cluster, k=5)
## examine results
ddply(data, c('segments'), summarise,
n=length(growth),
growthmean=mean(growth),
momentummean=mean(momentum))
rect <- data.table(ymin=round(min(data$growth),2),
ymax=round(max(data$growth),2),
xmin=round(min(data$momentum),2),
xmax=round(max(data$momentum),2))
pdf("~/Desktop/plot3.pdf", width=15, height=8.5)
ggplot(data, aes(y=growth, x=momentum, size=scalecnt, label=style), guide=FALSE) +
geom_point(position = "jitter", colour= "red", fill= "red", shape=21) +
geom_text_repel(aes(y = growth, x = momentum, color=factor(segments)), size=5, angle = -15) +
theme_bw() +
ggtitle("Cluster Analysis") +
xlab("Growth") + ylab("Momemtum") +
scale_y_continuous(name="Growth", labels=scales::percent, limits=c(-max(abs(rect$ymin),abs(rect$ymax))*1.4, max(abs(rect$ymin),abs(rect$ymax))*1.4)) +
scale_x_continuous(name="Momentum", labels=scales::percent, limits=c(-max(abs(rect$xmin),abs(rect$xmax))*1.4, max(abs(rect$xmin),abs(rect$xmax))*1.4)) +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 10)) +
theme(axis.text.y = element_text(face = "bold", size = 10)) +
theme(axis.title.x = element_text(face = "bold", size = 10)) +
theme(axis.title.y = element_text(face = "bold", size = 10, angle = 90)) +
theme(legend.position = 'bottom') +
theme(legend.text = element_text(family = "sans", face = "bold", size = 10)) +
theme(legend.title = element_text(family = "sans", face = "bold", size = 10)) +
guides(size=guide_legend("Search Volume Index")) +
geom_vline(xintercept=c(0), linetype="dotted") +
geom_hline(yintercept=c(0), linetype="dotted")
dev.off()
#############################
## k_mean Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style
## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
kmean_cluster <- kmeans(data[,c(5,6,7)], centers=5)
summary(kmean_cluster)
data$segments = kmean_cluster$cluster
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")
##Example2
data <-fread("shopper.txt")
setnames(data, c("id","lable","ageBucket","gender","country","dvs","pvs","adviews","adclicks",
"allnumevents","wk1numevents","ratio_wk1numevents_all","alldv","wk1dv","ratio_wk1dv_all",
"itemcnt","itemtot","item_cnt_30day","item_tot_30day","item_cnt_30dayPlus","item_tot_30dayPlus"));
> dim(data)
[1] 1083701 21
data$rowid <- 1:nrow(data)
```
## Model One - Item Count
## Drop Outliers
```
data2 <- data[, c(9,20,22), with = F]
> summary(data2)
> quantile(data2$adclicks, c(.99,.999,.9999))
99% 99.9% 99.99%
6.00 45.00 159.89
> sum(data$adclicks>160)
[1] 109
data2 <- data2[data2$adclicks<160]
> summary(data2)
> quantile(data2$item_cnt_30dayPlus, c(.99,.999,.9999))
99% 99.9% 99.99%
46 129 335
> sum(data2$item_cnt_30dayPlus>335)
[1] 108
data2 <- data2[data2$item_cnt_30dayPlus<335]
```
## Log - Transform data
Apply log(x+1)/log(max(x)) transformation to generate new data
```
func1 <- function(x){
return(log(x+1)/log(max(x)))
}
data3 <- data2[, c(1,2,3), with = F]
data3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]
```
## Scale Transformed data
```
func2 <- function(x){
return( (x-mean(x))/sqrt(sum((x-mean(x))^2)/(length(x)-1)))
}
data4 <- data3[, c(1,2,3), with = F]
data4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]
```
## Partition by k-means using scaled transformed data
```
data5 <- data4[,c(1,2), with = F]
results <- data5
for (i in c(2:2)){
print('----------------------------------------------------------');
print(i);
set.seed(33);
model <- kmeans(data5, i);
print("model size");
print(cbind(1:i,model$size));
print("model centers")
print(model$centers);
results<- cbind(results, model$cluster);
}
[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
[,1] [,2]
[1,] 1 667941
[2,] 2 415540
[1] "model centers"
adclicks item_cnt_30dayPlus
1 -0.09521215 -0.6450849
2 0.15304447 1.0369126
data2$cluster = results$V2
data2[,list(users = .N,
group_adclicks=sum(adclicks)/.N,
group_item_cnt_30dayPlus=sum(item_cnt_30dayPlus)/.N), by='cluster'];
```
## Model Two - Item Total
## Drop Outliers
```
data_2 <- data[, c(9,21,22), with = F]
> head(data_2)
> summary(data_2)
adclicks item_tot_30dayPlus
Min. : 0.0000 Min. : 0.00
1st Qu.: 0.0000 1st Qu.: 23.94
Median : 0.0000 Median : 65.70
Mean : 0.5131 Mean : 172.36
3rd Qu.: 0.0000 3rd Qu.: 176.10
Max. :1638.0000 Max. :225247.28
> quantile(data_2$adclicks, c(.99,.999,.9999))
99% 99.9% 99.99%
6.00 45.00 159.89
> sum(data_2$adclicks>160)
data_2 <- data_2[data_2$adclicks<160]
> summary(data_2)
> quantile(data_2$item_tot_30dayPlus, c(.99,.999,.9999))
99% 99.9% 99.99%
1510.772 4389.105 16171.602
> sum(data_2$item_tot_30dayPlus>16171)
[1] 109
data_2 <- data_2[data_2$item_tot_30dayPlus<16171]
> summary(data_2)
```
## Log - Transform data
Apply log(x+1)/log(max(x)) transformation to generate new data
```
data_3 <- data_2[, c(1,2,3), with = F]
data_3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]
```
## Scale Transformed data
```
data_4 <- data_3[, c(1,2,3), with = F]
data_4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]
```
## Partition by k-means using scaled transformed data
```
data_5 <- data_4[,c(1,2), with = F]
results_ <- data_5
for (i in c(2:2)){
print('----------------------------------------------------------');
print(i);
set.seed(33);
model <- kmeans(data_5, i);
print("model size");
print(cbind(1:i,model$size));
print("model centers")
print(model$centers);
results_<- cbind(results_, model$cluster);
}
[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
[,1] [,2]
[1,] 1 545531
[2,] 2 537952
[1] "model centers"
adclicks item_tot_30dayPlus
1 0.1732808 0.7845828
2 -0.1757221 -0.7956365
data_2$cluster = 3-results_$V2
data_2[,list(users = .N,
group_adclicks=sum(adclicks)/.N,
group_item_tot_30dayPlus=sum(item_tot_30dayPlus)/.N), by='cluster'];
```
## Merge Two Results
```
setkey(data2, rowid);
setkey(data_2, rowid);
comb = merge(data2, data_2, by ="rowid", all = TRUE)
comb1 = comb[,c(4,7),with=F]
> m <- table(comb1,useNA = "always"); m
cluster.y
cluster.x 1 2 <NA>
1 464209 203732 0
2 73742 341718 80
<NA> 1 81 0
> round(prop.table(m),5)
cluster.y
cluster.x 1 2 <NA>
1 0.42841 0.18802 0.00000
2 0.06806 0.31537 0.00007
<NA> 0.00000 0.00007 0.00000
```
Number of cluster
# Important packages (libraries)
library(cluster) # Data set votes.repub, and cluster technique
library(mclust) # Clustering by mixtures of Gaussians
library(MASS) # kMeans clustering and others
library(kernlab) # Kernel kMeans, spectral clustering
# Function for finding within cluster sum of squares when K=1
WCSS1 <- function(x)
{
return(ifelse(ncol(x)>1,sum(rowSums((x - matrix(rep(apply(x,2,mean),
times=nrow(x)), byrow=T, ncol=2))^2)),(nrow(x)-1)*var(x)))
}
# Function for finding and plotting the number of cluster
nb.clust <- function(x)
{
K <- 7 # The maximum number of clusters
n <- nrow(x) # sample size
wcss <- matrix(WCSS1(x), ncol=1, nrow=K)
for (k in 2:K)
{
wcss[k] <- sum(kmeans(x, centers=k)$withinss)
}
plot(1:K, wcss, type="b", xlab="Number of Clusters",
ylab="Within cluster sum of squares")
dw <- abs(diff(wcss[-1]))
return(min(which(cumsum(dw/sum(dw))>0.8)))
}
> nb.clust(faithful)
[1] 3
> kmeans.faithful <- kmeans(faithful,3)
> kmeans.faithful$centers
> x11()
> plot(faithful, col = kmeans.faithful$cluster)
> points(kmeans.faithful$centers, col = 1:3, pch = 8, cex=6)
# Imputing
library(DMwR)
xvotes <- centralImputation(votes.repub)
kmeans(xvotes, 2)
# Note that n remains unchanged
# Exclusion
votes# x <- na.exclude(votes.repub)
kmeans(votesx, 2)
# Note that the sample size is hugely reduced
# This can be very very bad
# What difference
kmeans can handle large q small n
#############################
##Gaussian Mixture Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style
## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
X <- data[,c(5,6,7)]
BIC = mclustBIC(X)
plot(BIC)
m_cluster <- Mclust(X, x = BIC)
plot(m_cluster, what = "classification")
data$segments = m_cluster$classification
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")