Monday, September 15, 2014

Cluster Analysis in R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

library(MARSS)
library(ggplot2)
library(reshape2)
library(data.table)

#############################
## Hierarchical Clustering
#############################
## could generate more features;
### growth <- function(input)
### momentum <- function(input)
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))

## compute distance
d <- dist(data[,c(5,6,7)])
m <- nrow(data)
as.matrix(d)[1:5, 1:5]

## cluster methods
hc_cluster <- hclust(d, method="complete")
plot(hc_cluster)
rect.hclust(hc_cluster, k=5, border="red")
data$segments <- cutree(hc_cluster, k=5)

## examine results
ddply(data, c('segments'), summarise,
      n=length(growth),
      growthmean=mean(growth),
      momentummean=mean(momentum))

rect <- data.table(ymin=round(min(data$growth),2),
                   ymax=round(max(data$growth),2),
                   xmin=round(min(data$momentum),2),
                   xmax=round(max(data$momentum),2))

pdf("~/Desktop/plot3.pdf", width=15, height=8.5)
ggplot(data, aes(y=growth, x=momentum, size=scalecnt, label=style), guide=FALSE) +
  geom_point(position = "jitter", colour= "red", fill= "red", shape=21) +
  geom_text_repel(aes(y = growth, x = momentum, color=factor(segments)), size=5, angle = -15) +
  theme_bw() +
  ggtitle("Cluster Analysis") +
  xlab("Growth") + ylab("Momemtum") +
  scale_y_continuous(name="Growth", labels=scales::percent, limits=c(-max(abs(rect$ymin),abs(rect$ymax))*1.4, max(abs(rect$ymin),abs(rect$ymax))*1.4)) +
  scale_x_continuous(name="Momentum", labels=scales::percent, limits=c(-max(abs(rect$xmin),abs(rect$xmax))*1.4, max(abs(rect$xmin),abs(rect$xmax))*1.4)) +
  theme(plot.title = element_text(face = "bold", size = 16)) +
  theme(axis.text.x = element_text(face = "bold", size = 10)) +
  theme(axis.text.y = element_text(face = "bold", size = 10)) +
  theme(axis.title.x = element_text(face = "bold", size = 10)) +
  theme(axis.title.y = element_text(face = "bold", size = 10, angle = 90)) +
  theme(legend.position = 'bottom') +
  theme(legend.text = element_text(family = "sans", face = "bold", size = 10)) +
  theme(legend.title = element_text(family = "sans", face = "bold", size = 10)) +
  guides(size=guide_legend("Search Volume Index")) +
  geom_vline(xintercept=c(0), linetype="dotted") +
  geom_hline(yintercept=c(0), linetype="dotted")
dev.off()


#############################
## k_mean Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
kmean_cluster <- kmeans(data[,c(5,6,7)], centers=5)
summary(kmean_cluster)
data$segments = kmean_cluster$cluster
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")

##Example2
data <-fread("shopper.txt")
setnames(data, c("id","lable","ageBucket","gender","country","dvs","pvs","adviews","adclicks",
"allnumevents","wk1numevents","ratio_wk1numevents_all","alldv","wk1dv","ratio_wk1dv_all",
"itemcnt","itemtot","item_cnt_30day","item_tot_30day","item_cnt_30dayPlus","item_tot_30dayPlus"));

> dim(data)
[1] 1083701      21

data$rowid <- 1:nrow(data)

```
## Model One - Item Count
## Drop Outliers

```
data2 <- data[, c(9,20,22), with = F]

> summary(data2)

> quantile(data2$adclicks, c(.99,.999,.9999))
   99%  99.9% 99.99%
  6.00  45.00 159.89
> sum(data$adclicks>160)
[1] 109

data2 <- data2[data2$adclicks<160]
> summary(data2)

> quantile(data2$item_cnt_30dayPlus, c(.99,.999,.9999))
   99%  99.9% 99.99%
    46    129    335
> sum(data2$item_cnt_30dayPlus>335)
[1] 108

data2 <- data2[data2$item_cnt_30dayPlus<335]

```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data
```
func1 <- function(x){
return(log(x+1)/log(max(x)))
}

data3 <- data2[, c(1,2,3), with = F]
data3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]


```

## Scale Transformed data

```
func2 <- function(x){
return( (x-mean(x))/sqrt(sum((x-mean(x))^2)/(length(x)-1)))
}

 data4 <- data3[, c(1,2,3), with = F]
 data4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data
```
data5 <- data4[,c(1,2), with = F]
results <- data5
for (i in c(2:2)){
  print('----------------------------------------------------------');
  print(i);
  set.seed(33);
  model <- kmeans(data5, i);
  print("model size");
  print(cbind(1:i,model$size));
  print("model centers")
  print(model$centers);
  results<- cbind(results, model$cluster);
}

[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
     [,1]   [,2]
[1,]    1 667941
[2,]    2 415540
[1] "model centers"
     adclicks item_cnt_30dayPlus
1 -0.09521215         -0.6450849
2  0.15304447          1.0369126


data2$cluster = results$V2
data2[,list(users = .N,
                    group_adclicks=sum(adclicks)/.N,
                    group_item_cnt_30dayPlus=sum(item_cnt_30dayPlus)/.N), by='cluster'];
```


## Model Two - Item Total
## Drop Outliers

```
data_2 <- data[, c(9,21,22), with = F]
> head(data_2)
> summary(data_2)
    adclicks         item_tot_30dayPlus
 Min.   :   0.0000   Min.   :     0.00
 1st Qu.:   0.0000   1st Qu.:    23.94
 Median :   0.0000   Median :    65.70
 Mean   :   0.5131   Mean   :   172.36
 3rd Qu.:   0.0000   3rd Qu.:   176.10
 Max.   :1638.0000   Max.   :225247.28

> quantile(data_2$adclicks, c(.99,.999,.9999))
   99%  99.9% 99.99%
  6.00  45.00 159.89
> sum(data_2$adclicks>160)

data_2 <- data_2[data_2$adclicks<160]

> summary(data_2)

> quantile(data_2$item_tot_30dayPlus, c(.99,.999,.9999))
      99%     99.9%    99.99%
 1510.772  4389.105 16171.602

> sum(data_2$item_tot_30dayPlus>16171)
[1] 109

data_2 <- data_2[data_2$item_tot_30dayPlus<16171]

> summary(data_2)
```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data
```

data_3 <- data_2[, c(1,2,3), with = F]
data_3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]

```

## Scale Transformed data

```
 data_4 <- data_3[, c(1,2,3), with = F]
 data_4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data
```

data_5 <- data_4[,c(1,2), with = F]
results_ <- data_5

for (i in c(2:2)){
  print('----------------------------------------------------------');
  print(i);
  set.seed(33);
  model <- kmeans(data_5, i);
  print("model size");
  print(cbind(1:i,model$size));
  print("model centers")
  print(model$centers);
  results_<- cbind(results_, model$cluster);
}

[1] "----------------------------------------------------------"
[1] 2
[1] "model size"
     [,1]   [,2]
[1,]    1 545531
[2,]    2 537952
[1] "model centers"
    adclicks item_tot_30dayPlus
1  0.1732808          0.7845828
2 -0.1757221         -0.7956365


data_2$cluster = 3-results_$V2
data_2[,list(users = .N,
                    group_adclicks=sum(adclicks)/.N,
                    group_item_tot_30dayPlus=sum(item_tot_30dayPlus)/.N), by='cluster'];
                 
```

## Merge Two Results

```
setkey(data2, rowid);
setkey(data_2, rowid);

comb = merge(data2, data_2, by ="rowid", all = TRUE)
comb1 = comb[,c(4,7),with=F]
> m <- table(comb1,useNA = "always"); m
         cluster.y
cluster.x      1      2   <NA>
     1    464209 203732      0
     2     73742 341718     80
     <NA>      1     81      0
> round(prop.table(m),5)
         cluster.y
cluster.x       1       2    <NA>
     1    0.42841 0.18802 0.00000
     2    0.06806 0.31537 0.00007
     <NA> 0.00000 0.00007 0.00000
```

Number of cluster
# Important packages (libraries)  

library(cluster) # Data set votes.repub, and cluster technique 
library(mclust) # Clustering by mixtures of Gaussians 
library(MASS) # kMeans clustering and others 
library(kernlab) # Kernel kMeans, spectral clustering 

# Function for finding within cluster sum of squares when K=1 

WCSS1 <- function(x) 
return(ifelse(ncol(x)>1,sum(rowSums((x - matrix(rep(apply(x,2,mean), 
times=nrow(x)), byrow=T, ncol=2))^2)),(nrow(x)-1)*var(x))) 

# Function for finding and plotting the number of cluster 

nb.clust <- function(x) 
K <- 7 # The maximum number of clusters 
n <- nrow(x) # sample size 
wcss <- matrix(WCSS1(x), ncol=1, nrow=K) 
for (k in 2:K) 
wcss[k] <- sum(kmeans(x, centers=k)$withinss) 

plot(1:K, wcss, type="b", xlab="Number of Clusters", 
ylab="Within cluster sum of squares") 

dw <- abs(diff(wcss[-1])) 
return(min(which(cumsum(dw/sum(dw))>0.8))) 

> nb.clust(faithful) 
[1] 3 
> kmeans.faithful <- kmeans(faithful,3) 
> kmeans.faithful$centers 

> x11() 
> plot(faithful, col = kmeans.faithful$cluster) 
> points(kmeans.faithful$centers, col = 1:3, pch = 8, cex=6) 

# Imputing 

library(DMwR) 
xvotes <- centralImputation(votes.repub) 
kmeans(xvotes, 2) 
# Note that n remains unchanged 

# Exclusion 

votes# x <- na.exclude(votes.repub) 
kmeans(votesx, 2) 
# Note that the sample size is hugely reduced 
# This can be very very bad 

# What difference 


kmeans can handle large q small n


#############################
##Gaussian Mixture Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
X <- data[,c(5,6,7)]
BIC = mclustBIC(X)
plot(BIC)
m_cluster <- Mclust(X, x = BIC)
plot(m_cluster, what = "classification")
data$segments = m_cluster$classification
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")


No comments:

Post a Comment