My DS Coding Bolg: Cluster Analysis in R

memory.limit()

memory.size(max = TRUE)

rm(list=ls(all=T))

library(MARSS)

library(ggplot2)

library(reshape2)

library(data.table)

#############################
## Hierarchical Clustering
#############################
## could generate more features;
### growth <- function(input)
### momentum <- function(input)
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))

## compute distance
d <- dist(data[,c(5,6,7)])
m <- nrow(data)
as.matrix(d)[1:5, 1:5]

## cluster methods
hc_cluster <- hclust(d, method="complete")
plot(hc_cluster)
rect.hclust(hc_cluster, k=5, border="red")
data$segments <- cutree(hc_cluster, k=5)

## examine results
ddply(data, c('segments'), summarise,
n=length(growth),
growthmean=mean(growth),
momentummean=mean(momentum))

rect <- data.table(ymin=round(min(data$growth),2),
ymax=round(max(data$growth),2),
xmin=round(min(data$momentum),2),
xmax=round(max(data$momentum),2))

pdf("~/Desktop/plot3.pdf", width=15, height=8.5)
ggplot(data, aes(y=growth, x=momentum, size=scalecnt, label=style), guide=FALSE) +
geom_point(position = "jitter", colour= "red", fill= "red", shape=21) +
geom_text_repel(aes(y = growth, x = momentum, color=factor(segments)), size=5, angle = -15) +
theme_bw() +
ggtitle("Cluster Analysis") +
xlab("Growth") + ylab("Momemtum") +
scale_y_continuous(name="Growth", labels=scales::percent, limits=c(-max(abs(rect$ymin),abs(rect$ymax))*1.4, max(abs(rect$ymin),abs(rect$ymax))*1.4)) +
scale_x_continuous(name="Momentum", labels=scales::percent, limits=c(-max(abs(rect$xmin),abs(rect$xmax))*1.4, max(abs(rect$xmin),abs(rect$xmax))*1.4)) +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 10)) +
theme(axis.text.y = element_text(face = "bold", size = 10)) +
theme(axis.title.x = element_text(face = "bold", size = 10)) +
theme(axis.title.y = element_text(face = "bold", size = 10, angle = 90)) +
theme(legend.position = 'bottom') +
theme(legend.text = element_text(family = "sans", face = "bold", size = 10)) +
theme(legend.title = element_text(family = "sans", face = "bold", size = 10)) +
guides(size=guide_legend("Search Volume Index")) +
geom_vline(xintercept=c(0), linetype="dotted") +
geom_hline(yintercept=c(0), linetype="dotted")
dev.off()

#############################
## k_mean Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
kmean_cluster <- kmeans(data[,c(5,6,7)], centers=5)
summary(kmean_cluster)
data$segments = kmean_cluster$cluster
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")

##Example2

data <-fread("shopper.txt")

setnames(data, c("id","lable","ageBucket","gender","country","dvs","pvs","adviews","adclicks",

"allnumevents","wk1numevents","ratio_wk1numevents_all","alldv","wk1dv","ratio_wk1dv_all",

"itemcnt","itemtot","item_cnt_30day","item_tot_30day","item_cnt_30dayPlus","item_tot_30dayPlus"));

> dim(data)

[1] 1083701 21

data$rowid <- 1:nrow(data)

```

## Model One - Item Count

## Drop Outliers

```

data2 <- data[, c(9,20,22), with = F]

> summary(data2)

> quantile(data2$adclicks, c(.99,.999,.9999))

99% 99.9% 99.99%

6.00 45.00 159.89

> sum(data$adclicks>160)

[1] 109

data2 <- data2[data2$adclicks<160]

> summary(data2)

> quantile(data2$item_cnt_30dayPlus, c(.99,.999,.9999))

99% 99.9% 99.99%

46 129 335

> sum(data2$item_cnt_30dayPlus>335)

[1] 108

data2 <- data2[data2$item_cnt_30dayPlus<335]

```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data

```

func1 <- function(x){

return(log(x+1)/log(max(x)))

}

data3 <- data2[, c(1,2,3), with = F]

data3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]

```

## Scale Transformed data

```

func2 <- function(x){

return( (x-mean(x))/sqrt(sum((x-mean(x))^2)/(length(x)-1)))

}

data4 <- data3[, c(1,2,3), with = F]

data4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data

```

data5 <- data4[,c(1,2), with = F]

results <- data5

for (i in c(2:2)){

print('----------------------------------------------------------');

print(i);

set.seed(33);

model <- kmeans(data5, i);

print("model size");

print(cbind(1:i,model$size));

print("model centers")

print(model$centers);

results<- cbind(results, model$cluster);

}

[1] "----------------------------------------------------------"

[1] 2

[1] "model size"

[,1] [,2]

[1,] 1 667941

[2,] 2 415540

[1] "model centers"

adclicks item_cnt_30dayPlus

1 -0.09521215 -0.6450849

2 0.15304447 1.0369126

data2$cluster = results$V2

data2[,list(users = .N,

group_adclicks=sum(adclicks)/.N,

group_item_cnt_30dayPlus=sum(item_cnt_30dayPlus)/.N), by='cluster'];

```

## Model Two - Item Total

## Drop Outliers

```

data_2 <- data[, c(9,21,22), with = F]

> head(data_2)

> summary(data_2)

adclicks item_tot_30dayPlus

Min. : 0.0000 Min. : 0.00

1st Qu.: 0.0000 1st Qu.: 23.94

Median : 0.0000 Median : 65.70

Mean : 0.5131 Mean : 172.36

3rd Qu.: 0.0000 3rd Qu.: 176.10

Max. :1638.0000 Max. :225247.28

> quantile(data_2$adclicks, c(.99,.999,.9999))

99% 99.9% 99.99%

6.00 45.00 159.89

> sum(data_2$adclicks>160)

data_2 <- data_2[data_2$adclicks<160]

> summary(data_2)

> quantile(data_2$item_tot_30dayPlus, c(.99,.999,.9999))

99% 99.9% 99.99%

1510.772 4389.105 16171.602

> sum(data_2$item_tot_30dayPlus>16171)

[1] 109

data_2 <- data_2[data_2$item_tot_30dayPlus<16171]

> summary(data_2)

```

## Log - Transform data

Apply log(x+1)/log(max(x)) transformation to generate new data

```

data_3 <- data_2[, c(1,2,3), with = F]

data_3[, 1:2 := lapply(.SD, func1), .SDcols = 1:2]

```

## Scale Transformed data

```

data_4 <- data_3[, c(1,2,3), with = F]

data_4[, 1:2 := lapply(.SD, func2), .SDcols = 1:2]

```

## Partition by k-means using scaled transformed data

```

data_5 <- data_4[,c(1,2), with = F]

results_ <- data_5

for (i in c(2:2)){

print('----------------------------------------------------------');

print(i);

set.seed(33);

model <- kmeans(data_5, i);

print("model size");

print(cbind(1:i,model$size));

print("model centers")

print(model$centers);

results_<- cbind(results_, model$cluster);

}

[1] "----------------------------------------------------------"

[1] 2

[1] "model size"

[,1] [,2]

[1,] 1 545531

[2,] 2 537952

[1] "model centers"

adclicks item_tot_30dayPlus

1 0.1732808 0.7845828

2 -0.1757221 -0.7956365

data_2$cluster = 3-results_$V2

data_2[,list(users = .N,

group_adclicks=sum(adclicks)/.N,

group_item_tot_30dayPlus=sum(item_tot_30dayPlus)/.N), by='cluster'];

```

## Merge Two Results

```

setkey(data2, rowid);

setkey(data_2, rowid);

comb = merge(data2, data_2, by ="rowid", all = TRUE)

comb1 = comb[,c(4,7),with=F]

> m <- table(comb1,useNA = "always"); m

cluster.y

cluster.x 1 2 <NA>

1 464209 203732 0

2 73742 341718 80

<NA> 1 81 0

> round(prop.table(m),5)

cluster.y

cluster.x 1 2 <NA>

1 0.42841 0.18802 0.00000

2 0.06806 0.31537 0.00007

<NA> 0.00000 0.00007 0.00000

```

Number of cluster

# Important packages (libraries)

library(cluster) # Data set votes.repub, and cluster technique

library(mclust) # Clustering by mixtures of Gaussians

library(MASS) # kMeans clustering and others

library(kernlab) # Kernel kMeans, spectral clustering

# Function for finding within cluster sum of squares when K=1

WCSS1 <- function(x)

{

return(ifelse(ncol(x)>1,sum(rowSums((x - matrix(rep(apply(x,2,mean),

times=nrow(x)), byrow=T, ncol=2))^2)),(nrow(x)-1)*var(x)))

}

# Function for finding and plotting the number of cluster

nb.clust <- function(x)

{

K <- 7 # The maximum number of clusters

n <- nrow(x) # sample size

wcss <- matrix(WCSS1(x), ncol=1, nrow=K)

for (k in 2:K)

{

wcss[k] <- sum(kmeans(x, centers=k)$withinss)

}

plot(1:K, wcss, type="b", xlab="Number of Clusters",

ylab="Within cluster sum of squares")

dw <- abs(diff(wcss[-1]))

return(min(which(cumsum(dw/sum(dw))>0.8)))

}

> nb.clust(faithful)

[1] 3

> kmeans.faithful <- kmeans(faithful,3)

> kmeans.faithful$centers

> x11()

> plot(faithful, col = kmeans.faithful$cluster)

> points(kmeans.faithful$centers, col = 1:3, pch = 8, cex=6)

# Imputing

library(DMwR)

xvotes <- centralImputation(votes.repub)

kmeans(xvotes, 2)

# Note that n remains unchanged

# Exclusion

votes# x <- na.exclude(votes.repub)

kmeans(votesx, 2)

# Note that the sample size is hugely reduced

# This can be very very bad

# What difference

kmeans can handle large q small n

#############################
##Gaussian Mixture Clustering
#############################
data = read.csv("ranking_growth_momentum.csv", header = T)
row.names(data) <- data$style

## transform data
data$logcnt = log(data$cnt)/log(max(data$cnt))
X <- data[,c(5,6,7)]
BIC = mclustBIC(X)
plot(BIC)
m_cluster <- Mclust(X, x = BIC)
plot(m_cluster, what = "classification")
data$segments = m_cluster$classification
boxplot(data$growth ~ data$segments, ylab="Growth", xlab="Segments")
clusplot(data, data$segments, color=T, shade = T, labels=4, lines=0, main="K-Means")

My DS Coding Bolg

Monday, September 15, 2014

Cluster Analysis in R

No comments:

Post a Comment

Blog Archive