Monday, February 8, 2016

Master R 10 - Classification and Clustering

Cluster analysis

# Hierarchical clustering

d <- dist(mtcars)
h <- hclust(d)
h
plot(h)
rect.hclust(h, k=3, border = "red")
cn <- cutree(h, k=3)
table(cn)
# Cluster means, within-cluster standard deviations, overall standard deviations,
# and the spread of the cluster means for each variable
round(aggregate(mtcars, FUN = mean, by = list(cn)), 1)
round(aggregate(mtcars, FUN = sd, by = list(cn)), 1)
round(sapply(mtcars, sd), 1)
round(apply(aggregate(mtcars, FUN = mean, by = list(cn)), 2, sd), 1)

# Determining the ideal number of clusters

install.packages('NbClust')
library(NbClust)
NbClust(mtcars, method = 'complete', index = 'dindex')
NbClust(mtcars, method = 'complete', index = 'hartigan')$Best.nc
NbClust(mtcars, method = 'complete', index = 'kl')$Best.nc
# Run all supported indices at once on the iris measurements (the Species column is dropped)
NbClust(iris[, -5], method = 'complete', index = 'all')$Best.nc[1,]
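
# The next two lines are not part of the original post: a quick elbow-style check of the within-cluster sum of squares over a range of cluster counts; the 1 to 8 range and nstart = 25 are arbitrary choices.
wss <- sapply(1:8, function(k) kmeans(mtcars, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = 'b', xlab = 'Number of clusters', ylab = 'Within-cluster sum of squares')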

# K-means clustering

(k <- kmeans(mtcars, 3))
# Note that cluster labels are arbitrary, so even an identical partition can get different numbers
all(cn == k$cluster)

# Visualizing clusters
library(cluster) 
clusplot(mtcars, k$cluster, color = TRUE, shade = TRUE, labels = 2)


# Latent class models
# Convert the discrete variables to factors for the latent class models
factors <- c('cyl', 'vs', 'am', 'carb', 'gear')
mtcars[, factors] <- lapply(mtcars[, factors], factor)

# Latent Class Analysis
install.packages('poLCA')
library(poLCA)
p <- poLCA(cbind(cyl, vs, am, carb, gear) ~ 1, data = mtcars, graphs = TRUE, nclass = 3)
# Estimated share of each latent class
p$P

# Latent class regression
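# The original post leaves this section without code, so the next two lines are only a sketch of how latent class regression could look with poLCA: mpg is used as an illustrative class-membership covariate, and nclass = 3 is an assumption rather than something from the source.
pr <- poLCA(cbind(cyl, vs, am, carb, gear) ~ mpg, data = mtcars, nclass = 3)
pr$coeff
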
# Discriminant analysis
# Reset mtcars to the original dataset (this drops the factor conversions made above)
rm(mtcars)
mtcars$gear <- factor(mtcars$gear)
library(MASS)
# CV = TRUE returns leave-one-out cross-validated class assignments and posteriors
d <- lda(gear ~ ., data = mtcars, CV = TRUE)
(tab <- table(mtcars$gear, d$class))
# Overall accuracy of the cross-validated predictions
sum(diag(tab)) / sum(tab)
round(d$posterior, 4)
# Refit without cross-validation so the discriminant functions can be plotted
d <- lda(gear ~ ., data = mtcars)
plot(d)
plot(d, dimen = 1, type = "both")


# Logistic regression
lr <- glm(am ~ hp + wt, data = mtcars, family = binomial)
summary(lr)
# Confusion matrix of observed vs predicted transmission type
table(mtcars$am, round(predict(lr, type = 'response')))
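
# Not in the original post: a quick sketch of predicting the probability of a manual transmission for a hypothetical new car; the hp and wt values are made up.
predict(lr, newdata = data.frame(hp = 110, wt = 2.5), type = 'response')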

install.packages('nnet')
library(nnet) 
(mlr <- multinom(factor(gear) ~ ., data = mtcars)) 
table(mtcars$gear, predict(mlr))
rm(mtcars)


# Machine learning algorithms
# The K-Nearest Neighbors algorithm

# K-Nearest Neighbors is a supervised classification algorithm, mostly used in pattern recognition and business analytics. A big advantage of k-NN is that its usage is extremely straightforward, and it is fairly robust to outliers as long as k is not set too low.

set.seed(42)
n <- nrow(mtcars)
# Base R way of drawing a training sample of half the rows
train <- mtcars[sample(n, n/2), ]

# The same split with dplyr; the remaining rows become the test set
library(dplyr)
train <- sample_n(mtcars, n / 2)
test  <- mtcars[setdiff(row.names(mtcars), row.names(train)), ]
library(class)
(cm <- knn(
  train = subset(train, select = -gear),
  test  = subset(test, select = -gear),
  cl    = train$gear,
  k     = 5))
# Rough agreement between the true and predicted gear values
cor(test$gear, as.numeric(as.character(cm)))
# Class distribution in the training sample
table(train$gear)
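
# Not part of the original code: a small sketch checking test-set accuracy for a few values of k, to make the role of the k parameter concrete; the set of k values tried is an arbitrary choice.
sapply(c(1, 3, 5, 7), function(k) {
  pred <- knn(
    train = subset(train, select = -gear),
    test  = subset(test, select = -gear),
    cl    = train$gear,
    k     = k)
  mean(as.numeric(as.character(pred)) == test$gear)
})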

# Classification trees
library(rpart)
# minsplit is passed through to rpart.control, so splits are allowed on very small nodes
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)
plot(ct)
text(ct)
table(test$gear, predict(ct, newdata = test, type = 'class'))

install.packages('party')
library(party)
ct <- ctree(factor(gear) ~ drat, data = train, controls = ctree_control(minsplit = 3)) 
plot(ct, main = "Conditional Inference Tree")
# Use type = 'response' to get predicted classes for the confusion matrix
table(test$gear, predict(ct, newdata = test, type = 'response'))

install.packages('randomForest')
library(randomForest)
(rf <- randomForest(factor(gear) ~ ., data = train, ntree = 250))
table(test$gear, predict(rf, test))   
plot(rf)
legend('topright', legend = colnames(rf$err.rate), col = 1:4, fill = 1:4, bty = 'n')
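
# Not in the original post: variable importance is another standard randomForest output worth a look, shown here only as a sketch.
importance(rf)
varImpPlot(rf)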

# Other algorithms
install.packages(c('caret', 'C50'))
library(caret)
library(C50)
C50 <- train(factor(gear) ~ ., data = train, method = 'C5.0')
summary(C50)
table(test$gear, predict(C50, test))
