Saturday, February 6, 2016

Master R 9 - From Big to Small Data

Adequacy tests

Normality

library(hflights)
# two equivalent ways of extracting the taxi times of the flights to JFK
JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))

par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn')
qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut')
qqline(JFK$TaxiOut)

shapiro.test(JFK$TaxiIn)
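# the null hypothesis of the Shapiro-Wilk test is that the sample comes from a
# normal distribution, so a very small p-value means rejecting normality here;
# the same test can be run on TaxiOut as well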

Multivariate normality

JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)
hzTest(JFK)
# store the test result so that it can also be passed to mvnPlot below
(mvt <- roystonTest(JFK))
par(mfrow = c(1, 2))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)

set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)


Dependence of variables

hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = "pairwise.complete.obs")
str(cor(hflights_numeric, use = "pairwise.complete.obs"))
# keep only the numeric columns with non-zero variance
hflights_numeric <- hflights[, which(sapply(hflights,
  function(x) is.numeric(x) && var(x, na.rm = TRUE) != 0))]
table(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")))

library(ellipse)
plotcorr(cor(hflights_numeric, use = "pairwise.complete.obs"))
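# in the plotcorr output, the narrower the ellipse the stronger the correlation,
# and its orientation shows the sign of the relationship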
plotcorr(cor(data.frame(
  1:10,
  1:10 + runif(10),
  1:10 + runif(10) * 5,
  runif(10),
  10:1,
  check.names = FALSE)))
cor(hflights$FlightNum, hflights$Month)
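# note that FlightNum is really a nominal identifier, so even a non-zero
# correlation with Month would be meaningless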

KMO and Bartlett's test

library(psych)
KMO(cor(hflights_numeric, use = "pairwise.complete.obs"))
# this pair produces an NA correlation even with pairwise-complete observations
cor(hflights_numeric[, c('Cancelled', 'AirTime')], use = "pairwise.complete.obs")
cancelled <- which(hflights_numeric$Cancelled == 1)
table(hflights_numeric$AirTime[cancelled], exclude = NULL)
table(hflights_numeric$Cancelled)
hflights_numeric <- subset(hflights_numeric, select = -Cancelled)
which(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")), arr.ind = TRUE)
hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = "pairwise.complete.obs"))
KMO(mtcars)
cortest.bartlett(cor(mtcars))
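# Bartlett's test checks the null hypothesis that the correlation matrix is the
# identity matrix, i.e. that the variables are uncorrelated; when a correlation
# matrix is passed instead of raw data, the sample size can also be supplied
# via the n argument, e.g. cortest.bartlett(cor(mtcars), n = nrow(mtcars))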


Principal Component Analysis

PCA algorithms
prcomp(mtcars, scale = TRUE)
summary(prcomp(mtcars, scale = TRUE))
sum(prcomp(scale(mtcars))$sdev^2)
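Since the variables were standardized, the total variance equals the number of columns, so the cumulative share reported by summary() can also be verified by hand:

# share of the total variance captured by the first two principal components
sum(prcomp(scale(mtcars))$sdev[1:2]^2) / ncol(mtcars)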

Determining the number of components

prcomp(scale(mtcars))$sdev^2
VSS.scree(cor(mtcars))
scree(cor(mtcars))
fa.parallel(mtcars)
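# the scree plots and the parallel analysis above suggest how many components to
# keep: look for the elbow of the curve, or retain the components whose
# eigenvalues exceed those simulated from random data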

Interpreting components

pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])
head(scale(mtcars) %*% pc$rotation[, 1:2])
summary(pc$x[, 1:2])
apply(pc$x[, 1:2], 2, sd)
round(cor(pc$x))
pc$rotation[, 1:2]

biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')
cor(mtcars, pc$x[, 1:2])
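# correlations between the original variables and the first two components;
# high absolute values show which variables dominate each component and help
# with naming and interpreting them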

Rotation methods

varimax(pc$rotation[, 1:2])
pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))


library(GPArotation)
promax(pc$rotation[, 1:2])
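# unlike varimax, promax is an oblique rotation, so the rotated components are
# allowed to correlate with each other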

Outlier-detection with PCA

library(jpeg)
# download the example image to a temporary file and read it into an array
t <- tempfile()
download.file('http://bit.ly/nasa-img', t, mode = 'wb')
img <- readJPEG(t)
str(img)
h <- dim(img)[1]
w <- dim(img)[2]
# flatten the array into a matrix with h*w rows (one per pixel) and one column
# per color channel
m <- matrix(img, h * w)
str(m)
pca <- prcomp(m)
summary(pca)
pca$rotation
extractColors <- function(x) rgb(x[1], x[2], x[3])
(colors <- apply(abs(pca$rotation), 2, extractColors))
pie(pca$sdev, col = colors, labels = colors)

par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')
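# the first component explains the bulk of the variance in the pixel
# intensities, so the second image mostly shows the details and artifacts that
# deviate from it, which is the basis of using PCA for outlier detection here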


Factor analysis

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
(f <- fa(m))
fa.diagram(f)
cor(f$scores, mtcars$disp)


Principal Component Analysis versus Factor Analysis

PCA is used to reduce the number of variables by creating principal components that can then be used in further analyses instead of the original variables. In other words, we try to extract the essence of the dataset by means of artificially created variables that best describe the variance of the data.

FA works the other way around: it tries to identify unknown, latent variables that explain the original data. In plain English, we use the manifest variables of our empirical dataset to infer the internal structure of the data.
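A minimal sketch of this contrast, reusing the variable subset from the factor analysis above (the single factor and the chosen columns are for illustration only):

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
# PCA: the components are linear combinations of the observed variables,
# ordered by the share of total variance they reproduce
prcomp(m, scale = TRUE)$rotation
# FA: a latent factor is assumed to drive the observed variables; the loadings
# estimate how strongly each manifest variable reflects that hidden factor
fa(m, nfactors = 1)$loadings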


Multidimensional Scaling

as.matrix(eurodist)[1:5, 1:5]
(mds <- cmdscale(eurodist))
plot(mds)
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist))
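# apart from an arbitrary rotation or reflection of the axes, the two recovered
# dimensions reproduce the relative geographic positions of the cities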
mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], rownames(mds))
