Adequacy tests
Normality
library(hlfights)
JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))
par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn')
qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut')
qqline(JFK$TaxiOut)
shapiro.test(JFK$TaxiIn)
Multivariate normality
JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)
hzTest(JFK)
roystonTest(JFK)
par(mfrow = c(1, 2))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)
set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)
Dependence of variables
hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = "pairwise.complete.obs")
str(cor(hflights_numeric, use = "pairwise.complete.obs"))
hflights_numeric <- hflights[,which(sapply(hflights, function(x) is.numeric(x) && var(x, na.rm = TRUE) != 0))]
table(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")))
library(ellipse)
plotcorr(cor(hflights_numeric, use = "pairwise.complete.obs"))
plotcorr(cor(data.frame(
1:10,
1:10 + runif(10),
1:10 + runif(10) * 5,
runif(10),
10:1,
check.names = FALSE)))
cor(hflights$FlightNum, hflights$Month)
KMO and Barlett's test
library(psych)
KMO(cor(hflights_numeric, use = "pairwise.complete.obs"))
cor(hflights_numeric[, c('Cancelled', 'AirTime')])
cancelled <- which(hflights_numeric$Cancelled == 1)
table(hflights_numeric$AirTime[cancelled], exclude = NULL)
table(hflights_numeric$Cancelled)
hflights_numeric <- subset(hflights_numeric, select = -Cancelled)
which(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")), arr.ind = TRUE)
hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = "pairwise.complete.obs"))
KMO(mtcars)
cortest.bartlett(cor(mtcars))
Principal Component Analysis
PCA algorithms
prcomp(mtcars, scale = TRUE)
summary(prcomp(mtcars, scale = TRUE))
sum(prcomp(scale(mtcars))$sdev^2)
Determining the number of components
prcomp(scale(mtcars))$sdev^2
VSS.scree(cor(mtcars))
scree(cor(mtcars))
fa.parallel(mtcars)
Interpreting components
pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])
head(scale(mtcars) %*% pc$rotation[, 1:2])
summary(pc$x[, 1:2])
apply(pc$x[, 1:2], 2, sd)
round(cor(pc$x))
pc$rotation[, 1:2]
biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')
cor(mtcars, pc$x[, 1:2])
Rotation methods
varimax(pc$rotation[, 1:2])
pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))
library(GPArotation)
promax(pc$rotation[, 1:2])
Outlier-detection with PCA
library(jpeg)
t <- tempfile()
download.file('http://bit.ly/nasa-img', t)
img <- readJPEG(t)
str(img)
h <- dim(img)[1]
w <- dim(img)[2]
m <- matrix(img, h*w)
str(m)
pca <- prcomp(m)
summary(pca)
pca$rotation
extractColors <- function(x) rgb(x[1], x[2], x[3])
(colors <- apply(abs(pca$rotation), 2, extractColors))
pie(pca$sdev, col = colors, labels = colors)
par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')
Factor analysis
m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
(f <- fa(m))
fa.diagram(f)
cor(f$scores, mtcars$disp)
Principal Component Analysis versus Factor Analysis
PCA is used to reduce the number of variables by creating principal components that then can be used in further projects instead of the original variables. This means that we try to extract the essence of the dataset in the means of artificially created variables, which best describe the variance of the data.
FA is the other way around, as it tries to identify unknown, latent variables to explain the original data. In plain English, we use the manifest variables from our empirical dataset to guess the internal structure of the data.
Multidimensional Scaling
as.matrix(eurodist)[1:5, 1:5]
(mds <- cmdscale(eurodist))
plot(mds)
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist))
mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], rownames(mds))
No comments:
Post a Comment