Saturday, February 6, 2016

Master R 9 - From Big to Small Data

Adequacy tests

Normality

library(hflights)
# two equivalent ways of extracting the taxi times of the flights to JFK
JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))

par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn')
qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut')
qqline(JFK$TaxiOut)

shapiro.test(JFK$TaxiIn)
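# the null hypothesis of the Shapiro-Wilk test is that the sample comes from a
# normal distribution, so a very small p-value means rejecting normality here;
# the same test can be run on TaxiOut as well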

Multivariate normality

JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)
hzTest(JFK)
# store the test result so that it can also be passed to mvnPlot below
(mvt <- roystonTest(JFK))
par(mfrow = c(1, 2))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)

set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)


Dependence of variables

hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = "pairwise.complete.obs")
str(cor(hflights_numeric, use = "pairwise.complete.obs"))
# keep only the numeric columns with non-zero variance
hflights_numeric <- hflights[, which(sapply(hflights,
  function(x) is.numeric(x) && var(x, na.rm = TRUE) != 0))]
table(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")))

library(ellipse)
plotcorr(cor(hflights_numeric, use = "pairwise.complete.obs"))
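# in the plotcorr output, the narrower the ellipse the stronger the correlation,
# and its orientation shows the sign of the relationship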
plotcorr(cor(data.frame(
  1:10,
  1:10 + runif(10),
  1:10 + runif(10) * 5,
  runif(10),
  10:1,
  check.names = FALSE)))
cor(hflights$FlightNum, hflights$Month)
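# note that FlightNum is really a nominal identifier, so even a non-zero
# correlation with Month would be meaningless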

KMO and Bartlett's test

library(psych)
KMO(cor(hflights_numeric, use = "pairwise.complete.obs"))
# this pair produces an NA correlation even with pairwise-complete observations
cor(hflights_numeric[, c('Cancelled', 'AirTime')], use = "pairwise.complete.obs")
cancelled <- which(hflights_numeric$Cancelled == 1)
table(hflights_numeric$AirTime[cancelled], exclude = NULL)
table(hflights_numeric$Cancelled)
hflights_numeric <- subset(hflights_numeric, select = -Cancelled)
which(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")), arr.ind = TRUE)
hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = "pairwise.complete.obs"))
KMO(mtcars)
cortest.bartlett(cor(mtcars))
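# Bartlett's test checks the null hypothesis that the correlation matrix is the
# identity matrix, i.e. that the variables are uncorrelated; when a correlation
# matrix is passed instead of raw data, the sample size can also be supplied
# via the n argument, e.g. cortest.bartlett(cor(mtcars), n = nrow(mtcars))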


Principal Component Analysis

PCA algorithms
prcomp(mtcars, scale = TRUE)
summary(prcomp(mtcars, scale = TRUE))
sum(prcomp(scale(mtcars))$sdev^2)
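Since the variables were standardized, the total variance equals the number of columns, so the cumulative share reported by summary() can also be verified by hand:

# share of the total variance captured by the first two principal components
sum(prcomp(scale(mtcars))$sdev[1:2]^2) / ncol(mtcars)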

Determining the number of components

prcomp(scale(mtcars))$sdev^2
VSS.scree(cor(mtcars))
scree(cor(mtcars))
fa.parallel(mtcars)
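# the scree plots and the parallel analysis above suggest how many components to
# keep: look for the elbow of the curve, or retain the components whose
# eigenvalues exceed those simulated from random data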

Interpreting components

pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])
head(scale(mtcars) %*% pc$rotation[, 1:2])
summary(pc$x[, 1:2])
apply(pc$x[, 1:2], 2, sd)
round(cor(pc$x))
pc$rotation[, 1:2]

biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')
cor(mtcars, pc$x[, 1:2])
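# correlations between the original variables and the first two components;
# high absolute values show which variables dominate each component and help
# with naming and interpreting them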

Rotation methods

varimax(pc$rotation[, 1:2])
pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))


library(GPArotation)
promax(pc$rotation[, 1:2])
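# unlike varimax, promax is an oblique rotation, so the rotated components are
# allowed to correlate with each other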

Outlier-detection with PCA

library(jpeg)
# download the example image to a temporary file and read it into an array
t <- tempfile()
download.file('http://bit.ly/nasa-img', t, mode = 'wb')
img <- readJPEG(t)
str(img)
h <- dim(img)[1]
w <- dim(img)[2]
# flatten the array into a matrix with h*w rows (one per pixel) and one column
# per color channel
m <- matrix(img, h * w)
str(m)
pca <- prcomp(m)
summary(pca)
pca$rotation
extractColors <- function(x) rgb(x[1], x[2], x[3])
(colors <- apply(abs(pca$rotation), 2, extractColors))
pie(pca$sdev, col = colors, labels = colors)

par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')
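# the first component explains the bulk of the variance in the pixel
# intensities, so the second image mostly shows the details and artifacts that
# deviate from it, which is the basis of using PCA for outlier detection here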


Factor analysis

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
(f <- fa(m))
fa.diagram(f)
cor(f$scores, mtcars$disp)


Principal Component Analysis versus Factor Analysis

PCA is used to reduce the number of variables by creating principal components that can then be used in further analyses instead of the original variables. In other words, we try to extract the essence of the dataset by means of artificially created variables that best describe the variance of the data.

FA works the other way around: it tries to identify unknown, latent variables that explain the original data. In plain English, we use the manifest variables of our empirical dataset to infer the internal structure of the data.
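A minimal sketch of this contrast, reusing the variable subset from the factor analysis above (the single factor and the chosen columns are for illustration only):

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
# PCA: the components are linear combinations of the observed variables,
# ordered by the share of total variance they reproduce
prcomp(m, scale = TRUE)$rotation
# FA: a latent factor is assumed to drive the observed variables; the loadings
# estimate how strongly each manifest variable reflects that hidden factor
fa(m, nfactors = 1)$loadings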


Multidimensional Scaling

as.matrix(eurodist)[1:5, 1:5]
(mds <- cmdscale(eurodist))
plot(mds)
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist))
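# apart from an arbitrary rotation or reflection of the axes, the two recovered
# dimensions reproduce the relative geographic positions of the cities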
mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], rownames(mds))
