Thursday, February 4, 2016

Master R 8 - Polishing data

The types and origins of missing data

Missing Completely at Random (MCAR) means that every value in the dataset has the same probability of being missed, 
so no systematic error or distortion is to be expected due to missing data, and nor can we explain the pattern of missing values.

Missing at Random (MAR) is known or at least can be identified, although it has nothing to do with the actual missing values. 

For Missing Not at Random (MNAR), where data is missing for a specific reason that is highly related to the actual question, 
which classifies missing values as nonignorable non-response.


Identifying missing data

library(hflights)
table(complete.cases(hflights))
prop.table(table(complete.cases(hflights))) * 100
sort(sapply(hflights, function(x) sum(is.na(x))))

By-passing missing values

mean(cor(apply(hflights, 2, function(x) as.numeric(is.na(x)))), na.rm = TRUE)
Funs <- Filter(is.function, sapply(ls(baseenv()), get, baseenv()))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Funs))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Filter(is.function,sapply(ls('package:stats'), get, 'package:stats'))))

Overriding the default arguments of a function

myMean <- function(...) mean(..., na.rm = TRUE)
mean(c(1:5, NA))
myMean(c(1:5, NA))

library(rapportools)
mean(c(1:5, NA))
detach('package:rapportools')
mean(c(1:5, NA))

library(Defaults)
setDefaults(mean.default, na.rm = TRUE)
mean(c(1:5, NA))
setDefaults(mean, na.rm = TRUE)
mean

Getting rid of missing data

na.omit(c(1:5, NA))
na.exclude(c(1:5, NA))

x <- rnorm(10); y <- rnorm(10)
x[1] <- NA; y[2] <- NA
exclude <- lm(y ~ x, na.action = "na.exclude")
omit <- lm(y ~ x, na.action = "na.omit")
residuals(exclude)

m <- matrix(1:9, 3)
m[which(m %% 4 == 0, arr.ind = TRUE)] <- NA
m
na.omit(m)

Filtering missing data before or during the actual analysis

mean(hflights$ActualElapsedTime)
mean(hflights$ActualElapsedTime, na.rm = TRUE)
mean(na.omit(hflights$ActualElapsedTime))

library(microbenchmark)
NA.RM   <- function() mean(hflights$ActualElapsedTime, na.rm = TRUE)
NA.OMIT <- function() mean(na.omit(hflights$ActualElapsedTime))
microbenchmark(NA.RM(), NA.OMIT())

Data imputation

m[which(is.na(m), arr.ind = TRUE)] <- 0
m

ActualElapsedTime <- hflights$ActualElapsedTime
mean(ActualElapsedTime, na.rm = TRUE)
ActualElapsedTime[which(is.na(ActualElapsedTime))] <- mean(ActualElapsedTime, na.rm = TRUE)
mean(ActualElapsedTime)


library(Hmisc)
mean(impute(hflights$ActualElapsedTime, mean))
sd(hflights$ActualElapsedTime, na.rm = TRUE)
sd(ActualElapsedTime)
summary(iris)

library(missForest)
set.seed(81)
miris <- prodNA(iris, noNA = 0.2)
summary(miris)
iiris <- missForest(miris, xtrue = iris, verbose = TRUE)
str(iiris)

Comparing different imputation methods

miris <- miris[, 1:4]
iris_mean <- impute(miris, fun = mean)
iris_forest <- missForest(miris)
diag(cor(iris[, -5], iris_mean))
diag(cor(iris[, -5], iris_forest$ximp))

Not imputing missing values

Multiple imputation

Extreme values and outliers

detach('package:missForest')
detach('package:randomForest')

library(outliers)
outlier(hflights$DepDelay)
summary(hflights$DepDelay)

library(lattice)
bwplot(hflights$DepDelay)
IQR(hflights$DepDelay, na.rm = TRUE)

Testing extreme values

set.seed(83)
dixon.test(c(runif(10), pi))
model <- lm(hflights$DepDelay ~ 1)
model$coefficients
mean(hflights$DepDelay, na.rm = TRUE)
a <- 0.1
(n <- length(hflights$DepDelay))
(F <- qf(1 - (a/n), 1, n-2, lower.tail = TRUE))
(L <- ((n - 1) * F / (n - 2 + F))^0.5)
sum(abs(rstandard(model)) > L)

Using robust method

summary(lm(Sepal.Length ~ Petal.Length, data = miris))
lm(Sepal.Length ~ Petal.Length, data = iris)$coefficients

library(MASS)
summary(rlm(Sepal.Length ~ Petal.Length, data = miris))
f <- formula(Sepal.Length ~ Petal.Length)
cbind(orig =  lm(f, data = iris)$coefficients, lm   =  lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

miris$Sepal.Length[1] <- 14

cbind(orig = lm(f, data = iris)$coefficients, lm   = lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

No comments:

Post a Comment

Blog Archive