My DS Coding Bolg: Master R 6 - Beyond the linear trend line

The modeling workflow
1 First, fit the model with the main predictors and all the relevant confounders, and then reduce the number of confounders by dropping out the non-significant ones. 
2 Decide whether to use the continuous variables in their original or categorized form.
3 Try to achieve a better fit by testing for non-linear relationships, if they are pragmaticaly relevant.
4 Finally check the model assumptions.

Logistic regression
library(catdata)
data(deathpenalty)
library(vcdExtra)
deathpenalty.expand <- expand.dft(deathpenalty)
binom.model.0 <- glm(DeathPenalty ~ DefendantRace, data = deathpenalty.expand, family = binomial)
summary(binom.model.0)
exp(cbind(OR = coef(binom.model.0), confint(binom.model.0)))

binom.model.1 <- update(binom.model.0, . ~ . + VictimRace)
summary(binom.model.1)
exp(cbind(OR = coef(binom.model.1), confint(binom.model.1)))

prop.table(table(factor(deathpenalty.expand$VictimRace,labels = c("VictimRace=0", "VictimRace=1")),factor(deathpenalty.expand$DefendantRace, labels = c("DefendantRace=0", "DefendantRace=1"))), 1)

Data consideration
A general rule of thumb, logistic regression models require at least 10 events per predictors.

Goodness of model fit
library(lmtest)
lrtest(binom.model.1)
library(BaylorEdPsych)
PseudoR2(binom.model.1)

Model comparison
lrtest(binom.model.0, binom.model.1)

Models for count data
Poisson regression
dfa <- readRDS('SMART_2013.RData')
(ct <- xtabs(~model+failure, data=dfa))
dfa <- dfa[dfa$model %in% names(which(rowSums(ct) - ct[, 1] > 0)),]

library(ggplot2)
ggplot(rbind(dfa, data.frame(model='All', dfa[, -1] )), aes(failure)) + ylab("log(count)") + 
geom_histogram(binwidth = 1, drop=TRUE, origin = -0.5)  + 
scale_y_log10() + scale_x_continuous(breaks=c(0:10)) + 
facet_wrap( ~ model, ncol = 3) + ggtitle("Histograms by manufacturer") + theme_bw()

poiss.base <- glm(failure ~ model, offset(log(freq)),  family = 'poisson', data = dfa)
summary(poiss.base)
contrasts(dfa$model, sparse = TRUE)
lrtest(poiss.base)

Negative binomial regression
library(MASS)
model.negbin.0 <- glm.nb(failure ~ model, offset(log(freq)), data = dfa)
lrtest(poiss.base,model.negbin.0)

Multivariate non-linear models
model.negbin.1 <- update(model.negbin.0, . ~ . + capacity_bytes + age_month + temperature)
model.negbin.2 <- update(model.negbin.1, . ~ . + PendingSector)
lrtest(model.negbin.0, model.negbin.1, model.negbin.2)
summary(model.negbin.2)
exp(data.frame(exp_coef = coef(model.negbin.2)))
dfa$model <- relevel(dfa$model, 'WDC')


model.negbin.3 <- update(model.negbin.2, data = dfa)
library(broom)
format(tidy(model.negbin.3), digits = 4)

library(data.table)
dfa <- data.table(dfa)
dfa[, temp6 := cut2(temperature, g = 6)]
temperature.weighted.mean <- dfa[, .(wfailure = weighted.mean(failure, freq)), by = temp6] 
ggplot(temperature.weighted.mean, aes(x = temp6, y = wfailure)) +  
geom_bar(stat = 'identity') + xlab('Categorized temperature') + ylab('Weighted mean of disk faults') + theme_bw()

model.negbin.4 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temp6 + PendingSector, data = dfa)
AIC(model.negbin.3,model.negbin.4)

weighted.means <- rbind(dfa[, .(l = 'capacity', f = weighted.mean(failure, freq)), by = .(v = capacity_bytes)], dfa[, .(l = 'age', f = weighted.mean(failure, freq)), by = .(v = age_month)])
ggplot(weighted.means, aes(x = l, y = f)) + geom_step() +
facet_grid(. ~ v, scales = 'free_x') + theme_bw() +
ylab('Weighted mean of disk faults') + xlab('')

dfa[, capacity_bytes := as.factor(capacity_bytes)]
dfa[, age8 := cut2(age_month, g = 8)]
model.negbin.5 <- update(model.negbin.0, .~. + capacity_bytes + age8 + temp6 + PendingSector, data = dfa)
AIC(model.negbin.5, model.negbin.4)
format(tidy(model.negbin.5), digits = 3)

tmnb5 <- tidy(model.negbin.5)
str(terms <- tmnb5$term[tmnb5$p.value < 0.05][-1])

library(plyr)
ci <- ldply(terms, function(t) confint(model.negbin.5, t))
names(ci) <- c('min', 'max')
ci$term <- terms
ci$variable <- sub('[A-Z0-9\\]\\[,() ]*$', '', terms, perl = TRUE)

ggplot(ci, aes(x = factor(term), color = variable)) + 
geom_errorbar(ymin = min, ymax = max) + xlab('') +
ylab('Coefficients (95% conf.int)') + theme_bw() + 
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'top')
PseudoR2(model.negbin.6 )
My DS Coding Bolg

Tuesday, February 2, 2016

Master R 6 - Beyond the linear trend line

No comments:

Post a Comment

Blog Archive