Wednesday, November 9, 2011

Hypothesis Test in R and SAS

### R
The t.test( ) function produces a variety of t-tests.
Unlike most statistical packages, the default assumes unequal variance and applies the Welch df modification.
# one sample t-test
t.test(y,mu=3) # Ho: mu=3
# one sample median test wilcox.test(y, mu=3) # one sample binomial test prop.test(sum(y), length(y), p=0.5)
# Chi-square goodness of fit
X <- matrix(c(172, 7, 6, 15), 2, 2)
chisq.test(table(X), p=c(10,10,10,70)/100)

# McNemar's Chi-squared test with continuity correction
X <- matrix(c(172, 7, 6, 15), 2, 2)
mcnemar.test(X)

# Wilcoxon-Mann-Whitney rank sum test
wilcox.test(y ~ x)

# Chi-square test
chisq.test(table(y, x))

# Fisher's exact test
fisher.test(table(y, x))

# Correlation
cor(y, x)
cor.test(y, x)

# Kruskal Wallis rank sum test
kruskal.test(y, x)

# Non-parametric correlation
cor.test(y, x, method = "spearman")


## Multual Information
mi.plugin(table(all$y, all$is_verified))

## Jaccard Similarity
1-dist(rbind(all$y, all$is_verified), method= "binary")

# Simple linear regression
lm(y ~ x)

# Multiple regression
lm(y ~ x1 + x2 + x3 + x4 + x5)

# independent 2-group t-test
t.test(y~x) # where y is numeric and x is a binary factor

# independent 2-group t-test
t.test(y1,y2) # where y1 and y2 are numeric
t.test(y1,y2, alternative="greater", var.equal=T)
# paired t-test
t.test(y1,y2,paired=TRUE) # where y1 & y2 are numeric

# Wilcoxon signed rank sum test
wilcox.test(y1, y2, paired = TRUE)

# One-way ANOVA
summary(aov(y ~ x))

# Factorial ANOVA
anova(lm(y ~ x1 * x2, data = hsb2))

# One-way repeated measures ANOVA
require(car)
require(foreign)
model <- lm(y ~ a + s, data = kirk)
analysis <- Anova(model, idata = kirk, idesign = ~s)
print(analysis)

# Repeated measures logistic regression
require(lme4)
glmer(y ~ x + (1 | id), data = exercise, family = binomial)

# Friedman test
friedman.test(cbind(x1, x2, x3))

# Simple logistic regression
glm(y ~ x, family = binomial)

# Factorial logistic regression
summary(glm(y ~ x1 * x2, data = hsb2, family = binomial))

# Multiple logistic regression
glm(y ~ x1 + x2, family = binomial)

# Analysis of covariance
summary(aov(y ~ x1 + x2))

# One-way MANOVA
summary(manova(cbind(x1, x2, x3) ~ x4))

# Multivariate multiple regression
require(car)
M1 <- lm(cbind(x1, x2) ~ x3 + x4 + x5 + x6, data = hsb2)
summary(Anova(M1))

# Canonical correlation
require(CCA)
cc(cbind(x1, x2), cbind(x3, x4))

# Factor analysis
require(psych)
fa(r = cor(model.matrix(~x1 + x2 + x3 + x4 + x5 - 1, data = hsb2)), rotate = "none", fm = "pa", 5)

# Principle component
princomp(~x1 + x2 + x3 + x4 + x5, data = hsb2)


Examples

1 Smoker vs NonSmoker

nonsmokers = c(18,22,21,17,20,17,23,20,22,21)
mean(nonsmokers)
sd(nonsmokers)
sqrt(var(nonsmokers)/length(nonsmokers))
smokers = c(16,20,14,21,20,18,13,15,17,21)
mean(smokers)
sd(smokers)
sqrt(var(smokers)/length(smokers))

plot(density(nonsmokers))            # output not shown
plot(density(smokers))               # output not shown
boxplot(nonsmokers,smokers,ylab="Scores on Digit Span Task",
names=c("nonsmokers","smokers"),
main="Digit Span Performance by\n Smoking Status")

2 Formula

mean.diff = mean(smokers) - mean(nonsmokers)
df = length(smokers) + length(nonsmokers) - 2
pooled.var = (sd(smokers)^2 * (length(smokers)-1) + sd(nonsmokers)^2 * (length(nonsmokers)-1)) / df
se.diff = sqrt(pooled.var/(length(smokers) + pooled.var/length(nonsmokers))
t.obt = mean.diff / se.diff
t.obt
p.value = 2*pt(t.obt,df=df)
p.value

3 odds ratios
odds.ratios <- function(clicks1, clicks2, nonclicks1, nonclicks2) {
#nonblank over blank
or = (clicks2/nonclicks2)/(clicks1/nonclicks1);
or.lower <- exp(log(or)-1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));
or.upper <- exp(log(or)+1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));
return(c(or, or.lower, or.upper))
}

#odds ratios
clicks1 <- as.numeric(data[2,3])
clicks2 <- as.numeric(data[3,3])
nonclicks1 <- as.numeric(data[2,2])-as.numeric(data[2,3])
nonclicks2 <- as.numeric(data[3,2])-as.numeric(data[3,3])
odds.ratios(clicks1, clicks2, nonclicks1, nonclicks2)

# Fisher's exact test
fisher.test(matrix(c(nonclicks1, clicks1, nonclicks2, clicks2), ncol = 2, byrow = T))




#### SAS
proc sql;
CONNECT TO NETEZZA (USER=*** PASSWORD=*** SERVER="***"DATABASE="ANALYTICS_STG");
create table data as
select * from connection to NETEZZA
(
select CAMPAIGN_ID,
SRCSYS_CELL_ID,
min(SRCSYS_CELL_NAME) SRCSYS_CELL_NAME,
SRCSYS_AUDIENCE_ID,
min(SRCSYS_AUDIENCE_NAME) SRCSYS_AUDIENCE_NAME,
count(distinct EMAIL_ADDRESS_ID) sent,
sum(case when upper(FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end) open,
sum(case when upper(FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end) click,
sum(case when upper(FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end) conv,
sum(case when upper(FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end) unsub
from ANALYTICS_STG..LH_persado_email_and_resp
group by 1,2,4
order by 1,2,4;
)
;
quit;
data data1;
set data;
select(SRCSYS_AUDIENCE_ID);
when (841791043) control = 1;
...
when (834298683) control = 1;
when (834299793) control = 0;
when (834299803) control = 0;
when (834021323) control = 0;
when (834021333) control = 0;
when (834021373) control = 1;
otherwise control=.;
end;
if campaign_id = 9910 then delete;
open_rate=open/sent;
run;
/** One sample Binomial test;
proc freq data=data1;
by SRCSYS_AUDIENCE_ID;
table control / binomial(p=0.5);
exact binomial;
run;
**/
** Each Individual Campaigns;
** Two sample Binomial test;
proc ttest data=data1;
by campaign_id;
class control;
var open_rate;
run;
** Noparameteric Wilcoxon-Mann-Whitney test;
proc npar1way data=data1;
by campaign_id;
class control;
var open_rate;
run;
data data5;
set data1;
open_flag=1;
counts=open; output;
open_flag=0;
counts=sent-open; output;
run;
** Chisq Test;
proc freq data = data5;
by campaign_id;
weight counts;
tables control*open_flag / chisq;
run;
** Fisher's exact test;
proc freq data = data5;
by campaign_id;
weight counts;
tables control*open_flag / fisher;
run;
** McNemar test;
proc freq data=data5;
by campaign_id;
tables control*open_flag
exact mcnem;
weight counts;
run;
### All Campaigns
** One-way ANOVA;
proc glm data = data1;
class control campaign_id;
model open_rate = control campaign_id;
means control;
run;
quit;
** Paired t-test;
proc sql;
create table data2 as
select CAMPAIGN_ID, control,
sum(sent) as sent,
sum(open) as open,
sum(open)/sum(sent) as open_rate
from data1
group by CAMPAIGN_ID, control
;
run;
quit;
proc transpose data=data2 prefix=control out=data3;
by campaign_id;
id control;
var open_rate;
run;
proc ttest data = data3;
paired control0*control1;
run;
** Wilcoxon signed rank sum test;
data data4;
set data3;
diff = control0 - control1;
run;
proc univariate data=data4;
var diff;
run;
** McNemar test;
proc freq data=data5;
tables control*open_flag;
exact mcnem;
weight counts;
run;
proc freq data=data5;
by campaign_id;
tables control*open_flag;
exact mcnem;
weight counts;
run;
** logistic regression;
proc genmod data= data5 descending;
class campaign_id control;
freq counts;
model open_flag = campaign_id control/ dist=binomial link=logit;
run;
proc logistic data= data5 descending;
class campaign_id control;
freq counts;
model open_flag = campaign_id control/ scale = none aggregate;
output out=plot p=p l=l u=u;
run;
proc sgplot data=plot;
band x=control
lower=l
upper=u /group=campaign_id
transparency=.5;
series x=control
y=p /group=campaign_id;
run;

No comments:

Post a Comment

Blog Archive