My DS Coding Bolg: Hypothesis Test in R and SAS

### R

The t.test( ) function produces a variety of t-tests.
Unlike most statistical packages, the default assumes unequal variance and applies the Welch df modification.

# one sample t-test
t.test(y,mu=3) # Ho: mu=3

# one sample median test
wilcox.test(y, mu=3)

# one sample binomial test
prop.test(sum(y), length(y), p=0.5)

# Chi-square goodness of fit
X <- matrix(c(172, 7, 6, 15), 2, 2)
chisq.test(table(X), p=c(10,10,10,70)/100)

# McNemar's Chi-squared test with continuity correction
X <- matrix(c(172, 7, 6, 15), 2, 2)
mcnemar.test(X)

# Wilcoxon-Mann-Whitney rank sum test
wilcox.test(y ~ x)

# Chi-square test
chisq.test(table(y, x))

# Fisher's exact test
fisher.test(table(y, x))

# Correlation
cor(y, x)
cor.test(y, x)

# Kruskal Wallis rank sum test
kruskal.test(y, x)

# Non-parametric correlation
cor.test(y, x, method = "spearman")

## Multual Information
mi.plugin(table(all$y, all$is_verified))

## Jaccard Similarity
1-dist(rbind(all$y, all$is_verified), method= "binary")

# Simple linear regression
lm(y ~ x)

# Multiple regression
lm(y ~ x1 + x2 + x3 + x4 + x5)

# independent 2-group t-test
t.test(y~x) # where y is numeric and x is a binary factor

# independent 2-group t-test
t.test(y1,y2) # where y1 and y2 are numeric

t.test(y1,y2, alternative="greater", var.equal=T)

# paired t-test
t.test(y1,y2,paired=TRUE) # where y1 & y2 are numeric

# Wilcoxon signed rank sum test
wilcox.test(y1, y2, paired = TRUE)

# One-way ANOVA
summary(aov(y ~ x))

# Factorial ANOVA
anova(lm(y ~ x1 * x2, data = hsb2))

# One-way repeated measures ANOVA
require(car)
require(foreign)
model <- lm(y ~ a + s, data = kirk)
analysis <- Anova(model, idata = kirk, idesign = ~s)
print(analysis)

# Repeated measures logistic regression
require(lme4)
glmer(y ~ x + (1 | id), data = exercise, family = binomial)

# Friedman test
friedman.test(cbind(x1, x2, x3))

# Simple logistic regression
glm(y ~ x, family = binomial)

# Factorial logistic regression
summary(glm(y ~ x1 * x2, data = hsb2, family = binomial))

# Multiple logistic regression
glm(y ~ x1 + x2, family = binomial)

# Analysis of covariance
summary(aov(y ~ x1 + x2))

# One-way MANOVA
summary(manova(cbind(x1, x2, x3) ~ x4))

# Multivariate multiple regression
require(car)
M1 <- lm(cbind(x1, x2) ~ x3 + x4 + x5 + x6, data = hsb2)
summary(Anova(M1))

# Canonical correlation
require(CCA)
cc(cbind(x1, x2), cbind(x3, x4))

# Factor analysis
require(psych)
fa(r = cor(model.matrix(~x1 + x2 + x3 + x4 + x5 - 1, data = hsb2)), rotate = "none", fm = "pa", 5)

# Principle component
princomp(~x1 + x2 + x3 + x4 + x5, data = hsb2)


Examples

1 Smoker vs NonSmoker

nonsmokers = c(18,22,21,17,20,17,23,20,22,21)
mean(nonsmokers)
sd(nonsmokers)
sqrt(var(nonsmokers)/length(nonsmokers))
smokers = c(16,20,14,21,20,18,13,15,17,21)
mean(smokers)
sd(smokers)
sqrt(var(smokers)/length(smokers))

plot(density(nonsmokers))            # output not shown
plot(density(smokers))               # output not shown
boxplot(nonsmokers,smokers,ylab="Scores on Digit Span Task",
names=c("nonsmokers","smokers"),
main="Digit Span Performance by\n Smoking Status")

2 Formula

mean.diff = mean(smokers) - mean(nonsmokers)
df = length(smokers) + length(nonsmokers) - 2
pooled.var = (sd(smokers)^2 * (length(smokers)-1) + sd(nonsmokers)^2 * (length(nonsmokers)-1)) / df
se.diff = sqrt(pooled.var/(length(smokers) + pooled.var/length(nonsmokers))
t.obt = mean.diff / se.diff
t.obt
p.value = 2*pt(t.obt,df=df)
p.value

3 odds ratios

odds.ratios <- function(clicks1, clicks2, nonclicks1, nonclicks2) {

#nonblank over blank

or = (clicks2/nonclicks2)/(clicks1/nonclicks1);

or.lower <- exp(log(or)-1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));

or.upper <- exp(log(or)+1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));

return(c(or, or.lower, or.upper))

#odds ratios

clicks1 <- as.numeric(data[2,3])

clicks2 <- as.numeric(data[3,3])

nonclicks1 <- as.numeric(data[2,2])-as.numeric(data[2,3])

nonclicks2 <- as.numeric(data[3,2])-as.numeric(data[3,3])

odds.ratios(clicks1, clicks2, nonclicks1, nonclicks2)

# Fisher's exact test

fisher.test(matrix(c(nonclicks1, clicks1, nonclicks2, clicks2), ncol = 2, byrow = T))


#### SAS

proc sql;

CONNECT TO NETEZZA (USER=*** PASSWORD=*** SERVER="***"DATABASE="ANALYTICS_STG");

create table data as

select * from connection to NETEZZA

(

select CAMPAIGN_ID,

SRCSYS_CELL_ID,

min(SRCSYS_CELL_NAME) SRCSYS_CELL_NAME,

SRCSYS_AUDIENCE_ID,

min(SRCSYS_AUDIENCE_NAME) SRCSYS_AUDIENCE_NAME,

count(distinct EMAIL_ADDRESS_ID) sent,

sum(case when upper(FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end) open,

sum(case when upper(FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end) click,

sum(case when upper(FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end) conv,

sum(case when upper(FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end) unsub

from ANALYTICS_STG..LH_persado_email_and_resp

group by 1,2,4

order by 1,2,4;

)

;

quit;

data data1;

set data;

select(SRCSYS_AUDIENCE_ID);

when (841791043) control = 1;
...

when (834298683) control = 1;

when (834299793) control = 0;

when (834299803) control = 0;

when (834021323) control = 0;

when (834021333) control = 0;

when (834021373) control = 1;

otherwise control=.;

end;

if campaign_id = 9910 then delete;

open_rate=open/sent;

run;

/** One sample Binomial test;

proc freq data=data1;

by SRCSYS_AUDIENCE_ID;

table control / binomial(p=0.5);

exact binomial;

run;

**/



** Each Individual Campaigns;

** Two sample Binomial test;



proc ttest data=data1;

by campaign_id;

class control;

var open_rate;

run;

** Noparameteric Wilcoxon-Mann-Whitney test;

proc npar1way data=data1;

by campaign_id;

class control;

var open_rate;

run;

data data5;

set data1;

open_flag=1;

counts=open; output;

open_flag=0;

counts=sent-open; output;

run;

** Chisq Test;

proc freq data = data5;

by campaign_id;

weight counts;

tables control*open_flag / chisq;

run;



** Fisher's exact test;

proc freq data = data5;

by campaign_id;

weight counts;

tables control*open_flag / fisher;

run;



** McNemar test;

proc freq data=data5;

by campaign_id;

tables control*open_flag

exact mcnem;

weight counts;

run;




### All Campaigns

** One-way ANOVA;

proc glm data = data1;

class control campaign_id;

model open_rate = control campaign_id;

means control;

run;

quit;







** Paired t-test;

proc sql;

create table data2 as

select CAMPAIGN_ID, control,

sum(sent) as sent,

sum(open) as open,

sum(open)/sum(sent) as open_rate

from data1

group by CAMPAIGN_ID, control

;

run;

quit;

proc transpose data=data2 prefix=control out=data3;

by campaign_id;

id control;

var open_rate;

run;

proc ttest data = data3;

paired control0*control1;

run;







** Wilcoxon signed rank sum test;

data data4;

set data3;

diff = control0 - control1;

run;

proc univariate data=data4;

var diff;

run;

** McNemar test;

proc freq data=data5;

tables control*open_flag;

exact mcnem;

weight counts;

run;

proc freq data=data5;

by campaign_id;

tables control*open_flag;

exact mcnem;

weight counts;

run;



** logistic regression;

proc genmod data= data5 descending;

class campaign_id control;

freq counts;

model open_flag = campaign_id control/ dist=binomial link=logit;

run;

proc logistic data= data5 descending;

class campaign_id control;

freq counts;

model open_flag = campaign_id control/ scale = none aggregate;

output out=plot p=p l=l u=u;

run;



proc sgplot data=plot;

band x=control

lower=l

upper=u /group=campaign_id

transparency=.5;

series x=control

y=p /group=campaign_id;

run;

My DS Coding Bolg

Wednesday, November 9, 2011

Hypothesis Test in R and SAS

No comments:

Post a Comment

Blog Archive