Wednesday, November 9, 2011

Mail Solicitation and Response Using SQL

/***********************************************************************/
-- Mail Solicitations;
/***********************************************************************/
-- create features;
drop table ANALYTICS_STG..CAMPAIGN_ADDRESS_3;
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_3 as
select a.*,
row_number() over(partition by a.EMAIL_ADDRESS_ID order by a.EMAIL_ADDRESS_ID, a.SEND_TS) seq_freq,
extract(DAY from a.SEND_TS - LAG(a.SEND_TS, 1) OVER (partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEND_TS)) as recency,
b.reach_freq as tot_freq
from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 a
join (select EMAIL_ADDRESS_ID, count(*) as reach_freq from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 group by 1) b
on a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID;


/***********************************************************************/
/* Multiple Email Responses */
/***********************************************************************/
/* Email Promotions with EM feedback */
drop table ANALYTICS_STG..RESPONSE_ADDRESS_1;
create table ANALYTICS_STG..RESPONSE_ADDRESS_1 as
select a.CAMPAIGN_ID, a.ELECTRONIC_ADDRESS_ID, a.CONCEPT_ID, a.FEEDBACK_EVENT_CD, a.MAIL_TS,
a.FEEDBACK_EVENT_TS,
a.CUST_HHLD_ADDR_SKID,
a.CUSTOMER_ID,
a.SRCSYS_CUSTOMER_ID
from EDW_MCF_VW..CAMPAIGN_FEEDBACK_EM a
inner join (select CAMPAIGN_ID, EMAIL_ADDRESS_ID, count(*) as cnt from ANALYTICS_STG..CAMPAIGN_ADDRESS_3 group by 1,2) b
on a.CAMPAIGN_ID = b.CAMPAIGN_ID
and a.ELECTRONIC_ADDRESS_ID = b.EMAIL_ADDRESS_ID
where a.MAIL_TS between '2012-02-18' and '2014-03-08'
and a.FEEDBACK_EVENT_TS between '2012-02-18' and '2014-03-08'
;

-- dedupe email feedback;
drop table ANALYTICS_STG..RESPONSE_ADDRESS_2;
create table ANALYTICS_STG..RESPONSE_ADDRESS_2 as
select a.*,
case when upper(a.FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end FEEDBACK_EVENT_CD_1,
case when upper(a.FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end FEEDBACK_EVENT_CD_2,
case when upper(a.FEEDBACK_EVENT_CD)='UNDEL' then 1 else 0 end FEEDBACK_EVENT_CD_3,
case when upper(a.FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end FEEDBACK_EVENT_CD_4,
case when upper(a.FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end FEEDBACK_EVENT_CD_5,
case when upper(a.FEEDBACK_EVENT_CD)='SPAM' then 1 else 0 end FEEDBACK_EVENT_CD_6
from (
select *, row_number()
over (partition by CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS
order by  CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS, FEEDBACK_EVENT_TS
) row_id
from ANALYTICS_STG..RESPONSE_ADDRESS_1
) a
where a.row_id = 1;


/* 2.2.1 drop non-open, non-click, non-unsubscribe */
drop table ANALYTICS_STG..RESPONSE_ADDRESS_3;
create table ANALYTICS_STG..RESPONSE_ADDRESS_3 as
select a.*,
row_number() over (partition by a.ELECTRONIC_ADDRESS_ID order by a.ELECTRONIC_ADDRESS_ID, a.MAIL_TS, a.FEEDBACK_EVENT_TS) seq_engagement,
b.tot_engagement,
b.tot_open,
b.tot_click,
b.tot_unsub,
extract(DAY from (a.FEEDBACK_EVENT_TS - a.MAIL_TS)) as feedback_day
from ANALYTICS_STG..RESPONSE_ADDRESS_2 a
inner join (select ELECTRONIC_ADDRESS_ID,
count(*) as tot_engagement,
sum(FEEDBACK_EVENT_CD_1) as tot_open,
sum(FEEDBACK_EVENT_CD_2) as tot_click,
sum(FEEDBACK_EVENT_CD_5) as tot_unsub
from ANALYTICS_STG..RESPONSE_ADDRESS_2
where FEEDBACK_EVENT_CD_1 = 1 or FEEDBACK_EVENT_CD_2 = 1 or FEEDBACK_EVENT_CD_5 = 1
group by 1
) b
on a.ELECTRONIC_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
where a.FEEDBACK_EVENT_CD_1 = 1 or a.FEEDBACK_EVENT_CD_2 = 1 or a.FEEDBACK_EVENT_CD_5 = 1;

/***********************************************************************/
/* Responses with email promotion; */
/***********************************************************************/
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 as
select a.*,
b.FEEDBACK_EVENT_TS,
b.FEEDBACK_EVENT_CD,
b.FEEDBACK_EVENT_CD_1,
b.FEEDBACK_EVENT_CD_2,
b.FEEDBACK_EVENT_CD_5,
b.SEQ_ENGAGEMENT,
b.TOT_CLICK,
b.TOT_OPEN,
b.TOT_ENGAGEMENT,
b.TOT_UNSUB,
b.FEEDBACK_DAY
from  ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
inner join ANALYTICS_STG..RESPONSE_ADDRESS_3 b
on a.CAMPAIGN_ID = b.CAMPAIGN_ID
and a.CONCEPT_ID = b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
and a.SEND_TS = b.MAIL_TS
;


/*2.2.2 Attributed Solicitation/Promotion  and Response*/
drop table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_2;
-- 2,147,483,647 rows affected;
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_2 as
select a.*,
b.FEEDBACK_EVENT_TS,
b.FEEDBACK_EVENT_CD,
b.FEEDBACK_EVENT_CD_1,
b.FEEDBACK_EVENT_CD_2,
b.FEEDBACK_EVENT_CD_5,
b.TOT_ENGAGEMENT,
b.SEQ_ENGAGEMENT,
b.TOT_CLICK,
b.TOT_OPEN,
b.TOT_UNSUB,
b.FEEDBACK_DAY,
-- multiply by 1.0 to avoid integer division when computing cumulative rates
1.0*SUM(case when b.FEEDBACK_EVENT_CD_1=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_OPEN_PERC,
1.0*SUM(case when b.FEEDBACK_EVENT_CD_2=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_CLICK_PERC,
1.0*SUM(case when b.FEEDBACK_EVENT_CD_5=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_UNSUB_PERC
from ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
left join ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 b
on a.CAMPAIGN_ID=b.CAMPAIGN_ID
and a.CONCEPT_ID=b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID
and a.SEQ_FREQ = b.SEQ_FREQ
;

-- Sequential Frequency
select SEQ_FREQ,
NTILE(20) OVER(ORDER BY SEQ_FREQ DESC NULLS LAST) as SEQ_FREQ_DECILE,
count(distinct EMAIL_ADDRESS_ID) cnt,
sum(FEEDBACK_EVENT_CD_1) open,
sum(FEEDBACK_EVENT_CD_2) click,
sum(FEEDBACK_EVENT_CD_5) unsubscrib
from ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_2
group by 1
order by 1;


select aa.seq_freq,
aa.cnt,
SUM(aa.cnt) OVER(ORDER BY aa.SEQ_FREQ rows unbounded preceding) cum_email,
aa.OPEN,
SUM(aa.open) OVER(ORDER BY aa.SEQ_FREQ rows unbounded preceding) cum_open,
aa.CLICK,
SUM(aa.click) OVER(ORDER BY aa.SEQ_FREQ rows unbounded preceding) cum_click,
aa.UNSUBSCRIB,
SUM(aa.unsubscrib) OVER(ORDER BY aa.SEQ_FREQ rows unbounded preceding) cum_unsubscrib
from lh_tmp aa
order by aa.SEQ_FREQ;


PROC LOGISTIC to handle Randomized Complete Block Design (RCBD)

The main idea of blocking is to ensure that the random sample you draw from the population of interest does not, by random chance, have some undesirable properties. 

*Use risk scores (riskScr) and response groups (respGrp) to create blocks;
data blocks;
set s.dem0303;
if riskScr < 650 then riskGrp=1;
else if riskScr < 750 then riskGrp=2;
else riskGrp =3;
if respScr < 260 then respGrp=1;
else if respScr < 280 then respGrp=2;
else respGrp =3;
block = compress(riskGrp||respGrp);
random=ranuni(8675309);
run;

proc sort data=blocks;
by block;
run;

*randomly assigned units to treatments within each block;
proc rank data=blocks out=blocks groups=8;
by block;
var random;
ranks treatment;
run;

*Generate design matrix (factors: interestRate sticker priceGraphic);
data blocks;
set blocks;
array factors(3) interestRate sticker priceGraphic;
do i = 1 to dim(factors);
factors(i)=input(substr(put(treatment,binary3.),i,1),1.);
end;
run;

*Analysis after collecting the data;
*Block | InterestRate | Sticker | PriceGraphic;
ods select globalTests type3;
title "Block | InterestRate | Sticker | PriceGraphic";
proc logistic data=s.dem0304 namelen=40;
class block interestRate sticker priceGraphic;
model response(event="1")=
block
interestRate|sticker|priceGraphic @1
interestRate|sticker|priceGraphic @2
interestRate|sticker|priceGraphic;
run;

Note:
It is possible to get the same parameter estimates in a slightly easier-to-read format. The above code uses the '@' notation in the MODEL statement to arrange the terms in the model by their polynomial order. A|B|C in a MODEL statement yields every term from the main effects up to the three-factor interaction, as written out in the sketch below.
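For reference, the bar and '@' shorthand is equivalent to writing every term explicitly; a sketch of the expanded form, using the same data set and factors as above:

proc logistic data=s.dem0304 namelen=40;
class block interestRate sticker priceGraphic;
model response(event="1")=
block
interestRate sticker priceGraphic /* main effects (@1) */
interestRate*sticker interestRate*priceGraphic sticker*priceGraphic /* two-factor interactions (@2) */
interestRate*sticker*priceGraphic; /* three-factor interaction */
run;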

SAS Tricks: Padding zeros & Transpose Cross product sales

## Padding zeros
data expiredate;
do j=2010 to 2011;
if _n_=1 then exp_date=mdy(1,1,j);
i=1;
format exp_date monyy.;
exp_wk=put(year(exp_date),4.)||put(i,z2.);
output;

do i=2 to int(365/7);
exp_date=exp_date+7;
format exp_date monyy.;
exp_wk=put(year(exp_date),4.)||put(i,z2.);
output;
end;

end;
run;

goptions ftext='Arial' ctext=BLACK htext=1.5 cells dev=activex;
ods html file="C:\expdate.html";
ods listing close;
options nodate nonumber;
proc print data=expiredate;
run;
ods html close;
ods listing;

## Transpose Cross product sales
data out3.multiple;
set out1.multiple;
length multiple_purchses $50 purchse_type $20;
array num(10) num_1110 num_1140 num_1160 num_1210 num_1610 num_1720 num_1750 num_1810 num_1820 num_1860;
multiple_purchses=put(num(1),3.);
do i=2 to 10;
multiple_purchses=compress(multiple_purchses||'-'||put(num(i),3.));
end;

purchse_type=put(min(num(1),1),3.);
do i=2 to 10;
purchse_type=compress(purchse_type||'-'||put(min(num(i),1),3.));
end;
run;

How SAS handles missing values

If you have missing data, it is important to understand how SAS procedures handle it. To know how a procedure handles missing data, you should consult the SAS manual. Here is a brief overview of how some common SAS procedures handle missing data.
  • proc means
For each variable, the number of non-missing values is used
  • proc freq
    By default, missing values are excluded and percentages are based on the number of non-missing values. If you use the missing option on the tables statement, the percentages are based on the total number of observations (non-missing and missing) and the percentage of missing values are reported in the table.
  • proc corr
    By default, correlations are computed based on the number of pairs with non-missing data (pairwise deletion of missing data). The nomiss option can be used on the proc corr statement to request that correlations be computed only for observations that have non-missing data for all variables on the var statement (listwise deletion of missing data).
  • proc reg
    If any of the variables on the model or var statement are missing, they are excluded from the analysis (i.e., listwise deletion of missing data)
  • proc factor
    Missing values are deleted listwise, i.e., observations with missing values on any of the variables in the analysis are omitted from the analysis.
  • proc glm
    The handling of missing values in proc glm can be complex to explain. If you have an analysis with just one variable on the left side of the model statement (just one outcome or dependent variable), observations are eliminated if any of the variables on the model statement are missing. Likewise, if you are performing a repeated measures ANOVA or a MANOVA, then observations are eliminated if any of the variables in the model statement are missing. For other situations, see the SAS/STAT manual about proc glm.
*Get missing value frequencies of numeric variables;
proc freq data=raw;
tables _numeric_ / missing;
run;

*Get missing value levels of categorical variables;
data class;
if 0 then set sashelp.class;
do i=1 to 10;output;
end;
stop;
run;

proc format;
value allmiss ._-.z=. other=1;
value $allmiss ' '=' ' other='1';
run;

ods select nlevels;
ods output nlevels=nlevels(keep=TableVar NNonMissLevels where=(NNonMissLevels=0));
proc freq data=class nlevels;
tables _all_;
format _character_ $allmiss. _numeric_ allmiss.;
run;
ods output close; 

Hypothesis Test in R and SAS

### R
The t.test( ) function produces a variety of t-tests.
Unlike most statistical packages, the default assumes unequal variance and applies the Welch df modification.
# one sample t-test
t.test(y,mu=3) # Ho: mu=3
# one sample median test
wilcox.test(y, mu=3)
# one sample binomial test
prop.test(sum(y), length(y), p=0.5)
# Chi-square goodness of fit
x <- c(172, 7, 6, 15)
chisq.test(x, p=c(10,10,10,70)/100)

# McNemar's Chi-squared test with continuity correction
X <- matrix(c(172, 7, 6, 15), 2, 2)
mcnemar.test(X)

# Wilcoxon-Mann-Whitney rank sum test
wilcox.test(y ~ x)

# Chi-square test
chisq.test(table(y, x))

# Fisher's exact test
fisher.test(table(y, x))

# Correlation
cor(y, x)
cor.test(y, x)

# Kruskal Wallis rank sum test
kruskal.test(y, x)

# Non-parametric correlation
cor.test(y, x, method = "spearman")


## Mutual Information
require(entropy)   # mi.plugin() is in the entropy package
mi.plugin(table(all$y, all$is_verified))

## Jaccard Similarity
1-dist(rbind(all$y, all$is_verified), method= "binary")

# Simple linear regression
lm(y ~ x)

# Multiple regression
lm(y ~ x1 + x2 + x3 + x4 + x5)

# independent 2-group t-test
t.test(y~x) # where y is numeric and x is a binary factor

# independent 2-group t-test
t.test(y1,y2) # where y1 and y2 are numeric
t.test(y1,y2, alternative="greater", var.equal=T)
# paired t-test
t.test(y1,y2,paired=TRUE) # where y1 & y2 are numeric

# Wilcoxon signed rank sum test
wilcox.test(y1, y2, paired = TRUE)

# One-way ANOVA
summary(aov(y ~ x))

# Factorial ANOVA
anova(lm(y ~ x1 * x2, data = hsb2))

# One-way repeated measures ANOVA
require(car)
require(foreign)
model <- lm(y ~ a + s, data = kirk)
analysis <- Anova(model, idata = kirk, idesign = ~s)
print(analysis)

# Repeated measures logistic regression
require(lme4)
glmer(y ~ x + (1 | id), data = exercise, family = binomial)

# Friedman test
friedman.test(cbind(x1, x2, x3))

# Simple logistic regression
glm(y ~ x, family = binomial)

# Factorial logistic regression
summary(glm(y ~ x1 * x2, data = hsb2, family = binomial))

# Multiple logistic regression
glm(y ~ x1 + x2, family = binomial)

# Analysis of covariance
summary(aov(y ~ x1 + x2))

# One-way MANOVA
summary(manova(cbind(x1, x2, x3) ~ x4))

# Multivariate multiple regression
require(car)
M1 <- lm(cbind(x1, x2) ~ x3 + x4 + x5 + x6, data = hsb2)
summary(Anova(M1))

# Canonical correlation
require(CCA)
cc(cbind(x1, x2), cbind(x3, x4))

# Factor analysis
require(psych)
fa(r = cor(model.matrix(~x1 + x2 + x3 + x4 + x5 - 1, data = hsb2)), rotate = "none", fm = "pa", 5)

# Principal components
princomp(~x1 + x2 + x3 + x4 + x5, data = hsb2)


Examples

1 Smoker vs NonSmoker

nonsmokers = c(18,22,21,17,20,17,23,20,22,21)
mean(nonsmokers)
sd(nonsmokers)
sqrt(var(nonsmokers)/length(nonsmokers))
smokers = c(16,20,14,21,20,18,13,15,17,21)
mean(smokers)
sd(smokers)
sqrt(var(smokers)/length(smokers))

plot(density(nonsmokers))            # output not shown
plot(density(smokers))               # output not shown
boxplot(nonsmokers,smokers,ylab="Scores on Digit Span Task",
names=c("nonsmokers","smokers"),
main="Digit Span Performance by\n Smoking Status")

2 Formula

mean.diff = mean(smokers) - mean(nonsmokers)
df = length(smokers) + length(nonsmokers) - 2
pooled.var = (sd(smokers)^2 * (length(smokers)-1) + sd(nonsmokers)^2 * (length(nonsmokers)-1)) / df
se.diff = sqrt(pooled.var/length(smokers) + pooled.var/length(nonsmokers))
t.obt = mean.diff / se.diff
t.obt
p.value = 2*pt(-abs(t.obt),df=df)
p.value

3 odds ratios
odds.ratios <- function(clicks1, clicks2, nonclicks1, nonclicks2) {
#nonblank over blank
or = (clicks2/nonclicks2)/(clicks1/nonclicks1);
or.lower <- exp(log(or)-1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));
or.upper <- exp(log(or)+1.96*sqrt(1/clicks1 + 1/clicks2 + 1/nonclicks1 + 1/nonclicks2));
return(c(or, or.lower, or.upper))
}

#odds ratios
clicks1 <- as.numeric(data[2,3])
clicks2 <- as.numeric(data[3,3])
nonclicks1 <- as.numeric(data[2,2])-as.numeric(data[2,3])
nonclicks2 <- as.numeric(data[3,2])-as.numeric(data[3,3])
odds.ratios(clicks1, clicks2, nonclicks1, nonclicks2)

# Fisher's exact test
fisher.test(matrix(c(nonclicks1, clicks1, nonclicks2, clicks2), ncol = 2, byrow = T))




#### SAS
proc sql;
CONNECT TO NETEZZA (USER=*** PASSWORD=*** SERVER="***" DATABASE="ANALYTICS_STG");
create table data as
select * from connection to NETEZZA
(
select CAMPAIGN_ID,
SRCSYS_CELL_ID,
min(SRCSYS_CELL_NAME) SRCSYS_CELL_NAME,
SRCSYS_AUDIENCE_ID,
min(SRCSYS_AUDIENCE_NAME) SRCSYS_AUDIENCE_NAME,
count(distinct EMAIL_ADDRESS_ID) sent,
sum(case when upper(FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end) open,
sum(case when upper(FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end) click,
sum(case when upper(FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end) conv,
sum(case when upper(FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end) unsub
from ANALYTICS_STG..LH_persado_email_and_resp
group by 1,2,4
order by 1,2,4
)
;
quit;
data data1;
set data;
select(SRCSYS_AUDIENCE_ID);
when (841791043) control = 1;
...
when (834298683) control = 1;
when (834299793) control = 0;
when (834299803) control = 0;
when (834021323) control = 0;
when (834021333) control = 0;
when (834021373) control = 1;
otherwise control=.;
end;
if campaign_id = 9910 then delete;
open_rate=open/sent;
run;
/** One sample Binomial test;
proc freq data=data1;
by SRCSYS_AUDIENCE_ID;
table control / binomial(p=0.5);
exact binomial;
run;
**/
** Each Individual Campaign;
** Two sample Binomial test;
proc ttest data=data1;
by campaign_id;
class control;
var open_rate;
run;
** Nonparametric Wilcoxon-Mann-Whitney test;
proc npar1way data=data1;
by campaign_id;
class control;
var open_rate;
run;
data data5;
set data1;
open_flag=1;
counts=open; output;
open_flag=0;
counts=sent-open; output;
run;
** Chisq Test;
proc freq data = data5;
by campaign_id;
weight counts;
tables control*open_flag / chisq;
run;
** Fisher's exact test;
proc freq data = data5;
by campaign_id;
weight counts;
tables control*open_flag / fisher;
run;
** McNemar test;
proc freq data=data5;
by campaign_id;
tables control*open_flag;
exact mcnem;
weight counts;
run;
** All Campaigns;
** One-way ANOVA;
proc glm data = data1;
class control campaign_id;
model open_rate = control campaign_id;
means control;
run;
quit;
** Paired t-test;
proc sql;
create table data2 as
select CAMPAIGN_ID, control,
sum(sent) as sent,
sum(open) as open,
sum(open)/sum(sent) as open_rate
from data1
group by CAMPAIGN_ID, control
;
run;
quit;
proc transpose data=data2 prefix=control out=data3;
by campaign_id;
id control;
var open_rate;
run;
proc ttest data = data3;
paired control0*control1;
run;
** Wilcoxon signed rank sum test;
data data4;
set data3;
diff = control0 - control1;
run;
proc univariate data=data4;
var diff;
run;
** McNemar test;
proc freq data=data5;
tables control*open_flag;
exact mcnem;
weight counts;
run;
proc freq data=data5;
by campaign_id;
tables control*open_flag;
exact mcnem;
weight counts;
run;
** logistic regression;
proc genmod data= data5 descending;
class campaign_id control;
freq counts;
model open_flag = campaign_id control/ dist=binomial link=logit;
run;
proc logistic data= data5 descending;
class campaign_id control;
freq counts;
model open_flag = campaign_id control/ scale = none aggregate;
output out=plot p=p l=l u=u;
run;
proc sgplot data=plot;
band x=control
lower=l
upper=u /group=campaign_id
transparency=.5;
series x=control
y=p /group=campaign_id;
run;

INFORMAT AND FORMAT

INFORMAT gives SAS special instructions for reading a variable.

FORMAT gives SAS special instructions for writing a variable.

Character to Numeric: newvar=input(oldvar, informat);
The informat must be of the type you are converting to (numeric).

Numeric to Character: newvar=put(oldvar, format);
The format must be of the type you are converting from (numeric).
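A minimal sketch of both conversions (the variable names and the particular informat/format chosen here are hypothetical examples):

data convert;
charstr='20110315';                  /* character date string */
num=1234.5;                          /* numeric value */
newnum=input(charstr, yymmdd8.);     /* character to numeric, read with a numeric informat */
newchar=put(num, 8.1);               /* numeric to character, written with a numeric format */
format newnum date9.;
run;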

PCA vs. EFA

A principal component is a linear combination of weighted observed variables. Principal components are uncorrelated and orthogonal. 

Principal component analysis minimizes the sum of the squared perpendicular distances to the axis of the principal component, while least squares regression minimizes the sum of the squared vertical distances (perpendicular to the x axis, not perpendicular to the fitted line) (Truxillo, 2003). Principal component scores are actual scores.

Principal Component Analysis (PCA) 

- Is a variable reduction technique  
- Is used when variables are highly correlated  
- Reduces the number of observed variables to a smaller number of principal components which account for most of the variance of the observed variables 
- Is a large sample procedure

Eigenvectors are the weights in a linear transformation when computing principal component scores. 

Eigenvalues indicate the amount of variance explained by each principal component or each factor. 

Exploratory Factor Analysis (EFA) 

A latent construct can be measured indirectly by determining its influence on responses to measured variables. A latent construct is also referred to as a factor, underlying construct, or unobserved variable. Unique factors refer to unreliability due to measurement error and variation in the data. Factor scores are estimates of underlying latent constructs.

An observed variable “loads” on a factor if it is highly correlated with that factor, i.e., has an eigenvector element of greater magnitude on that factor.

Communality is the variance in observed variables accounted for by the common factors. Communality is more relevant to EFA than PCA (Hatcher, 1994). 

- Is a variable reduction technique which identifies the number of latent constructs and the underlying factor structure of a set of variables  
- Hypothesizes an underlying construct, a variable not measured directly 
- Estimates factors which influence responses on observed variables  
- Allows you to describe and identify the number of latent constructs (factors) 
- Includes unique factors, error due to unreliability in measurement 
- Traditionally has been used to explore the possible underlying factor structure of a set of measured variables without imposing any preconceived structure on the outcome.

Feature selection approaches try to find a subset of the original variables (also called features or attributes). There are three strategies:
filter (e.g., information gain)
wrapper (search guided by predictive accuracy)
embedded (features are added or removed while building the model, based on prediction error)

The main linear technique for dimensionality reduction, principal component analysis, performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. In practice, the covariance (and sometimes the correlation) matrix of the data is constructed and the eigenvectors of this matrix are computed. The eigenvectors that correspond to the largest eigenvalues (the principal components) can now be used to reconstruct a large fraction of the variance of the original data. Moreover, the first few eigenvectors can often be interpreted in terms of the large-scale physical behavior of the system. The original space (with dimension equal to the number of variables) has been reduced (with data loss, but hopefully retaining the most important variance) to the space spanned by a few eigenvectors.
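A minimal PROC PRINCOMP sketch of this idea (the data set name and variable list are placeholders):

* principal components from the correlation matrix, keeping the first 3 component scores;
proc princomp data=dataset out=pca_scores n=3;
var &varlist;
run;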


PROC CLUSTER, PROC FASTCLUS to CLUSTERING


The following SAS code uses PROC CLUSTER to generate the seeds (centroids/initial points), and then PROC FASTCLUS to create the clusters.

%let demo=var1 var2 ;

title2 'Hierarchical Solution (WARD''S)';
proc cluster data=out1.training method=ward k=10 trim=0.1 outtree=tree noprint;
var &demo;
copy keys;
run;

proc tree data=tree nclusters=5 dock=5 out=out1.results noprint;
copy &demo;
run;

proc freq data=out1.results ;
table cluster;
run;

/* generate the centroids of the hierarchical clusters */
title1 'Cluster Centroids';
proc means data=out1.results;
class cluster;
var &demo;
output mean= out=centroids(where=(_type_ = 1));
run;

title1 'Score Development Data against the Centroids';
proc fastclus data=out1.training seed=centroids maxclusters=5 least=2 out=results noprint;
var &demo;
run;

title1 'USS (5 clusters)';
proc means data=results uss;
var distance;
run;

proc freq data=results;
table cluster;
run; 

Discrete Distribution and Continuous Distributions

Discrete Distribution

Bernoulli distribution / Binomial distribution

Multinoulli distribution (categorical distribution) / Multinomial distribution

Poisson distribution

Uniform distribution

Continuous Distribution

Normal distribution
1 It has two parameters which are easy to interpret, and which capture some of the most basic properties of a distribution, namely its mean and variance.
2 The central limit theorem tells us that sums of independent random variables have an approximately normal distribution, making it a good choice for modeling residual errors or noise.
3 The normal distribution makes the least number of assumptions (i.e., has maximum entropy), subject to the constraint of having a specified mean and variance; this makes it a good default choice in many cases.
4 It has a simple mathematical form, which results in easy-to-implement but often highly effective methods.

Student t distribution
Student t is more robust because it has heavier tails, at least for small v.
If v=1, this distribution is known as the Cauchy or Lorentz distribution. This is notable for having such heavy tails that the integral that defines the mean does not converge.
To ensure finite variance, require v>2. 
It is common to use v=4, which gives good performance in a range of problems. 
The smaller v is, the fatter the tails become. For v>>5, the Student distribution rapidly approaches a normal and loses its robustness properties.
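For reference, the standard Student t density with $\nu$ degrees of freedom is

$$p(x \mid \nu) = \frac{\Gamma\left(\tfrac{\nu+1}{2}\right)}{\sqrt{\nu\pi}\,\Gamma\left(\tfrac{\nu}{2}\right)} \left(1 + \frac{x^2}{\nu}\right)^{-(\nu+1)/2},$$

which reduces to the Cauchy density $1/\big(\pi(1+x^2)\big)$ at $\nu=1$ and has variance $\nu/(\nu-2)$ for $\nu>2$.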


Laplace distribution
It is robust to outliers, and also puts more probability density at 0 than the normal. It is a useful way to encourage sparsity in a model.
Note that the Student distribution is not log-concave for any parameter value, unlike the Laplace distribution, which is always log-concave (and log-convex...). Nevertheless, both are unimodal.


Gamma distribution
1 a is the shape parameter, b is the rate parameter.
2 if a<=1, the mode is at 0, otherwise the mode is > 0. As the rate b increases, the horizontal scale reduces, thus squeezing everything leftwards and upwards.

Beta distribution
1 if a=b=1, the beta becomes the uniform distribution.
2 if a and b are both less than 1, we get a bimodal distribution with spikes at 0 and 1.
3 if a and b are both greater than 1, the distribution is unimodal.


Pareto distribution is used to model the distribution of quantities that exhibit long tails, i.e., heavy tails.
1 if we plot the distribution on a log-log scale, it forms a straight line of the form log p(x) = a log x + c. This is known as a power law.
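For reference, the densities behind the statements above are

$$\mathrm{Ga}(x \mid a, b) = \frac{b^a}{\Gamma(a)} x^{a-1} e^{-bx}, \quad x > 0 \quad (\text{mode } (a-1)/b \text{ for } a \ge 1),$$
$$\mathrm{Beta}(x \mid a, b) = \frac{x^{a-1}(1-x)^{b-1}}{B(a,b)}, \quad 0 < x < 1,$$
$$\mathrm{Pareto}(x \mid k, m) = k\, m^k\, x^{-(k+1)}, \quad x \ge m, \quad \text{so } \log p(x) = -(k+1)\log x + \text{const}.$$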

Joint Probability Distribution

Uncorrelated does not imply independent.
Note that the correlation reflects the noisiness and direction of a linear relationship, but not the slope of that relationship, nor many aspects of nonlinear relationships.

Multivariate Gaussian distribution

Multivariate Student's t distribution

Dirichlet distribution
natural generalization of the beta distribution

Transformation of random variables

Linear transformation

General transformation - Jacobian matrix J

Central limit theorem

Monte Carlo approximation

Generate S samples from the distribution, e.g., using MCMC (Markov chain Monte Carlo), and use Monte Carlo to approximate the expected value of any function of a random variable; that is, simply draw samples and then compute the mean of the function applied to the samples.

Information Theory

Entropy

The entropy of a random variable is a measure of its uncertainty.

H(X) = - sum_1^K p(X=k) log_2 p(X=k)

The cross entropy is the average number of bits (short for binary digits, log base 2) needed to encode data coming from a source with distribution p when model q is used to define the codebook.
The maximum-entropy distribution is the uniform distribution.
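In symbols, for a discrete distribution with K states,

$$H(p) = -\sum_{k=1}^{K} p_k \log_2 p_k, \qquad H(p, q) = -\sum_{k=1}^{K} p_k \log_2 q_k,$$

and the uniform distribution attains the maximum $H = \log_2 K$.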

KL divergence

One way to measure the dissimilarity of two probability distributions, p and q, is known as the Kullback-Leibler divergence (KL divergence) or relative entropy.

KL(P(X) || Q(X)) = sum_x p(X=x) log_2 (p(X=x)/q(X=x))
= sum_1^K p(X=k) log_2 p(X=k) - sum_1^K p(X=k) log_2 q(X=k)
= H(p, q) - H(p)

H(p,q) is called the cross entropy, which is the average number of bits needed to encode data coming from a source with distribution p when we use model q to define the codebook. Hence the regular entropy is the expected number of bits if we use the true model, and the KL divergence is the difference between the two. In other words, the KL divergence is the average number of extra bits needed to encode the data, due to the fact that we used distribution q to encode the data instead of the true distribution p.
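A small worked example: with $p = (1/2, 1/2)$ and $q = (3/4, 1/4)$,

$$KL(p \,\|\, q) = \tfrac{1}{2}\log_2\tfrac{1/2}{3/4} + \tfrac{1}{2}\log_2\tfrac{1/2}{1/4} \approx -0.292 + 0.5 \approx 0.21 \text{ bits},$$

i.e., about 0.21 extra bits per symbol are paid for encoding a fair coin with a code built for a 3:1 biased coin.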

Mutual Information

The mutual information of X and Y is defined as the reduction in uncertainty about X after observing Y, or by symmetry, the reduction in uncertainty about Y after observing X.

I(X, Y) = KL(P(X,Y) | P(X)P(Y)) = sum_x sum_y p(x,y) log( p(x,y) / (p(x)p(y)) )
=H(X) + H(Y) - H(X, Y)
= H(X, Y) - H(X|Y) - H(Y|X)

H(X) and H(Y) are the marginal entropies, H(X|Y) and H(Y|X) are the conditional entropies, and H(X, Y) is the joint entropy of X and Y.


Types of Joins

Inner joins: return matched rows in both tables.
select *
from TableA A inner join TableB B
on A.key=B.key;

Outer joins include the left joins, right joins and full joins.
Left joins: return matched rows in both tables and non-matched rows in the left table.
select *
from TableA A left join TableB B
on A.key=B.key;

-- left anti-join: rows in TableA with no match in TableB
select *
from TableA A left join TableB B
on A.key=B.key
where B.Key is NULL;

Right joins: return matched rows in both tables and non-matched rows in the right table.
select *
from TableA A right join TableB B
on A.key=B.key;

select *
from TableA A right join TableB B
on A.key=B.key
where A.Key is NULL;

Full joins: return matched, plus all non-matched rows from both tables.
select *
from TableA A full outer join TableB B
on A.key=B.key;

select *
from TableA A full outer join TableB B
on A.key=B.key
where A.Key is NULL or B.Key is NULL; 

Types of Set Operations

1 Intersect: unique rows common in both tables.
select * from one
intersect
select * from two;

Inner Join
select ...
from one a
inner join two b
on  a.key = b.key

2 Union: unique rows from both tables.
select * from one
union
select * from two;

Left Join
select ...
from one a
left join two b
on  a.key = b.key

select ...
from one a
left join two b
on  a.key = b.key
where b.key is NULL

3 Outer Union: all rows from both tables, unique or non-unique are selected.
select * from one
outer union
select * from two;

Outer Join
select ...
from one a
full outer join two b
on  a.key = b.key

4 Except: unique rows from the first table that are not in the second table.
select * from one
except
select * from two;

Outer Join

select ...
from one a
full outer join two b
on  a.key = b.key
where b.key is NULL






Creating and Modifying Tables

Create Tables
create table three
(
char1 char(3),
date1 num format=date10.,
num1 num
);


Insert rows
Insert into three
values('1', '01mar90'd, .33);

Insert into three
select * from one, two where one.key=two.key;

Create views: read-only
create view onetwo as
select * from one, two where one.key=two.key;

Create index
simple: based on one column.
create index key on three(key);
composite: based on more than one column
create index keys on three(key1, key2);

Update Data Values
update three
set salary=salary*
case when substr(jobcode,3,1)='1' then 1.05
when substr(jobcode,3,1)='2' then 1.08
when substr(jobcode,3,1)='3' then 1.10
else 1.08
end; 

%missingPattern to Study Missing Data Patterns


The macro is designed to look at missing data in four ways: the proportion of subjects with each pattern of missing data, the number and percentage of missing data for each individual variable, the concordance of missingness in any pair of variables, and possible unit nonresponse.
The SAS macro is %missingPattern:
%missingPattern(datain=, varlist=, exclude=, missPattern1=, dataout1=, missPattern2=, dataout2=, missPattern3=, dataout3=, missPattern4=, dataout4=)

Example:
data tmp;
set out1.training_motorcycle1;
if vn0451^=. then vn0451_log=log(1+vn0451);
else vn0451_log=.;
if vn0467^=. then vn0467_log=log(1+vn0467);
else vn0467_log=.;
if vn0479^=. then vn0479_log=log(1+vn0479);
else vn0479_log=.;
if vn0712^=. then vn0712_log=log(1+vn0712);
else vn0712_log=.;
if vn0717^=. then vn0717_log=log(1+vn0717);
else vn0717_log=.;
if vn0722^=. then vn0722_log=log(1+vn0722);
else vn0722_log=.;
run;

%let varlist=vn0451_log vn0467_log vn0479_log vn0712_log vn0717_log vn0722_log;

%missingPattern(datain=tmp, varlist=&varlist, exclude='FAULT', missPattern1='TRUE',
dataout1=result1,missPattern2='TRUE', dataout2=result2,
missPattern3='TRUE', dataout3=result3, missPattern4='TRUE', dataout4=result4);

ods html path = "/u/lhuang/" (url = none)
body = "temp.html"
style = Default;
Ods listing close;

proc print data=result1;
proc print data=result2;
proc print data=result3;
*proc print data=result4;
run;

* DOWNLOAD HTML FILES;
proc download infile="/u/lhuang/temp.html"
outfile="&LocalLocation/missing pattern.html";
run; 

PROC CLUSTER, PROC FASTCLUS to run Variable Selection for Clustering Analysis

Variable Standardization
Standardization refers to data transformation that involves correction of variables using either means or standard deviations or both, depending on the data and context of analysis. SAS PROC STDIZE allows users to standardize a set of variables using several criteria, including the following methods:

RANGE standardization is most helpful when variables are measured on different scales. Variables with large values and a wide range of variation have significant effects on the final similarity measure, so it is essential to make sure that each variable contributes evenly to the distance measure by means of data standardization.

MEAN/MEDIAN standardization offers centering: variables are adjusted using only the mean or median across the variables. The variable mean or median is subtracted from its original score; the standard deviation is not adjusted in this process.

STD standardization refers to correction of variables using the variable mean and standard deviation. The variable mean is subtracted from its original score and then divided by the standard deviation.

L (p) refers to adjusting the scores using Minkowski distance. For example, if p=2, then Euclidean distance is applied.

proc stdize data=dataset method=range out=outdataset outstat=stats;
var &varlist;
run;
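The other methods described above are requested the same way; a sketch (the data set name and variable list are placeholders):

* standardize to mean 0 and standard deviation 1;
proc stdize data=dataset method=std out=outdataset_std;
var &varlist;
run;

* center only, subtracting the median;
proc stdize data=dataset method=median out=outdataset_med;
var &varlist;
run;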

Standardization to unit variability can be seen as a special case of weighting, where the weights are the reciprocals of variability. In some cases, however, defining weights as inversely proportional to some measure of total variability can actually dilute the differences between clusters. This is a major reason why weights based on the sample range are usually more effective for clustering than weights based on the standard deviation.

Clustering Methodology

Types of Models

SAS offers different clustering algorithms including k-means clustering, non-parametric clustering, and hierarchical clustering.

1. k-means clustering is, perhaps, the most popular partitive clustering algorithm. One reason for its popularity is that the time required to reach convergence is proportional to the number of observations being clustered, which means it can be used to cluster large data sets. However, k-means clustering is inappropriate for small data sets (<100 cases): the solution becomes sensitive to the order of the observations and the variables, which is known as the order effect.

title 'K-Means Clustering using Adaptive Training';
proc fastclus data=dataset maxclusters=5 maxiter=100 least=2 drift replace=full distance out=cluster;
var &varlist;
run;

Option MAXCLUSTERS= specifies the maximum number of clusters allowed. Instead of MAXCLUSTERS=, the RADIUS= option can also establish the minimum distance criterion for selecting new seeds. No observation is considered as a new seed unless its minimum distance to previous seeds exceeds the value given by the RADIUS= option. The default value is 0.

Option MAXITER= specifies the number of iterations to facilitate convergence. By default, MAXITER=1. When the value of the MAXITER= option is greater than 0, each observation is assigned to the nearest seed, and the seeds are recomputed as the means of the clusters.

Option LEAST= specifies the distance metric. By default, LEAST=2, which is Euclidean distance. LEAST=1 implements city-block distance, and LEAST=MAX implements maximum absolute deviation. Note that PROC FASTCLUS requires numeric data and is sensitive to extreme values, so standardization during data preparation is important.

With the DRIFT option, the closest seed moves (drifts) as each case is assigned to it.

Option REPLACE= specifies how seed replacement is performed. Option REPLACE=FULL requests default seed replacement. Option REPLACE=PART requests seed replacement only when the distance between the observation and the closest seed is greater than the minimum distance between seeds. Option REPLACE=NONE suppresses seed replacement. Option REPLACE=RANDOM selects a simple pseudo-random sample of complete observations as initial cluster seeds.

Option DISTANCE computes distances between the cluster means.

2. Nonparametric methods can detect clusters of unequal size and dispersion, or clusters with irregular shapes. If the covariance matrices are unequal or radically non-normal, nonparametric density estimation is often the best approach. And nonparametric methods are less sensitive than most clustering techniques to changes in scale.

title 'Nonparametric Clustering';
proc modeclus data=dataset method=1 r=5.75330 join out=nonpar_results;
var &varlist;
run;


Option R= specifies the radius of the sphere of support for uniform-kernel density estimation and the neighborhood for clustering. Option METHOD= specifies what clustering method to use. For most purposes, METHOD=1 is recommended.

3. Hierarchical methods form the backbone of cluster analysis. The popularity of hierarchical methods is partly due to the fact that they are not subject to the order effect, and some hierarchical methods can even recover irregular clusters directly. Unfortunately, hierarchical methods often require processing time on the order of the square of the number of observations, which limits their use to small and mid-sized data sets.

title2 'Hierarchical Solution (Average)';
proc cluster data=dataset method=average outtree=tree simple rmsstd rsquare;
var &varlist;
copy customer_key;
run;

Option SIMPLE displays simple, descriptive statistics.

The METHOD= specification determines the clustering method used by the procedure. The example uses the average method. Other popular methods include single, centroid, twostage, and ward.

Option RMSSTD displays the pooled standard deviation of all the variables of each cluster. Since the objective of cluster analysis is to form homogeneous groups, the RMSSTD of a cluster should be as small as possible.

Option RSQUARE displays the R-squared and semi-partial R-squared to evaluate the cluster solution. R-squared measures the extent to which groups or clusters are different from each other (so, when you have just one cluster, the R-squared value is, intuitively, zero); thus, the R-squared value should be high. Semi-partial R-squared is the loss of homogeneity due to combining two groups or clusters to form a new group or cluster; thus, the semi-partial R-squared value should be small, implying that we are merging two homogeneous groups.

proc tree data=tree nclusters=5 dock=5 out=results2 noprint;
copy &varlist;
run;

PROC TREE procedure produces a tree diagram, also known as a dendrogram or phenogram, using a data set created by PROC CLUSTER. PROC CLUSTER creates output data sets that contain the results of hierarchical clustering as a tree structure. PROC TREE uses the output data set to produce a diagram of the tree structure.

Option NCLUSTERS= specifies the number of clusters desired in the OUT= data set.

The COPY statement specifies one or more character or numeric variables to be copied to the OUT= data set.

Sample Size Calculation using PROC POWER for proportions under CRD

Q: How to ensure you get the correct sample size?
A: Leverage power analysis to determine sample size.

One of the pivotal aspects of planning an experiment is the calculation of the sample size. It is naturally neither practical nor feasible to study the whole population in any study. Hence, a set of users is selected from the population that is smaller in number (size) but adequately represents the population from which it is drawn, so that true inferences about the population can be made from the results obtained. This set of individuals is known as the "sample" in DOE (design of experiments).


It is a basic statistical principle that we define the sample size before we start a study so as to avoid bias in interpreting results. If we include very few users in a study, the results cannot be generalized to the population, as the sample will not adequately represent the target population; further, the study may then fail to detect a real difference between test groups. On the other hand, if we study more subjects than required, we expose more users than needed and waste precious resources, including the researchers' time.
Generally, the sample size for any study depends on the:
  •  Acceptable level of significance
  •  Power of the study
  •  Expected effect size
  •  Expected variance
Level of Significance
This is the pre-defined threshold for rejecting the null hypothesis. Usually p<0.05 is taken as statistically significant; to put it differently, we accept that 5 out of 100 times we may detect a difference when actually no difference exists (i.e., get a false positive result). This is denoted by the letter α (alpha), the Type I error rate.
Power
Conversely, a Type II error indicates failing to detect a difference when actually there is a difference (i.e., a false negative result). The false negative rate is the proportion of positive instances that were erroneously reported as negative and is referred to in statistics by the letter β (beta). The power of the study is then equal to (1 − β) and is the probability of detecting a difference when there actually is a difference. The power of a study increases as the chances of committing a Type II error decrease. Most studies accept a power of 80%. This means that we are accepting that one in five times (that is, 20%) we will miss a real difference. 
Expected effect size
The effect size is the minimum deviation from the null hypothesis that the hypothesis test is set up to detect. Suppose you are comparing responses between a treatment group and a control group, and the measured metrics are the group means, denoted μ1 and μ2; the expected effect size then equals |μ1 − μ2|. Generally speaking, the smaller the difference you want to detect, the larger the required sample size.
Expected variance 
The variance describes the variability/noise among measurement units. As the variance gets larger, it gets harder to detect a significant difference, so a bigger sample size is required. A pilot study or a similar previous study can help to determine the expected variance.
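Putting these four ingredients together, a common normal-approximation formula for comparing two proportions gives the per-group sample size

$$n \approx \frac{\left(z_{1-\alpha/2} + z_{1-\beta}\right)^2 \left[p_1(1-p_1) + p_2(1-p_2)\right]}{(p_1 - p_2)^2},$$

which is the kind of calculation the PROC POWER code below automates.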

Reference

  1. Montgomery, D. C., Design and Analysis of Experiments (Hoboken, NJ: John Wiley & Sons, 2000).

* Power with One Sample; 
proc power;
oneSampleFreq
/* hypotheses of interest */
nullProportion=0.1
proportion=0.2
sides=1
/* decision rule */
alpha=0.3
/* sample size */
nTotal=10 20 25 30 40 50 100
power=.
;
run;

*normal approximation;
proc power;
oneSampleFreq
/* hypotheses of interest */
nullProportion=0.1
proportion=0.2
sides=1
/* decision rule */
alpha=0.3
/* solve for sample size */
nTotal=.
power=.80
/* different distributional assumptions */
test=z
method=normal
;
run;

* Power with Two Samples;
** SAS;
proc power;
twoSampleFreq
/* hypotheses of interest */
refProportion=0.01
proportionDiff=0.001/*first factor difference*/ .0025/*second factor difference*/
sides=1 2
/* decision rule */
alpha=0.05
/* sample size */
nTotal=.
power=.6 .99
;
plot y=power
yopts=(crossref=YES ref=.8)
vary(color by proportionDiff,
symbol by sides);
run;

proc power;
twoSampleFreq
/* hypotheses of interest */
refProportion=0.1
proportiondiff=0.1
sides=1
/* decision rule */
alpha=0.05
/* sample size */
nTotal=.
power=.6 .8 .99
/* how balanced the test is */
groupWeights=(1 1) (10 15) (1 2) (1 3) (1 10)
;
plot;
run;

** R
install.packages("pwr")
require(pwr)

delta <- 20
sigma <- 60
d <- delta/sigma
pwr.t.test(d=d, sig.level=.05, power = .80, type = 'two.sample')

PROC POWER, PROC GLMPOWER for proportions under CRD

* Balance with Two Factor; 

proc power;
twosamplefreq
refproportion= 0.010
proportiondiff=0.001 0.0025
ntotal=.
power=.8;
run;

data work.a;
input intro $1-4
goto $6-9
responseRate;
variance= responseRate*(1-responseRate);
datalines;
LOW LOW .0135
LOW HIGH .0125
HIGH LOW .011
HIGH HIGH .010
;
run;

proc glmpower data=work.a;
class intro goto;
model responseRate=intro|goto;
power
power=.8
ntotal=.
stddev=%sysfunc(sqrt(.01*.99))
%sysfunc(sqrt(.011*.989));
run;

* Three Two-Level Factors;
* 8 trts;
data work.dem0302;
do a=-1 to 1 by 2;
do b=-1 to 1 by 2;
do c=-1 to 1 by 2;
respRate=.01+.00050*sum(a,b,c);
output;
end;
end;
end;
run;

proc print data=work.dem0302;
run;

proc glmpower data=work.dem0302;
class a b c;
model respRate=a|b|c;
power
power=.8
ntotal=.
stddev=%sysfunc(sqrt(.01*.99));
run;

* Analysis Examples;
data work.ex0302;
informat responseRate percent6.4;
input interestRate $1-4
sticker $6-8
priceGraphic $10-14
responseRate;
datalines;
LOW YES SMALL 1.00%
LOW YES LARGE 1.20%
LOW NO SMALL 0.85%
LOW NO LARGE 1.00%
HIGH YES SMALL 0.80%
HIGH YES LARGE 1.00%
HIGH NO SMALL 0.60%
HIGH NO LARGE 0.75%
;
run;

*N=4,973,040;
proc glmpower data=work.ex0302;
class interestRate Sticker priceGraphic;
model responseRate=interestRate|Sticker|priceGraphic;
power
power=.8
ntotal=.
stddev=%sysfunc(sqrt(.01*.99));
run;

proc logistic data=s.two3;
class interestRate Sticker priceGraphic;
model orders/volume=
interestRate|Sticker|priceGraphic @1
interestRate|Sticker|priceGraphic @2
interestRate|Sticker|priceGraphic;
output out=work.plot p=p l=l u=u;
run; 

proc QLIM, proc LOGISTIC, PROC GENMOD, PROC GLIMMIX to fit Population-averaged model vs. Subject-specific model

The GLM consists of three elements:

1. A probability distribution from the exponential family.
2. A linear predictor η = Xβ.
3. A link function g such that E(Y) = μ = g^(-1)(η).
For a GLM, there are two different modeling approaches: a population-averaged model without mixed effects and a subject-specific model with mixed effects.

The population-averaged approach is focused on modeling the mean response across the population of units at each time point as a function of time. Thus, the model describes how the averages across the population of responses at different time points are related over time.

The subject-specific approach is focused on modeling the individual unit trajectories rather than the mean across all units. This is done by introducing random effects, e.g., a random coefficient model that says each unit has its own intercept and slope.
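As a concrete sketch for a binary outcome measured repeatedly on unit i, the two formulations differ only by a unit-level random effect:

$$\text{population-averaged:}\quad \mathrm{logit}\,\Pr(Y_{ij}=1) = x_{ij}'\beta,$$
$$\text{subject-specific:}\quad \mathrm{logit}\,\Pr(Y_{ij}=1 \mid b_i) = x_{ij}'\beta + b_i, \qquad b_i \sim N(0, \sigma_b^2).$$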

The MLE of beta is the solution of the normal equations, sum_i x_i' inv(Sigma) (y_i - x_i*beta) = 0. It is consistent and asymptotically normal, and its asymptotic variance matrix can be estimated in three ways.
First, the Hessian matrix H = -d^2 l(beta) / (d beta d beta').
Second, the outer-product matrix B = (d l(beta)/d beta)(d l(beta)/d beta').
Third, the robust sandwich estimate H^(-1) B H^(-1).

PROC QLIM computes all three variance estimates.
PROC LOGISTIC computes the Hessian-based variance.
PROC GENMOD and PROC GLIMMIX compute the Hessian-based and robust sandwich variances. 

Proc QLIM to fit Tobit Models

Standard Tobit Model:

yi=yi^star*I(yi^star>0), where yi^star=xi*beta+ei, ei~N(0,sigma^2).
The inverse Mills ratio, also called the selection hazard, is used to take account of a possible selection bias.
IMratio=f(xi*beta/sigma)/Pi(xi*beta/sigma), f is pdf, Pi is cdf of normal distribution. As the probability of censoring increases, the ratio approaches infinity. As the probability of censoring decreases, the ratio approaches 0.

The likelihood function of the Tobit model consists of two parts: the first part is the likelihood function for OLS regression and the second part is the likelihood function for a Probit model. If there were no censoring, the Tobit model would reduce to OLS regression. With censoring, the Tobit model not only predicts the non-censored response values but also estimates the probability that an observation is censored.
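Written out for the standard model above (censoring from below at 0), the likelihood is

$$L(\beta, \sigma) = \prod_{y_i > 0} \frac{1}{\sigma}\,\phi\!\left(\frac{y_i - x_i\beta}{\sigma}\right) \times \prod_{y_i = 0} \left[1 - \Phi\!\left(\frac{x_i\beta}{\sigma}\right)\right],$$

where the first product is the OLS-type (normal density) part for uncensored observations and the second is the Probit-type part for censored observations.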

Probit Model:
Pi^(-1)(pi)=xi*beta.
The probit link function transforms a probability to the standard normal z-score at which the left-tail probability equals that probability: Pi^(-1)(0.5)=0, Pi^(-1)(0.975)=1.96.

PROC QLIM (Qualitative and LImited dependent variable Model) analyzes univariate and multivariate limited dependent variable models, where the dependent variables are observed only in a limited range of values.
Option ENDOGENOUS CENSORED specifies the dependent variable is censored.
Option HETERO specifies the ways of heteroscedasticity of the residuals.
Option ENDOGENOUS TRUNCATED specifies the dependent variable is truncated. In a truncated distribution, the values of the predictor variables are known only when the response variable is observed. This differs from censoring where the predictor variables are known even when the response variable is unknown.

* Tobit Model in PROC QLIM;
** ENDOGENOUS CENSORED (lower bound=0);
proc qlim data=pva;
class gender;
format gender $gender.;
model donation = gender age income_group wealth_rating pep_star months_since_last_gift median_home_value recent_avg_gift_amt recent_card_response_prop card_prom_12 recent_response_count;
endogenous donation ~ censored(lb=0);
output out=selection conditional expected marginal predicted xbeta mills;
title 'Tobit Model in PROC QLIM';
run;

* Tobit Model Corrected for Heteroscedasticity;
**HETERO specified variance function Var(ei);
proc qlim data=pva;
class gender;
format gender $gender.;
model donation= gender age income_group wealth_rating
pep_star months_since_last_gift
median_home_value recent_avg_gift_amt
recent_card_response_prop card_prom_12
recent_response_count;
endogenous donation ~ censored(lb=0);
hetero donation~pep_star months_since_last_gift
recent_avg_gift_amt card_prom_12
recent_response_count;

output out=selection xbeta;
title "Tobit Model Corrected for Heteroscedasticity";
run;

* Tobit Model in PROC QLIM;
** ENDOGENOUS CENSORED (lower bound=500 upper bound=1500);
proc qlim data=censored_aids;
model basecd4 = age cigarettes drug partners depression;
endogenous basecd4 ~ censored(lb=500 ub=1500);
title 'Censored Distribution in PROC QLIM';
run;

* Tobit Model in PROC QLIM;
** ENDOGENOUS CENSORED (lower bound=500 upper bound=1500);
proc qlim data=censored_aids;
model basecd4 = age cigarettes drug partners depression;
endogenous basecd4 ~ censored(lb=500 ub=1500);
hetero basecd4 ~ cigarettes;
output out=selection xbeta;
title 'Censored Distribution in PROC QLIM';
run;

* Tobit Model in PROC QLIM;
**Truncated Regression Model;
proc qlim data=housing;
model median_value = crime_rate large_lots nonretail charles_river nitric_oxide
average_number_rooms percent_lower_status
distance_Boston teacher_pupil_ratio access_highway;
endogenous median_value ~ truncated (ub=25);
output out=selection conditional marginal predicted mills xbeta errstd;
title "Truncated Regression Model for the Boston Housing Data";
run;

* Tobit Model in PROC QLIM;
*Truncated Regression Model Accounting for Heteroscedasticity;
proc qlim data=housing;
model median_value = crime_rate large_lots nonretail charles_river nitric_oxide
average_number_rooms percent_lower_status
distance_Boston teacher_pupil_ratio access_highway;
endogenous median_value ~ truncated (ub=25);
hetero median_value~ percent_lower_status distance_Boston;
output out=selection xbeta;
title "Truncated Regression Model for the Boston Housing Data Accounting for Heteroscedasticity";
run;

Sample selection Models;
In this model, the response variable is only observed when some selection criterion is met. The selection equation has a latent response variable which includes predictor variable, coefficients and an error term.

* Tobit Model in PROC QLIM;
**Sample Selection Model ;
proc qlim data=mroz;
model labor_force = age education experience kidslt6 income / discrete;
model wage = experience experience_sq education marg_tax_rate / select(labor_force=1);
hetero wage ~ experience education;
title "Sample Selection Model of Married Female Wages in Labor Force";
run; 

PROC DISCRIM to fit Linear Discriminant Analysis (LDA) , Quadratic Discriminant Analysis (QDA)

LDA

LDA attempts to express one dependent variable as a linear combination of other features or measurements.

LDA explicitly attempts to model the difference between the classes of data.

LDA for two classes


PROC DISCRIM DATA=iris; 
CLASS Species;
Var X1-XK;
RUN;

PROC DISCRIM DATA=Train TESTDATA=Test TESTOUT=Pred;
CLASS Species;
Var X1-XK;
RUN;
  • The multivariate normal distribution assumption holds for the response variables. This means that each of the dependent variables is normally distributed within groups, that any linear combination of the dependent variables is normally distributed, and that all subsets of the variables must be multivariate normal. 
  • Each group must have a sufficiently large number of cases.
  • Different classification methods may be used depending on whether the variance-covariance matrices are equal (or very similar) across groups.

Difference btw LDA and logistic regression

  • LDA operates by maximizing the log-likelihood based on an assumption of normality and homogeneity 
  • Logistic regression makes no assumption about Pr(X), and estimates the parameters of Pr(G|x) by maximizing the conditional likelihood
  • Intuitively, it would seem that if the distribution of x is indeed multivariate normal, then we will be able to estimate our coefficients more efficiently by making use of that information by using LDA.
  • On the other hand, logistic regression would presumably be more robust if LDA’s distributional assumptions are violated

QDA

QDA assumes that the measurements from each class are normally distributed. When the normality assumption is true, the best possible test for the hypothesis that a given measurement is from a given class is the likelihood ratio test.
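For reference, under these assumptions the classification rules reduce to comparing discriminant scores and assigning x to the class with the largest score:

$$\text{LDA (common } \Sigma\text{):}\quad \delta_k(x) = x'\Sigma^{-1}\mu_k - \tfrac{1}{2}\mu_k'\Sigma^{-1}\mu_k + \log \pi_k,$$
$$\text{QDA (class-specific } \Sigma_k\text{):}\quad \delta_k(x) = -\tfrac{1}{2}\log|\Sigma_k| - \tfrac{1}{2}(x-\mu_k)'\Sigma_k^{-1}(x-\mu_k) + \log \pi_k.$$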


Proc LOGISTIC, PROC GENMOD, PROC GLIMMIX to fit Binary Logit, Cumulative Logit and Cumulative Probit Models

Cumulative Logit Models

Suppose there are J levels of the outcome Yi, j=1..J. The cumulative probabilities of Yi, pj(xi)=Pr(Yi<=j|xi), reflect the ordering, with p1(xi)<=p2(xi)<=...<=pJ(xi)=1.


Proc logistic, genmod and glimmix with the option link=cumlogit will fit the cumulative logit model s.t. log(pj(xi)/(1-pj(xi))) = aj + xi.beta. The parameter beta describes the effect of a covariate on the log odds of response in category j or below.

proc logistic data=test;
class vn0476_M(ref='female') vn0435_34(ref='rent') vn0455_1(ref='unmarried')/param=ref;
model product_pref=&numeric &norminal/link=cumlogit;
format vn0476_M gender. vn0435_34 rent. vn0455_1 marital.;
run;

Cumulative Probit Models

Proc logistic, genmod and glimmix with the option link=cumprobit will fit the cumulative probit model s.t. Pi^(-1)(pj(xi)) = aj + xi.beta.

proc logistic data=test;
class vn0476_M(ref='female') vn0435_34(ref='rent') vn0455_1(ref='unmarried')/param=ref;
model product_pref=&numeric &norminal/link=cumprobit;
format vn0476_M gender. vn0435_34 rent. vn0455_1 marital.;
run;

Cumulative logit and cumulative probit models assume the effect of a covariate is identical for all J-1 cumulative logits, which is called the proportional odds property.

PROC QLIM to fit Ordered Logit Models and Ordered Probit Models

Suppose there are thresholds uj, j=0..J, such that -inf=u0<u1<...<uJ=inf. The observed outcome is a categorization of a latent variable Yi^star=xi.beta+ei s.t. Yi=j iff u_(j-1)<Yi^star<=uj. The probability of response is pj(xi)=Pr(Yi=j|xi)=F(uj-xi.beta)-F(u_(j-1)-xi.beta).

In the ordered logit model, log(pj(xi)/(1-pj(xi))) = uj - xi.beta. The parameter beta describes the effect of a covariate on the log odds of response in category j or below, or equivalently the marginal effect of the covariate on E(Yi^star|xi).
In the ordered probit model, Pi^(-1)(pj(xi)) = uj - xi.beta.

*Ordered Logit;
proc logistic data=test desc;
class vn0476_M(ref='female') vn0435_34(ref='rent') vn0455_1(ref='unmarried')/param=ref;
model product_pref=&numeric &norminal;
format vn0476_M gender. vn0435_34 rent. vn0455_1 marital.;
run;

Proc qlim fits the ordered logit and ordered probit models. It uses the latent variable formulation. By default, an intercept is included in beta and the first threshold parameter u1=0. The model option limit1=varying overrides the default.

proc qlim data=test covest=qml;
class vn0476_M vn0435_34 vn0455_1;
endogenous product_pref ~ discrete (dist=logistic order=formatted);
*hetero product_pref ~&numeric;
model product_pref=&numeric &norminal/limit1=varying;
format vn0476_M gender. vn0435_34 rent. vn0455_1 marital.;
run;

PROC QLIM fits the equivalent homoscedastic ordered logit model; however, compared with PROC LOGISTIC, the signs of the covariate coefficients are reversed because of the uj - xi.beta parameterization.

Ordered logit and ordered probit models assume the effect of a covariate is identical for all J-1 cumulative logits, which is called the proportional odds property.
