Wednesday, February 8, 2012

Using lme() to Estimate Mixed Effects Models in R

In common marketing usage, a hierarchical model estimates both group-level effects and individual differences in those effects. Such models are popular in marketing because they provide insight into differences among customers (heterogeneity) and the distribution of preferences. Hierarchical linear models (HLM) are exemplified when we estimate the importance of effects for individual respondents as well as for the overall population.

Effects that are associated with all observations are known as fixed effects, and those that differ across various grouping levels are known as random effects. 

These models are also known as mixed effects models, because the total effect for each person is composed of the effect for the overall population (the fixed effect) plus the per-individual deviation (the random effect).

The difference between estimating hierarchical effects and simply including the grouping variable as a factor in a standard linear model is that a hierarchical model estimates every specified effect for each individual or group, not just a single adjustment term.
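
As a quick illustration (a minimal sketch using the Orthodont data that ships with nlme, not part of the original post), the per-subject coefficients returned by coef() are the fixed effects plus each subject's random effects:

library(nlme)
m <- lme(distance ~ age, random = ~ 1 | Subject, data = Orthodont)
fixef(m)        # one set of population-level (fixed) coefficients
head(ranef(m))  # per-subject deviations (random intercepts)
head(coef(m))   # per-subject totals: fixed effect plus random effect
# the intercept column of coef() equals the fixed intercept plus each random intercept
all.equal(coef(m)[, "(Intercept)"],
          fixef(m)[["(Intercept)"]] + ranef(m)[, "(Intercept)"])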

In lme4's lmer(), the formula for a mixed effects model includes a grouping term of the form + (... | group). Common models allow a different intercept by group using (1 | group), or different intercepts and slopes for predictors within each group using (predictor | group). In nlme's lme(), used below, the same grouping is specified through the random argument, for example random = ~ 1 | group. To estimate an individual-level model, the grouping term is typically the respondent identifier.
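
For example, with lme4's built-in sleepstudy data (my illustration, not from the original post), the two common specifications look like this:

library(lme4)
data(sleepstudy)
# random intercept only: each subject gets its own baseline reaction time
m1 <- lmer(Reaction ~ Days + (1 | Subject), data = sleepstudy)
# random intercept and slope: each subject also gets its own effect of Days
m2 <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
# nlme equivalents of the two models above:
#   lme(Reaction ~ Days, random = ~ 1 | Subject, data = sleepstudy)
#   lme(Reaction ~ Days, random = ~ Days | Subject, data = sleepstudy)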


Hierarchical models can also group observations at levels other than the individual. For example, we might group by store, advertising campaign, salesperson, or some other factor if we want to estimate effects that are specific to such a grouping; a small sketch follows.
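
Here is a minimal sketch with simulated store-level data (the store, promo, and sales variables are hypothetical names of my own choosing):

library(lme4)
set.seed(2)
# hypothetical data: 12 weekly observations of sales for each of 20 stores
stores <- data.frame(store = factor(rep(1:20, each = 12)),
                     promo = runif(240, 0, 1))
stores$sales <- 100 + 30 * stores$promo +
  rep(rnorm(20, 0, 10), each = 12) + rnorm(240, 0, 5)
# the store identifier replaces the respondent identifier as the grouping term
m.store <- lmer(sales ~ promo + (1 | store), data = stores)
fixef(m.store)              # overall promotion effect
head(ranef(m.store)$store)  # store-specific intercept adjustments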


Hierarchical models in marketing are often estimated with Bayesian methods that are able to pool information and produce best estimates of both group and individual effects using potentially sparse data. 

When estimated this way, model coefficients from a hierarchical model are inspected using summaries of the many posterior draws that are collected in an mcmc object.
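
One way to do this (a sketch of my own, not the original post's code) is with the MCMCglmm package, whose posterior draws are stored as coda mcmc objects; the data below are simulated and the variable names are hypothetical:

library(MCMCglmm)
set.seed(1)
# hypothetical respondent-level data: 50 respondents, 8 ratings each
d <- data.frame(resp.id = factor(rep(1:50, each = 8)),
                price   = rep(1:8, times = 50))
d$rating <- 3 + 0.5 * d$price + rep(rnorm(50, 0, 1), each = 8) + rnorm(400, 0, 0.5)
# Bayesian hierarchical model: fixed effect of price, random respondent intercepts
m <- MCMCglmm(rating ~ price, random = ~ resp.id, data = d,
              nitt = 5000, burnin = 1000, thin = 5, verbose = FALSE, pr = TRUE)
summary(m$Sol)  # mcmc object of fixed (and, with pr = TRUE, random) effect draws
summary(m$VCV)  # mcmc object of the variance components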

# Frequentist estimation of a random-intercept (null) model with nlme's lme()
library(nlme)
model1 <- lme(mathach ~ 1, random = ~ 1 | id, data = hsb)  # per-group random intercepts
summary(model1)
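
Assuming the hsb data frame (presumably the High School and Beyond data) is loaded as above, a common follow-up is to compute the intraclass correlation from the null model's variance components; a sketch:

vc <- VarCorr(model1)
between <- as.numeric(vc["(Intercept)", "Variance"])  # between-group variance
within  <- as.numeric(vc["Residual", "Variance"])     # residual (within-group) variance
icc <- between / (between + within)  # share of variance in mathach that lies between groups
icc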

Saturday, February 4, 2012

SQL to Compute User Age


drop table tmptbl;
-- 1% user sample; firstdt = each user's first impression date within the cohort window
create table tmptbl as
select aa.userhashkey, min(aa.datekey) as firstdt
from factevent aa
where aa.userhashkey % 100 = 0 and aa.eventtypeid = 1 and aa.uac = 1 and aa.datekey <= 20130728
group by 1
having min(aa.datekey) >= 20130522 and min(aa.datekey) <= 20130628;

drop table tmp2;
create table tmp2 as
select  tbl1.firstdt,
        -- age = days elapsed since the user's first impression date
        TO_DATE(cast(tbl1.datekey as VARCHAR), 'YYYYMMDD') - TO_DATE(cast(tbl1.firstdt as VARCHAR), 'YYYYMMDD') as age,
        count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1,2
;

drop table tmp3;
create table tmp3 as
select tbl1.firstdt,
count(distinct userhashkey) as uusers,
count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1;

# R for the triangle output (assumes an existing ODBC connection 'con')
library(RODBC)     # sqlQuery
library(reshape2)  # dcast
sql <- paste("select a.*, b.uusers, b.imps as totimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             order by a.firstdt, a.age;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
data$age <- as.integer(data$age)
head(data)
x <- dcast(data, firstdt+uusers+totimp ~ age, value.var = "imps", fill = 0);
head(x)
write.csv(x, "totimp.csv")

# break down into age groups (day 0, days 1-7, days 8-30, days 31+)
sql <- paste("select a.firstdt, b.uusers,
             (case when a.age = 0 then 1
                  when a.age >=1 and a.age <=7 then 2
                  when a.age >=8 and a.age <=30 then 3
                  when a.age >=31 then 4 end) as agegrp,
             sum(a.imps) as grpimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             and a.firstdt < 20130729
             group by 1,2,3
             order by 1,2,3;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
head(data)
x <- dcast(data, firstdt+uusers ~ agegrp, value.var = "grpimp", fill = 0);
head(x)
write.csv(x, "totimp2.csv")

-- 30-30-30 rules
create table userfiltering as
select userhashkey as userid, sectionid, datekey
from database1 join database2 using(sectionkey)
where userhashkey % 1000 = 0 and userhashkey <> 0;

create table userfiltering2 as
select *,
row_number() over(partition by userid order by userid, datekey, sectionid) user_cumcnt,
row_number() over(partition by userid, sectionid order by userid, datekey, sectionid) user_fact_cumcnt,
row_number() over(partition by userid, datekey order by userid, datekey, sectionid) user_dt_cumcnt,
row_number() over(partition by userid, datekey, sectionid order by userid, datekey, sectionid) user_fact_dt_cumcnt
from userfiltering
order by userid, datekey, sectionid;
select min(datekey), max(datekey)
from userfiltering2;

--cutoff1=20111128, cutoff2=20111229, gaps=31;
create table userfiltering3 as
select userid,
min(datekey) as first_dt,
max(datekey) as last_dt,
to_date(cast(max(datekey) as VARCHAR),'YYYYMMDD') - to_date(cast(min(datekey) as VARCHAR),'YYYYMMDD') as gaps,
(case when (to_date(cast(min(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))<=0 then 1
 when (to_date(cast(min(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))>0 and (to_date(cast(min(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))<=31 then 2
 when (to_date(cast(min(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))>31 then 3 end) as index1,
(case when (to_date(cast(max(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))<=0 then 1
 when (to_date(cast(max(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))>0 and (to_date(cast(max(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))<=31 then 2
 when (to_date(cast(max(datekey) as VARCHAR),'YYYYMMDD') - to_date('20111128','YYYYMMDD'))>31 then 3 end) as index2
from userfiltering2
group by userid
order by userid;

--before 30-30-30 rules;
select count(*),
-- multiply by 1.0 so the ratios are not truncated by integer division
sum(case when gaps>=0 then 1 else 0 end)*1.0/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)*1.0/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)*1.0/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)*1.0/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)*1.0/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)*1.0/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)*1.0/count(*) as duration60
from userfiltering3;

--apply 30-30-30 rules;
select count(*),
sum(case when gaps>=0 then 1 else 0 end)*1.0/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)*1.0/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)*1.0/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)*1.0/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)*1.0/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)*1.0/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)*1.0/count(*) as duration60
from userfiltering3
where index1=2 and index2=2;

--break down by first-dt cohort;
select
first_dt,
count(*),
sum(case when gaps>=0 then 1 else 0 end)*1.0/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)*1.0/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)*1.0/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)*1.0/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)*1.0/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)*1.0/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)*1.0/count(*) as duration60
from userfiltering3
where index1=2 and index2=2
group by 1
order by 1;