Saturday, February 4, 2012

SQL to Compute User Age


drop table tmptbl;
create table lh_tmptbl as
select aa.userhashkey, min(aa.datekey) as firstdt
from factevent aa
where aa.userhashkey % 100 = 0 and aa.eventtypeid = 1 and aa.uac = 1 and aa.datekey <= 20130728
group by 1
having min(aa.datekey)>=20130522 and min(aa.datekey)<=20130628;

drop table tmp2;
create table tmp2 as
select  tbl1.firstdt,
        TO_DATE(cast(tbl1.datekey as VARCHAR), 'YYYYMMDD')-  TO_DATE(cast(tbl1.firstdt as VARCHAR), 'YYYYMMDD') as age,
        count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1,2
;

drop table tmp3;
create table tmp3 as
select tbl1.firstdt,
count(distinct userhashkey) as uusers,
count(*) as imps
from (
select b.firstdt, a.datekey, a.userhashkey, a.eventkey
from factevent a,
     lh_tmptbl b
where a.userhashkey = b.userhashkey and a.eventtypeid = 1 and a.uac = 1
and a.datekey <= 20130728
) tbl1
group by 1;

-- R for triangle output;
sql <- paste("select a.*, b.uusers, b.imps as totimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             order by a.firstdt, a.age;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
data$age <- as.integer(data$age)
head(data)
x <- dcast(data, firstdt+uusers+totimp ~ age, value.var = "imps", fill = 0);
head(x)
write.csv(x, "totimp.csv")

#break down by weeks;
sql <- paste("select a.firstdt, b.uusers,
             (case when a.age = 0 then 1
                  when a.age >=1 and a.age <=7 then 2
                  when a.age >=8 and a.age <=30 then 3
                  when a.age >=31 then 4 end) as agegrp,
             sum(a.imps) as grpimp
             from tmp2 a,
                  tmp3 b
             where a.firstdt = b.firstdt
             and a.firstdt < 20130729
             group by 1,2,3
             order by 1,2,3;" , sep ='');
data <- sqlQuery(con, sql, as.is = T, max =0 )
head(data)
x <- dcast(data, firstdt+uusers ~ agegrp, value.var = "grpimp", fill = 0);
head(x)
write.csv(x, "totimp2.csv")

-- 30-30-30 rules
create table userfiltering as
select userhashkey as userid, sectionid, datekey
from database1 join database2 using(sectionkey)
where userhashkey % 1000 = 0 and userhashkey <> 0;

create table userfiltering2 as
select *,
row_number() over(partition by userid order by userid, datekey, sectionid) user_cumcnt,
row_number() over(partition by userid, sectionid order by userid, datekey, sectionid) user_fact_cumcnt,
row_number() over(partition by userid, datekey order by userid, datekey, sectionid) user_dt_cumcnt,
row_number() over(partition by userid, datekey, sectionid order by userid, datekey, sectionid) user_fact_dt_cumcnt
from userfiltering
order by userid, datekey, sectionid;
select min(datekey), max(datekey)
from userfiltering2;

--cutoff1=20111128, cutoff2=20111229, gaps=31;
create table userfiltering3 as
select userid,
min(datekey) as first_dt,
max(datekey) as last_dt,
to_date(max(datekey),'YYYYMMDD') - to_date(min(datekey),'YYYYMMDD')as gaps,
(case when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=0  then 1
 when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>0 and (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=31 then 2
 when (to_date(min(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>31  then 3 end) as index1,
(case when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=0  then
when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>0 and (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))<=31 then 2
when (to_date(max(datekey),'YYYYMMDD') - to_date(20111128,'YYYYMMDD'))>31  then 3 end) as index2
from userfiltering2
group by userid
order by userid;

--before 30-30-30 rules;
select count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3;

--apply 30-30-30 rules;
select count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3
where index1=2 and index2=2;

--break down by first-dt cohort;
select
first_dt,
count(*),
sum(case when gaps>=0 then 1 else 0 end)/count(*) as duration0,
sum(case when gaps>=1 then 1 else 0 end)/count(*) as duration1,
sum(case when gaps>=3 then 1 else 0 end)/count(*) as duration3,
sum(case when gaps>=7 then 1 else 0 end)/count(*) as duration7,
sum(case when gaps>=14 then 1 else 0 end)/count(*) as duration14,
sum(case when gaps>=30 then 1 else 0 end)/count(*) as duration30,
sum(case when gaps>=60 then 1 else 0 end)/count(*) as duration60
from userfiltering3
where index1=2 and index2=2
group by 1
order by 1;

No comments:

Post a Comment