Monday, January 16, 2012

SQL to Self Join

-- Mail Solicitation;
-- create features;
drop table ANALYTICS_STG..CAMPAIGN_ADDRESS_3;
-- 2,038,831,646 rows affected
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_3 as
select a.*,
row_number() over(partition by a.EMAIL_ADDRESS_ID order by a.EMAIL_ADDRESS_ID, a.SEND_TS) seq_freq,
extract(DAY from a.SEND_TS - LAG(a.SEND_TS, 1) OVER (partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEND_TS)) as recency,
b.reach_freq as tot_freq
from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 a
join (select EMAIL_ADDRESS_ID, count(*) as reach_freq from ANALYTICS_STG..CAMPAIGN_ADDRESS_2 group by 1) b
on a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID;


-- Mail Response;
-- Dedupe response;
create table ANALYTICS_STG..RESPONSE_ADDRESS_2 as
select a.*,
case when upper(a.FEEDBACK_EVENT_CD)='OPEN' then 1 else 0 end FEEDBACK_EVENT_CD_1,
case when upper(a.FEEDBACK_EVENT_CD)='CLICK' then 1 else 0 end FEEDBACK_EVENT_CD_2,
case when upper(a.FEEDBACK_EVENT_CD)='UNDEL' then 1 else 0 end FEEDBACK_EVENT_CD_3,
case when upper(a.FEEDBACK_EVENT_CD)='CONVERSION' then 1 else 0 end FEEDBACK_EVENT_CD_4,
case when upper(a.FEEDBACK_EVENT_CD)='UNSUB' then 1 else 0 end FEEDBACK_EVENT_CD_5,
case when upper(a.FEEDBACK_EVENT_CD)='SPAM' then 1 else 0 end FEEDBACK_EVENT_CD_6
from (
select *, row_number()
over (partition by CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS
order by  CAMPAIGN_ID, ELECTRONIC_ADDRESS_ID, CONCEPT_ID, FEEDBACK_EVENT_CD, MAIL_TS, FEEDBACK_EVENT_TS
) row_id
from ANALYTICS_STG..RESPONSE_ADDRESS_1
) a
where a.row_id = 1;

-- create response features
create table ANALYTICS_STG..RESPONSE_ADDRESS_3 as
select a.*,
row_number() over (partition by a.ELECTRONIC_ADDRESS_ID order by a.ELECTRONIC_ADDRESS_ID, a.MAIL_TS, a.FEEDBACK_EVENT_TS) seq_engagement,
b.tot_engagement,
b.tot_open,
b.tot_click,
b.tot_unsub,
extract(DAY from (a.FEEDBACK_EVENT_TS - a.MAIL_TS)) as feedback_day
from ANALYTICS_STG..RESPONSE_ADDRESS_2 a
inner join (select ELECTRONIC_ADDRESS_ID,
count(*) as tot_engagement,
sum(FEEDBACK_EVENT_CD_1) as tot_open,
sum(FEEDBACK_EVENT_CD_2) as tot_click,
sum(FEEDBACK_EVENT_CD_5) as tot_unsub
from ANALYTICS_STG..RESPONSE_ADDRESS_2
where FEEDBACK_EVENT_CD_1 = 1 or FEEDBACK_EVENT_CD_2 = 1 or FEEDBACK_EVENT_CD_5 = 1
group by 1
) b
on a.ELECTRONIC_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
where a.FEEDBACK_EVENT_CD_1 = 1 or a.FEEDBACK_EVENT_CD_2 = 1 or a.FEEDBACK_EVENT_CD_5 = 1;

-- Join solicitations and responses
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 as
select a.*,
b.FEEDBACK_EVENT_TS, 
b.FEEDBACK_EVENT_CD, 
b.FEEDBACK_EVENT_CD_1,
b.FEEDBACK_EVENT_CD_2,
b.FEEDBACK_EVENT_CD_5,
b.SEQ_ENGAGEMENT,
b.TOT_CLICK,
b.TOT_OPEN,
b.TOT_ENGAGEMENT,
b.TOT_UNSUB,
b.FEEDBACK_DAY
from  ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
inner join ANALYTICS_STG..RESPONSE_ADDRESS_3 b
on a.CAMPAIGN_ID = b.CAMPAIGN_ID
and a.CONCEPT_ID = b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.ELECTRONIC_ADDRESS_ID
and a.SEND_TS = b.MAIL_TS
;

-- Last attributions;
create table ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_2 as
select a.*,
b.FEEDBACK_EVENT_TS, 
b.FEEDBACK_EVENT_CD, 
b.FEEDBACK_EVENT_CD_1, 
b.FEEDBACK_EVENT_CD_2, 
b.FEEDBACK_EVENT_CD_5, 
b.TOT_ENGAGEMENT, 
b.SEQ_ENGAGEMENT, 
b.TOT_CLICK, 
b.TOT_OPEN, 
b.TOT_UNSUB,
b.FEEDBACK_DAY,
SUM(case when b.FEEDBACK_EVENT_CD_1=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_OPEN_PERC,
SUM(case when b.FEEDBACK_EVENT_CD_2=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_CLICK_PERC,
SUM(case when b.FEEDBACK_EVENT_CD_5=1 then 1 else 0 end) OVER(partition by a.EMAIL_ADDRESS_ID ORDER BY a.EMAIL_ADDRESS_ID, a.SEQ_FREQ, b.SEQ_ENGAGEMENT rows unbounded preceding)/a.SEQ_FREQ CUM_UNSUB_PERC
from ANALYTICS_STG..CAMPAIGN_ADDRESS_3 a
left join ANALYTICS_STG..CAMPAIGN_ADDRESS_RESPONSE_1 b
on a.CAMPAIGN_ID=b.CAMPAIGN_ID
and a.CONCEPT_ID=b.CONCEPT_ID
and a.EMAIL_ADDRESS_ID = b.EMAIL_ADDRESS_ID
and a.SEQ_FREQ = b.SEQ_FREQ

;

SQL to Batchloading

sn <- "*********"
sql.con <- odbcConnect(dsn)
sql.db <- "*********"
advertiserid <- "4009"

rtg.facts <- unlist(sqlQuery(sql.con, paste("select distinct t.FactId
      from ", sql.db, ".dbo.Segment s with (nolock)
                                            inner join ", sql.db, ".dbo.SegmentTaxonomy st with (nolock)
                                            on s.SegmentId = st.SegmentId
                                            inner join ", sql.db, ".dbo.Taxonomy t with (nolock)
                                            on st.TaxonomyId = t.TaxonomyId
                                            where s.AdvertiserId = ", advertiserid, sep = "")))
str <- paste(rtg.facts, collapse= ",")
pfacts <- sqlQuery(sql.con, paste("exec ", sql.db, ".dbo**** ", advertiserid, sep = ""))
odbcClose(sql.con)

#save(str, file="str.rda",ascii=FALSE)
load(file = "str.rda")
load(file = "pfacts.rda")

pfacts <- data.frame(factid = as.integer(pfacts[,1]))
dsn <- "********"
con <- odbcConnect(dsn)
#sqlSave(con, pfacts1, tablename="Emily", rownames=FALSE, colnames= FALSE, append=TRUE, varTypes=varTypes)
odbcClose(con)

write.table(coef, file = "ling_coef.csv", row.names = F, col.names = F, sep = ",")

system("cd .\\SVM");
system("winscp.bat");

sqlQuery(con, "drop table lh_coef")
sqlQuery(con, "CREATE TABLE lh_coef (
                    Distance double precision,
                    center double precision,
                    scale double precision,
                    FactId int distkey
                    )
                    sortkey(factid)")
sqlQuery(con, "copy lh_coef from \'/home/coef.csv\' with delimiter ','")

SQL to Merge Imps and Convs


select count(*) from lh_imps;
select * from lh_imps limit 10;
--drop table lh_imps;
create table lh_imps as
select a.userhashkey, a.datekey,  a.revenue,
       c.adid, d.sectionid,
       to_timestamp(a.datekey * 1000000.0 + a.timekey - 1000000, 'YYYYMMDDHHMISS') as impdatetime
from databasefactevent a
join databasedimcampaign b using(campaignkey)
join databasedimad c using(adkey)
join databasedimsection d using(sectionkey)
where a.eventtypeid = 1
and b.campaignid = 57464
and a.datekey between 20120901 and 20120930
order by a.userhashkey, impdatetime;


select count(*) from lh_imps2;
select * from lh_imps2 limit 10;
--drop table lh_imps2;
create table lh_imps2 as
select a.*,
       row_number() over(partition by userhashkey order by userhashkey, impdatetime, adid, sectionid) frequency,
       sum(revenue) over(partition by userhashkey order by userhashkey, impdatetime, adid, sectionid) cum_revenue
from lh_imps a;

select count(*) from lh_conv;
select * from lh_conv limit 10;
--drop table lh_conv;
create table lh_conv as
select a.eventkey, a.datekey as convdate, a.userhashkey, c.adid, d.sectionid,
       to_timestamp(a.datekey * 1000000.0 + a.timekey - 1000000, 'YYYYMMDDHHMISS') as convdatetime
from databasefactconversion a
join databasedimcampaign b using(campaignkey)
join databasedimad c using(adkey)
join databasedimsection d using(sectionkey)
where b.campaignid = 57464
and a.datekey between 20120901 and 20120930;

select count(*) from lh_conv2;
select * from lh_conv2 limit 10;
--drop table lh_conv2;
create table lh_conv2 as
select b.eventkey, b.userhashkey, b.adid, b.sectionid, b.convdate,  b.convdatetime,
       max(a.impdatetime) as lastImpTime,
       max(a.datekey) as lastImpdatekey,
       max(a.frequency) as lastImpfrequency
from lh_imps2 a
join lh_conv b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid
where a.impdatetime < b.convdatetime
group by 1, 2, 3, 4, 5, 6;

select count(*) from lh_conv3;
select count(*) from lh_conv3 where converted = 1;
select * from lh_conv3 where converted = 1 limit 10;
--drop table lh_conv3;
create table lh_conv3 as
select a.*,
       (case when b.userhashkey is not null then 1 else 0 end ) converted,
       (case when b.userhashkey is not null then b.convdate - a.datekey  else -1 end ) days2conv
from lh_imps2 a
left join lh_conv2 b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid and a.impdatetime = b.lastImpTime;

select userhashkey, adid, sectionid, impdatetime, count(*)
from lh_conv3
where converted = 1
group by 1,2,3,4
having count(*)>2;

select count(*) from lh_conv4;
select count(*) from lh_conv4 where converted = 1;
select * from lh_conv4 where converted = 1 limit 10;
select * from lh_conv4 where converted = 0 limit 10;
--drop table lh_conv4;
create table lh_conv4 as
select a.*,
       (case when b.userhashkey is not null then 1 else 0 end ) converted,
       (case when b.userhashkey is not null then b.convdate - a.datekey  else -1 end ) days2conv
from lh_imps2 a
left join (
select userhashkey, adid, sectionid, max(convdate) as convdate,
       max(convdatetime) as convdatetime,
       max(lastImpTime) as lastImpTime,
       max(lastImpdatekey) as lastImpdatekey
from lh_conv2
group by 1,2,3
) b
on a.userhashkey = b.userhashkey and a.adid = b.adid and a.sectionid = b.sectionid and a.impdatetime = b.lastImpTime;