Sunday, November 29, 2015

Similarity Calculation 4 - Naive Bayes Using SQL and SAS

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;


-- Calculate counts/prob for positive;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
sum(DM_RECENTOPT_OPT_BABYUS)DM_RECENTOPT_OPT_BABYUS,
sum(DM_RECENTOPT_OPT_BBBUS)DM_RECENTOPT_OPT_BBBUS,
sum(DM_RECENTOPT_OPT_CTSUS)DM_RECENTOPT_OPT_CTSUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1;


-- Caculate counts/prob for negative;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
sum(DM_RECENTOPT_OPT_BABYUS)DM_RECENTOPT_OPT_BABYUS,
sum(DM_RECENTOPT_OPT_BBBUS)DM_RECENTOPT_OPT_BBBUS,
sum(DM_RECENTOPT_OPT_CTSUS)DM_RECENTOPT_OPT_CTSUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 0;


-- Drop a probability of zero which causes divide by zero;
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql noprint;
select _name_ into :varlist separated by ' '
from meanout where col1>0.0;
quit;
%put &varlist;


/*Calculate probabilities for negative*/
proc sql;
create table neg as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg
;
)
;
quit;

proc means data=neg noprint;
var &varlist;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>0.0;
quit;


-- Caculate conditional prob ratio for positive;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
(a.UNIV_BBBUS/4856590.0)/(b.UNIV_BBBUS/33853878.0) UNIV_BBBUS,
(a.UNIV_BABYUS/4856590.0)/(b.UNIV_BABYUS/33853878.0) UNIV_BABYUS,
(a.UNIV_CTSUS/4856590.0)/(b.UNIV_CTSUS/33853878.0) UNIV_CTSUS,
(a.UNIV_HRMUS/4856590.0)/(b.UNIV_HRMUS/33853878.0) UNIV_HRMUS,
(a.UNIV_BBBCA/4856590.0)/(b.UNIV_BBBCA/33853878.0) UNIV_BBBCA,
(a.UNIV_BBBMX/4856590.0)/(b.UNIV_BBBMX/33853878.0) UNIV_BBBMX,
(a.UNIV_UNIV_BABYCA/4856590.0)/(b.UNIV_UNIV_BABYCA/33853878.0) UNIV_UNIV_BABYCA,
(a.DM_RECENTOPT_OPT_BABYUS/4856590.0)/(b.DM_RECENTOPT_OPT_BABYUS/33853878.0) DM_RECENTOPT_OPT_BABYUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg b;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg;
-- Caculate conditional prob ratio for negative;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg as
select
(1-a.UNIV_BBBUS/4856590.0)/(1-b.UNIV_BBBUS/33853878.0) UNIV_BBBUS,
(1-a.UNIV_BABYUS/4856590.0)/(1-b.UNIV_BABYUS/33853878.0) UNIV_BABYUS,
(1-a.UNIV_CTSUS/4856590.0)/(1-b.UNIV_CTSUS/33853878.0) UNIV_CTSUS,
(1-a.UNIV_HRMUS/4856590.0)/(1-b.UNIV_HRMUS/33853878.0) UNIV_HRMUS,
(1-a.UNIV_BBBCA/4856590.0)/(1-b.UNIV_BBBCA/33853878.0) UNIV_BBBCA,
(1-a.UNIV_BBBMX/4856590.0)/(1-b.UNIV_BBBMX/33853878.0) UNIV_BBBMX,
(1-a.UNIV_UNIV_BABYCA/4856590.0)/(1-b.UNIV_UNIV_BABYCA/33853878.0) UNIV_UNIV_BABYCA,
(1-a.DM_RECENTOPT_OPT_BABYUS/4856590.0)/(1-b.DM_RECENTOPT_OPT_BABYUS/33853878.0) DM_RECENTOPT_OPT_BABYUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg b;


create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_NB as
select a.*,
a.UNIV_BBBUS*b.UNIV_BBBUS+(1-a.UNIV_BBBUS)*c.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+(1-a.UNIV_BABYUS)*c.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+(1-a.UNIV_CTSUS)*c.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+(1-a.UNIV_HRMUS)*c.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+(1-a.UNIV_BBBCA)*c.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+(1-a.UNIV_BBBMX)*c.UNIV_BBBMX+
a.UNIV_UNIV_BABYCA*b.UNIV_UNIV_BABYCA+(1-a.UNIV_UNIV_BABYCA)*c.UNIV_UNIV_BABYCA+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651+(1-a.FST_STR_SHOP_NBR_651)*c.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg c
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_NB
) a
group by 1
order by 1
;

No comments:

Post a Comment

Blog Archive