Sunday, November 29, 2015

Similarity Calculation 5 - Mutual Information Using SQL and SAS

I(X,Y) = H(X,Y) - H(X|Y) - H(Y|X) 
Intuitively, we can interpret the MI between X and Y as the reduction in uncertainty about X about observing Y, or by symmetry, the reduction in uncertainty about Y after observing X.


-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;


-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>0.0;
quit;


-- 2 Calculate entropy for marginal attributes;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select 
-(UNIV_BBBUS/38710468)*ln(UNIV_BBBUS/38710468)-(1-UNIV_BBBUS/38710468)*ln(1-UNIV_BBBUS/38710468)UNIV_BBBUS,
-(UNIV_BABYUS/38710468)*ln(UNIV_BABYUS/38710468)-(1-UNIV_BABYUS/38710468)*ln(1-UNIV_BABYUS/38710468)UNIV_BABYUS,
-(UNIV_CTSUS/38710468)*ln(UNIV_CTSUS/38710468)-(1-UNIV_CTSUS/38710468)*ln(1-UNIV_CTSUS/38710468)UNIV_CTSUS,
-(UNIV_HRMUS/38710468)*ln(UNIV_HRMUS/38710468)-(1-UNIV_HRMUS/38710468)*ln(1-UNIV_HRMUS/38710468)UNIV_HRMUS,
-(UNIV_BBBCA/38710468)*ln(UNIV_BBBCA/38710468)-(1-UNIV_BBBCA/38710468)*ln(1-UNIV_BBBCA/38710468)UNIV_BBBCA,
-(UNIV_BBBMX/38710468)*ln(UNIV_BBBMX/38710468)-(1-UNIV_BBBMX/38710468)*ln(1-UNIV_BBBMX/38710468)UNIV_BBBMX,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 3 Caculate conditional prob for y=1;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1.0;
quit;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
-- 4 Caculate conditional prob for y=1;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select 
-(POSTAL_ZIP4_85205_7911/4856590)*ln(POSTAL_ZIP4_85205_7911/4856590)-(1-POSTAL_ZIP4_85205_7911/4856590)*ln(1-POSTAL_ZIP4_85205_7911/4856590)POSTAL_ZIP4_85205_7911,
-(POSTAL_ZIP4_90403_5704/4856590)*ln(POSTAL_ZIP4_90403_5704/4856590)-(1-POSTAL_ZIP4_90403_5704/4856590)*ln(1-POSTAL_ZIP4_90403_5704/4856590)POSTAL_ZIP4_90403_5704,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos;
-- 5 Caculate conditional prob for y=0;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos as
select 
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 0
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

proc sort data=meanout;
by col1;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1 and col1<33853878;
quit;

drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos;
-- 6 Caculate conditional prob for y=0;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos as
select 
-(POSTAL_ZIP4_11004_1040/33853878)*ln(POSTAL_ZIP4_11004_1040/33853878)-(1-POSTAL_ZIP4_11004_1040/33853878)*ln(1-POSTAL_ZIP4_11004_1040/33853878)POSTAL_ZIP4_11004_1040,
-(POSTAL_ZIP4_11364_3015/33853878)*ln(POSTAL_ZIP4_11364_3015/33853878)-(1-POSTAL_ZIP4_11364_3015/33853878)*ln(1-POSTAL_ZIP4_11364_3015/33853878)POSTAL_ZIP4_11364_3015,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos;



drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy;
-- H(X) - P(Y=1)H(X|Y=1) - P(Y=0)H(X|Y=0)
-- 38,710,468 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy as
select 
a.POSTAL_ZIP4_85205_7911-(4856590.0/38710468.0)*b.POSTAL_ZIP4_85205_7911-(33853878.0/38710468.0)*c.POSTAL_ZIP4_85205_7911 POSTAL_ZIP4_85205_7911,
a.POSTAL_ZIP4_90403_5704-(4856590.0/38710468.0)*b.POSTAL_ZIP4_90403_5704-(33853878.0/38710468.0)*c.POSTAL_ZIP4_90403_5704 POSTAL_ZIP4_90403_5704,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos b
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos c;
;


-- Apply Mutual Information
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy;
-- 38,710,468 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy as
select a.COUPON_BARCODE, a.RESPONSE,
a.POSTAL_ZIP4_85205_7911*b.POSTAL_ZIP4_85205_7911+
a.POSTAL_ZIP4_90403_5704*b.POSTAL_ZIP4_90403_5704+
a.POSTAL_ZIP4_92173_3150*b.POSTAL_ZIP4_92173_3150+
a.POSTAL_ZIP4_90631_1103*b.POSTAL_ZIP4_90631_1103+
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy
) a
group by 1
order by 1 
;

No comments:

Post a Comment

Blog Archive