Intuitively, we can interpret the MI between X and Y as the reduction in uncertainty about X about observing Y, or by symmetry, the reduction in uncertainty about Y after observing X.
-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468; select sum(response), count(1)-sum(response), count(1) from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1; -- 1 Calculate positive marginal counts for x=1; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos; -- 1 rows affected; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as select sum(UNIV_BBBUS)UNIV_BBBUS, sum(UNIV_BABYUS)UNIV_BABYUS, sum(UNIV_CTSUS)UNIV_CTSUS, sum(UNIV_HRMUS)UNIV_HRMUS, sum(UNIV_BBBCA)UNIV_BBBCA, sum(UNIV_BBBMX)UNIV_BBBMX, sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 ; -- SAS proc sql; create table pos as select * from connection to NETEZZA ( select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos ; ) ; quit; proc means data=pos noprint; output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN'))); run; proc transpose data=meanout out=meanout; run; * varlist; proc sql; select _name_ from meanout where col1>0.0; quit; -- 2 Calculate entropy for marginal attributes; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos; -- 1 rows affected; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as select -(UNIV_BBBUS/38710468)*ln(UNIV_BBBUS/38710468)-(1-UNIV_BBBUS/38710468)*ln(1-UNIV_BBBUS/38710468)UNIV_BBBUS, -(UNIV_BABYUS/38710468)*ln(UNIV_BABYUS/38710468)-(1-UNIV_BABYUS/38710468)*ln(1-UNIV_BABYUS/38710468)UNIV_BABYUS, -(UNIV_CTSUS/38710468)*ln(UNIV_CTSUS/38710468)-(1-UNIV_CTSUS/38710468)*ln(1-UNIV_CTSUS/38710468)UNIV_CTSUS, -(UNIV_HRMUS/38710468)*ln(UNIV_HRMUS/38710468)-(1-UNIV_HRMUS/38710468)*ln(1-UNIV_HRMUS/38710468)UNIV_HRMUS, -(UNIV_BBBCA/38710468)*ln(UNIV_BBBCA/38710468)-(1-UNIV_BBBCA/38710468)*ln(1-UNIV_BBBCA/38710468)UNIV_BBBCA, -(UNIV_BBBMX/38710468)*ln(UNIV_BBBMX/38710468)-(1-UNIV_BBBMX/38710468)*ln(1-UNIV_BBBMX/38710468)UNIV_BBBMX, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a ; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos; -- 3 Caculate conditional prob for y=1; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as select sum(UNIV_BBBUS)UNIV_BBBUS, sum(UNIV_BABYUS)UNIV_BABYUS, sum(UNIV_CTSUS)UNIV_CTSUS, sum(UNIV_HRMUS)UNIV_HRMUS, sum(UNIV_BBBCA)UNIV_BBBCA, sum(UNIV_BBBMX)UNIV_BBBMX, sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 where response = 1 ; -- SAS proc sql; create table pos as select * from connection to NETEZZA ( select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos ; ) ; quit; proc means data=pos noprint; output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN'))); run; proc transpose data=meanout out=meanout; run; * varlist; proc sql; select _name_ from meanout where col1>1.0; quit; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos; -- 4 Caculate conditional prob for y=1; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as select -(POSTAL_ZIP4_85205_7911/4856590)*ln(POSTAL_ZIP4_85205_7911/4856590)-(1-POSTAL_ZIP4_85205_7911/4856590)*ln(1-POSTAL_ZIP4_85205_7911/4856590)POSTAL_ZIP4_85205_7911, -(POSTAL_ZIP4_90403_5704/4856590)*ln(POSTAL_ZIP4_90403_5704/4856590)-(1-POSTAL_ZIP4_90403_5704/4856590)*ln(1-POSTAL_ZIP4_90403_5704/4856590)POSTAL_ZIP4_90403_5704, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos; -- 5 Caculate conditional prob for y=0; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos as select sum(UNIV_BBBUS)UNIV_BBBUS, sum(UNIV_BABYUS)UNIV_BABYUS, sum(UNIV_CTSUS)UNIV_CTSUS, sum(UNIV_HRMUS)UNIV_HRMUS, sum(UNIV_BBBCA)UNIV_BBBCA, sum(UNIV_BBBMX)UNIV_BBBMX, sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 where response = 0 ; -- SAS proc sql; create table pos as select * from connection to NETEZZA ( select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos ; ) ; quit; proc means data=pos noprint; output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN'))); run; proc transpose data=meanout out=meanout; run; proc sort data=meanout; by col1; run; * varlist; proc sql; select _name_ from meanout where col1>1 and col1<33853878; quit; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos; -- 6 Caculate conditional prob for y=0; create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos as select -(POSTAL_ZIP4_11004_1040/33853878)*ln(POSTAL_ZIP4_11004_1040/33853878)-(1-POSTAL_ZIP4_11004_1040/33853878)*ln(1-POSTAL_ZIP4_11004_1040/33853878)POSTAL_ZIP4_11004_1040, -(POSTAL_ZIP4_11364_3015/33853878)*ln(POSTAL_ZIP4_11364_3015/33853878)-(1-POSTAL_ZIP4_11364_3015/33853878)*ln(1-POSTAL_ZIP4_11364_3015/33853878)POSTAL_ZIP4_11364_3015, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos; drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy; -- H(X) - P(Y=1)H(X|Y=1) - P(Y=0)H(X|Y=0) -- 38,710,468 rows affected create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy as select a.POSTAL_ZIP4_85205_7911-(4856590.0/38710468.0)*b.POSTAL_ZIP4_85205_7911-(33853878.0/38710468.0)*c.POSTAL_ZIP4_85205_7911 POSTAL_ZIP4_85205_7911, a.POSTAL_ZIP4_90403_5704-(4856590.0/38710468.0)*b.POSTAL_ZIP4_90403_5704-(33853878.0/38710468.0)*c.POSTAL_ZIP4_90403_5704 POSTAL_ZIP4_90403_5704, ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos a cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos b cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos c; ; -- Apply Mutual Information drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy; -- 38,710,468 rows affected create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy as select a.COUPON_BARCODE, a.RESPONSE, a.POSTAL_ZIP4_85205_7911*b.POSTAL_ZIP4_85205_7911+ a.POSTAL_ZIP4_90403_5704*b.POSTAL_ZIP4_90403_5704+ a.POSTAL_ZIP4_92173_3150*b.POSTAL_ZIP4_92173_3150+ a.POSTAL_ZIP4_90631_1103*b.POSTAL_ZIP4_90631_1103+ ... from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy b ; select a.DECILE, count(*), sum(response) from ( select coupon_barcode, response, score, NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy ) a group by 1 order by 1 ;
No comments:
Post a Comment