Tuesday, November 24, 2015

Similarity Calculation 2 - Cosine Similarity Using SQL and SAS

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;

-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;

-- SAS
/*Calculate probabilities for positive*/
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1.0;
quit;
%put &varlist;

-- 2 Calculate positive marginal counts given x=1 and y=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 1 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
b.UNIV_BBBUS/(sqrt(a.UNIV_BBBUS)*sqrt(4856590.0) ) UNIV_BBBUS,
b.UNIV_BABYUS/(sqrt(a.UNIV_BABYUS)*sqrt(4856590.0) ) UNIV_BABYUS,
b.UNIV_CTSUS/(sqrt(a.UNIV_CTSUS)*sqrt(4856590.0) ) UNIV_CTSUS,
b.UNIV_HRMUS/(sqrt(a.UNIV_HRMUS)*sqrt(4856590.0) ) UNIV_HRMUS,
b.UNIV_BBBCA/(sqrt(a.UNIV_BBBCA)*sqrt(4856590.0) ) UNIV_BBBCA,
b.UNIV_BBBMX/(sqrt(a.UNIV_BBBMX)*sqrt(4856590.0) ) UNIV_BBBMX,
b.UNIV_UNIV_BABYCA/(sqrt(a.UNIV_UNIV_BABYCA)*sqrt(4856590.0) ) UNIV_UNIV_BABYCA,
...
b.FST_STR_SHOP_NBR_651/(sqrt(a.FST_STR_SHOP_NBR_651)*sqrt(4856590.0) ) FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
-- 38,710,468 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select a.COUPON_BARCODE, a.RESPONSE,
a.UNIV_BBBUS*b.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos
) a
group by 1
order by 1 
;

No comments:

Post a Comment

Blog Archive