Tuesday, November 24, 2015

Similarity Calculation 1 - Jaccard Similarity Using SQL

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;

-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;


-- 2 Calculate positive marginal counts given x=1 and y=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;

-- 3 Calculate Jaccard Similarity for y=1 & x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 1 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
b.UNIV_BBBUS/(a.UNIV_BBBUS+4856590-b.UNIV_BBBUS) UNIV_BBBUS,
b.UNIV_BABYUS/(a.UNIV_BABYUS+4856590-b.UNIV_BABYUS) UNIV_BABYUS,
b.UNIV_CTSUS/(a.UNIV_CTSUS+4856590-b.UNIV_CTSUS) UNIV_CTSUS,
b.UNIV_HRMUS/(a.UNIV_HRMUS+4856590-b.UNIV_HRMUS) UNIV_HRMUS,
b.UNIV_BBBCA/(a.UNIV_BBBCA+4856590-b.UNIV_BBBCA) UNIV_BBBCA,
b.UNIV_BBBMX/(a.UNIV_BBBMX+4856590-b.UNIV_BBBMX) UNIV_BBBMX,
...
b.FST_STR_SHOP_NBR_651/(a.FST_STR_SHOP_NBR_651+4856590-b.FST_STR_SHOP_NBR_651) FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
;

-- 4 Apply Jaccard Similarity to the data;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select a.COUPON_BARCODE, a.RESPONSE,
a.UNIV_BBBUS*b.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+
a.UNIV_UNIV_BABYCA*b.UNIV_UNIV_BABYCA+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos
) a
group by 1
order by 1 
;

No comments:

Post a Comment

Blog Archive