Sunday, November 29, 2015

Similarity Calculation 5 - Mutual Information Using SQL and SAS

I(X,Y) = H(X,Y) - H(X|Y) - H(Y|X) 
Intuitively, we can interpret the MI between X and Y as the reduction in uncertainty about X about observing Y, or by symmetry, the reduction in uncertainty about Y after observing X.


-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;


-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>0.0;
quit;


-- 2 Calculate entropy for marginal attributes;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select 
-(UNIV_BBBUS/38710468)*ln(UNIV_BBBUS/38710468)-(1-UNIV_BBBUS/38710468)*ln(1-UNIV_BBBUS/38710468)UNIV_BBBUS,
-(UNIV_BABYUS/38710468)*ln(UNIV_BABYUS/38710468)-(1-UNIV_BABYUS/38710468)*ln(1-UNIV_BABYUS/38710468)UNIV_BABYUS,
-(UNIV_CTSUS/38710468)*ln(UNIV_CTSUS/38710468)-(1-UNIV_CTSUS/38710468)*ln(1-UNIV_CTSUS/38710468)UNIV_CTSUS,
-(UNIV_HRMUS/38710468)*ln(UNIV_HRMUS/38710468)-(1-UNIV_HRMUS/38710468)*ln(1-UNIV_HRMUS/38710468)UNIV_HRMUS,
-(UNIV_BBBCA/38710468)*ln(UNIV_BBBCA/38710468)-(1-UNIV_BBBCA/38710468)*ln(1-UNIV_BBBCA/38710468)UNIV_BBBCA,
-(UNIV_BBBMX/38710468)*ln(UNIV_BBBMX/38710468)-(1-UNIV_BBBMX/38710468)*ln(1-UNIV_BBBMX/38710468)UNIV_BBBMX,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 3 Caculate conditional prob for y=1;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1.0;
quit;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
-- 4 Caculate conditional prob for y=1;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select 
-(POSTAL_ZIP4_85205_7911/4856590)*ln(POSTAL_ZIP4_85205_7911/4856590)-(1-POSTAL_ZIP4_85205_7911/4856590)*ln(1-POSTAL_ZIP4_85205_7911/4856590)POSTAL_ZIP4_85205_7911,
-(POSTAL_ZIP4_90403_5704/4856590)*ln(POSTAL_ZIP4_90403_5704/4856590)-(1-POSTAL_ZIP4_90403_5704/4856590)*ln(1-POSTAL_ZIP4_90403_5704/4856590)POSTAL_ZIP4_90403_5704,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos;
-- 5 Caculate conditional prob for y=0;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos as
select 
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 0
;

-- SAS
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

proc sort data=meanout;
by col1;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1 and col1<33853878;
quit;

drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos;
-- 6 Caculate conditional prob for y=0;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos as
select 
-(POSTAL_ZIP4_11004_1040/33853878)*ln(POSTAL_ZIP4_11004_1040/33853878)-(1-POSTAL_ZIP4_11004_1040/33853878)*ln(1-POSTAL_ZIP4_11004_1040/33853878)POSTAL_ZIP4_11004_1040,
-(POSTAL_ZIP4_11364_3015/33853878)*ln(POSTAL_ZIP4_11364_3015/33853878)-(1-POSTAL_ZIP4_11364_3015/33853878)*ln(1-POSTAL_ZIP4_11364_3015/33853878)POSTAL_ZIP4_11364_3015,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM5_pos;



drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy;
-- H(X) - P(Y=1)H(X|Y=1) - P(Y=0)H(X|Y=0)
-- 38,710,468 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy as
select 
a.POSTAL_ZIP4_85205_7911-(4856590.0/38710468.0)*b.POSTAL_ZIP4_85205_7911-(33853878.0/38710468.0)*c.POSTAL_ZIP4_85205_7911 POSTAL_ZIP4_85205_7911,
a.POSTAL_ZIP4_90403_5704-(4856590.0/38710468.0)*b.POSTAL_ZIP4_90403_5704-(33853878.0/38710468.0)*c.POSTAL_ZIP4_90403_5704 POSTAL_ZIP4_90403_5704,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos b
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM6_pos c;
;


-- Apply Mutual Information
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy;
-- 38,710,468 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy as
select a.COUPON_BARCODE, a.RESPONSE,
a.POSTAL_ZIP4_85205_7911*b.POSTAL_ZIP4_85205_7911+
a.POSTAL_ZIP4_90403_5704*b.POSTAL_ZIP4_90403_5704+
a.POSTAL_ZIP4_92173_3150*b.POSTAL_ZIP4_92173_3150+
a.POSTAL_ZIP4_90631_1103*b.POSTAL_ZIP4_90631_1103+
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_Entropy b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_Entropy
) a
group by 1
order by 1 
;

Similarity Calculation 4 - Naive Bayes Using SQL and SAS

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;


-- Calculate counts/prob for positive;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
sum(DM_RECENTOPT_OPT_BABYUS)DM_RECENTOPT_OPT_BABYUS,
sum(DM_RECENTOPT_OPT_BBBUS)DM_RECENTOPT_OPT_BBBUS,
sum(DM_RECENTOPT_OPT_CTSUS)DM_RECENTOPT_OPT_CTSUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1;


-- Caculate counts/prob for negative;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
sum(DM_RECENTOPT_OPT_BABYUS)DM_RECENTOPT_OPT_BABYUS,
sum(DM_RECENTOPT_OPT_BBBUS)DM_RECENTOPT_OPT_BBBUS,
sum(DM_RECENTOPT_OPT_CTSUS)DM_RECENTOPT_OPT_CTSUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 0;


-- Drop a probability of zero which causes divide by zero;
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql noprint;
select _name_ into :varlist separated by ' '
from meanout where col1>0.0;
quit;
%put &varlist;


/*Calculate probabilities for negative*/
proc sql;
create table neg as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg
;
)
;
quit;

proc means data=neg noprint;
var &varlist;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>0.0;
quit;


-- Caculate conditional prob ratio for positive;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
(a.UNIV_BBBUS/4856590.0)/(b.UNIV_BBBUS/33853878.0) UNIV_BBBUS,
(a.UNIV_BABYUS/4856590.0)/(b.UNIV_BABYUS/33853878.0) UNIV_BABYUS,
(a.UNIV_CTSUS/4856590.0)/(b.UNIV_CTSUS/33853878.0) UNIV_CTSUS,
(a.UNIV_HRMUS/4856590.0)/(b.UNIV_HRMUS/33853878.0) UNIV_HRMUS,
(a.UNIV_BBBCA/4856590.0)/(b.UNIV_BBBCA/33853878.0) UNIV_BBBCA,
(a.UNIV_BBBMX/4856590.0)/(b.UNIV_BBBMX/33853878.0) UNIV_BBBMX,
(a.UNIV_UNIV_BABYCA/4856590.0)/(b.UNIV_UNIV_BABYCA/33853878.0) UNIV_UNIV_BABYCA,
(a.DM_RECENTOPT_OPT_BABYUS/4856590.0)/(b.DM_RECENTOPT_OPT_BABYUS/33853878.0) DM_RECENTOPT_OPT_BABYUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg b;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg;
-- Caculate conditional prob ratio for negative;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg as
select
(1-a.UNIV_BBBUS/4856590.0)/(1-b.UNIV_BBBUS/33853878.0) UNIV_BBBUS,
(1-a.UNIV_BABYUS/4856590.0)/(1-b.UNIV_BABYUS/33853878.0) UNIV_BABYUS,
(1-a.UNIV_CTSUS/4856590.0)/(1-b.UNIV_CTSUS/33853878.0) UNIV_CTSUS,
(1-a.UNIV_HRMUS/4856590.0)/(1-b.UNIV_HRMUS/33853878.0) UNIV_HRMUS,
(1-a.UNIV_BBBCA/4856590.0)/(1-b.UNIV_BBBCA/33853878.0) UNIV_BBBCA,
(1-a.UNIV_BBBMX/4856590.0)/(1-b.UNIV_BBBMX/33853878.0) UNIV_BBBMX,
(1-a.UNIV_UNIV_BABYCA/4856590.0)/(1-b.UNIV_UNIV_BABYCA/33853878.0) UNIV_UNIV_BABYCA,
(1-a.DM_RECENTOPT_OPT_BABYUS/4856590.0)/(1-b.DM_RECENTOPT_OPT_BABYUS/33853878.0) DM_RECENTOPT_OPT_BABYUS,
...
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_neg b;


create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_NB as
select a.*,
a.UNIV_BBBUS*b.UNIV_BBBUS+(1-a.UNIV_BBBUS)*c.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+(1-a.UNIV_BABYUS)*c.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+(1-a.UNIV_CTSUS)*c.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+(1-a.UNIV_HRMUS)*c.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+(1-a.UNIV_BBBCA)*c.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+(1-a.UNIV_BBBMX)*c.UNIV_BBBMX+
a.UNIV_UNIV_BABYCA*b.UNIV_UNIV_BABYCA+(1-a.UNIV_UNIV_BABYCA)*c.UNIV_UNIV_BABYCA+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651+(1-a.FST_STR_SHOP_NBR_651)*c.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_neg c
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM_NB
) a
group by 1
order by 1
;

Wednesday, November 25, 2015

Similarity Calculation 3 - Gini/Efficiency Similarity Using SQL

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;

-- 1 Calculate positive marginal counts given x=1 and y=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;
-- 2 Calculate Gini Similarity;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 1 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
a.UNIV_BBBUS/38710468.0 UNIV_BBBUS,
a.UNIV_BABYUS/38710468.0 UNIV_BABYUS,
a.UNIV_CTSUS/38710468.0 UNIV_CTSUS,
a.UNIV_HRMUS/38710468.0 UNIV_HRMUS,
a.UNIV_BBBCA/38710468.0 UNIV_BBBCA,
a.UNIV_BBBMX/38710468.0 UNIV_BBBMX,
...
a.FST_STR_SHOP_NBR_651/38710468.0 FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos a;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
-- 38,710,468 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select a.COUPON_BARCODE, a.RESPONSE,
a.UNIV_BBBUS*b.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+
a.UNIV_UNIV_BABYCA*b.UNIV_UNIV_BABYCA+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651  score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos
) a
group by 1
order by 1 
;

Tuesday, November 24, 2015

Similarity Calculation 2 - Cosine Similarity Using SQL and SAS

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;

-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;

-- SAS
/*Calculate probabilities for positive*/
proc sql;
create table pos as
select * from connection to NETEZZA
(
select * from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos
;
)
;
quit;

proc means data=pos noprint;
output out=meanout(drop=_type_ _freq_ where=(_stat_ in ('MIN')));
run;

proc transpose data=meanout out=meanout;
run;

* varlist;
proc sql;
select _name_
from meanout where col1>1.0;
quit;
%put &varlist;

-- 2 Calculate positive marginal counts given x=1 and y=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 1 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
b.UNIV_BBBUS/(sqrt(a.UNIV_BBBUS)*sqrt(4856590.0) ) UNIV_BBBUS,
b.UNIV_BABYUS/(sqrt(a.UNIV_BABYUS)*sqrt(4856590.0) ) UNIV_BABYUS,
b.UNIV_CTSUS/(sqrt(a.UNIV_CTSUS)*sqrt(4856590.0) ) UNIV_CTSUS,
b.UNIV_HRMUS/(sqrt(a.UNIV_HRMUS)*sqrt(4856590.0) ) UNIV_HRMUS,
b.UNIV_BBBCA/(sqrt(a.UNIV_BBBCA)*sqrt(4856590.0) ) UNIV_BBBCA,
b.UNIV_BBBMX/(sqrt(a.UNIV_BBBMX)*sqrt(4856590.0) ) UNIV_BBBMX,
b.UNIV_UNIV_BABYCA/(sqrt(a.UNIV_UNIV_BABYCA)*sqrt(4856590.0) ) UNIV_UNIV_BABYCA,
...
b.FST_STR_SHOP_NBR_651/(sqrt(a.FST_STR_SHOP_NBR_651)*sqrt(4856590.0) ) FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
;


drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
-- 38,710,468 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select a.COUPON_BARCODE, a.RESPONSE,
a.UNIV_BBBUS*b.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos
) a
group by 1
order by 1 
;

Similarity Calculation 1 - Jaccard Similarity Using SQL

-- Calculate population counts/prob;
-- 4,856,590 33,853,878 38,710,468;
select sum(response), count(1)-sum(response), count(1)
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1;

-- 1 Calculate positive marginal counts for x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
sum(UNIV_UNIV_BABYCA)UNIV_UNIV_BABYCA,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
;


-- 2 Calculate positive marginal counts given x=1 and y=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos;
-- 1 rows affected;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos as
select
sum(UNIV_BBBUS)UNIV_BBBUS,
sum(UNIV_BABYUS)UNIV_BABYUS,
sum(UNIV_CTSUS)UNIV_CTSUS,
sum(UNIV_HRMUS)UNIV_HRMUS,
sum(UNIV_BBBCA)UNIV_BBBCA,
sum(UNIV_BBBMX)UNIV_BBBMX,
...
sum(FST_STR_SHOP_NBR_651)FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1
where response = 1
;

-- 3 Calculate Jaccard Similarity for y=1 & x=1;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos;
-- 1 rows affected
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos as
select 
b.UNIV_BBBUS/(a.UNIV_BBBUS+4856590-b.UNIV_BBBUS) UNIV_BBBUS,
b.UNIV_BABYUS/(a.UNIV_BABYUS+4856590-b.UNIV_BABYUS) UNIV_BABYUS,
b.UNIV_CTSUS/(a.UNIV_CTSUS+4856590-b.UNIV_CTSUS) UNIV_CTSUS,
b.UNIV_HRMUS/(a.UNIV_HRMUS+4856590-b.UNIV_HRMUS) UNIV_HRMUS,
b.UNIV_BBBCA/(a.UNIV_BBBCA+4856590-b.UNIV_BBBCA) UNIV_BBBCA,
b.UNIV_BBBMX/(a.UNIV_BBBMX+4856590-b.UNIV_BBBMX) UNIV_BBBMX,
...
b.FST_STR_SHOP_NBR_651/(a.FST_STR_SHOP_NBR_651+4856590-b.FST_STR_SHOP_NBR_651) FST_STR_SHOP_NBR_651
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1_pos a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM2_pos b
;

-- 4 Apply Jaccard Similarity to the data;
drop table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos;
create table ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos as
select a.COUPON_BARCODE, a.RESPONSE,
a.UNIV_BBBUS*b.UNIV_BBBUS+
a.UNIV_BABYUS*b.UNIV_BABYUS+
a.UNIV_CTSUS*b.UNIV_CTSUS+
a.UNIV_HRMUS*b.UNIV_HRMUS+
a.UNIV_BBBCA*b.UNIV_BBBCA+
a.UNIV_BBBMX*b.UNIV_BBBMX+
a.UNIV_UNIV_BABYCA*b.UNIV_UNIV_BABYCA+
...
a.FST_STR_SHOP_NBR_651*b.FST_STR_SHOP_NBR_651 score
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM1 a
cross join ANALYTICS_STG..LH_CAMPAIGN_DM_ADM3_pos b
;

select a.DECILE, count(*), sum(response)
from
(
select coupon_barcode, response, score,
NTILE(10) OVER(ORDER BY score DESC NULLS LAST) as DECILE
from ANALYTICS_STG..LH_CAMPAIGN_DM_ADM4_pos
) a
group by 1
order by 1 
;

Sunday, November 22, 2015

R Basics 12 - Enviromnetnts, Frames and the Call Stack

Environments
1) R uses environments to store the name-object pariing between variable name nad the R object assigned to that variabel
(assign creates par: <-, <<-, assign())
2) They are implemented with hash tables;
3) Like functions, environments are "first class objects" in R. They can be created, passed as parameters and manipulated like any other R object.
4) Environments are hierarchically organised (each env. has a parent).
5) When a function is called, R creates a new environment and the function operates in that new environment. All local variables to the functions are found in that environment (aka frame).


Code example:
dictionary <- function() {
e <- new.env(parent=emptyenv())
keycheck <- function(key) stop if not(is.character(key) && length(key)==1)

hasKey <- function(key) { 
keyCheck(key)
exists(key, where=e, inherits=FALSE)
}

rmKey <- function(key) {
stopifnot( !misssing(key))
keyCheck(key)
rm(list=key, pos=e)
}

putObj <- function(key, obj=key) {
stopifnot( ! missing(key))
keyCheck(key)
if(is.null(obj)) return(rmObj(key))
assign(key, obj, envir=e)
}

getObj <- function(key) {
stopifnot( !missing(key))
keyCheck(key)
if (!hasKey(key)) return(NULL)
e[[key]] 
}

allKeys <- function() ls(e, all.names=TRUE)

allObjs <- function() eapply(e, ageObj, all.names=TRUE)

list(hasKey=hasKey, allKeys=allKeys, rmKey=rmKEy, getObj=getObj, putObj=putObj, allObjs = allObjs)
}

d <- dictionary();
sapply(LETTERS,d$putObj)
d$hasKEy('A')
d$allKeys()
d$allObjs()
d$getObj('A')
d$rmKey('A')
d$hasKey('A')

Code example explained
The above dictionary fucntin returns the list at the end of the function.
That list and the listed callable functions exist in the environment created when the dictionary fucntin was called. 
This use of functions and lexical scoping is a poor man's OOP-class-like mechanism. 
The function also creates an environment, which it uses for its hash table properties to save and retrieve key-value pairs.

Lexical and dynamic scoping
R is a lexically scoped languuage. 
Variables aare resolved in terms of the function in which they were written, then the fucntin in which that function was written, 
all they way back to the top-level global/package environment where the program was written.
Variables are not resoved in terms of the functions that called them when the program is running dynamic scoping.
Interrogating the fucntion call stack allows R to simulate dynamic scoping.

Frames and environments
A frame is an environment plus a ssystem reference to calling frame.
R creates each frame to operate within ( starting withe the global environment, then a new dframe with each fucntion call).
All frames have associated environments, but you can create environments taht are not associated with the call stack.

The call stack
As a fucntion calls a new fucntion, a stack of calling frames is built up. This call stack can be interrogated dynamically.
sys.fame()
parent.frame()
parent.frame(1)
parent.frame(2)
sys.nframe()
sys.call()
sys.call(-1)
sys.call(1)
deparse(sys.call())[[1]]
parent.env(sys.frame())
Sys.getenv()
Sys.setenv()

Friday, November 20, 2015

R Basics 11 - OOP and R5

What is object oriented programming?
While definitions for OOP abound without clear agreement, OOP languages typically focus programers on the actors/objects(none) of a problem 
rather than the actions/procedures(verbs), by using a common set of language features, including:
1) Encapsulation data and code - the data and the code that manages that data are kept together by the language( in classes, models or clusters, etc.) Implicitly, this includes the notion of class definitions and class instances.
2) Information hiding - an exposed API with a hidden implementation of code and data; encourages programming by contract
3) Abstraction and Inheritance - so that similarities and difference in the underlying model/data/code/logic for related objects can be grouped  & reused
4) Dynamic dispatch - more than one method with the same name - where the method used is selected at compile or run-time by the class of the object and also the class of the method parameter types and their arity (argument number).

Four R Mechanisms with some OOP features
1) Lexical scoping 
- simple 
- encapsulation 
- mutability 
- information hiding, BUT not real classes 
- no inheritance
2) S3 classes 
- multiple dispatch on class only 
- inheritance, BUT just a naming convention
- no encapsulation 
- no information hiding 
- no control over use 
- no consistency checks 
- easy to abuse
3) S4 formal classes 
- multiple inheritance 
- multiple dispatch 
- inheritance 
- type checking, BUT no information hiding 
- verbose and complex to code 
- lots of new terms 
- immutable classes only
4) R5 reference classes 
- built on S4 
- mutable ( more like Java, C++) 
- type checking 
- multiple inheritance, BUT no information hiding 
- inconsistent with R's functional programming heritage
Note: None of R's OOP systems are as full featured or as robust as Java or C++.

What are S3 classes
# An S3 class is any R object to which a class attribute has been attached.

S3 classes - key functions
class(x)
class(x) <-'name'
methods('method')
UseMethod('method', x)
NextMethod()

Class code example
c.list <- list(hrs=12, mins=0, diem='am')
clas(c.list)
class(c.list)
clock <- structure(list(hrs=12, mins=0, diem="am"), .Names= c("hrs", "mins", "diem"), class = "clock")
c.list <- unclass(c.list)
attr(c.list, 'class')

Dynamic dispatch - UseMethod()
# the UseMehod for print already exists:
# print <- function(x) UseMehod('print', x)
print.clock <- function(x) {
cat(x$hrs);
cat(':')
cat(sprintf('%02d', x$mins));
cat(' ');
cat(x$diem);
cat('\n');
}
print(c.list)
methods('print')

Inheritance dispatch - NExtMethod()
# S3 classes allow for a limited form of class inheirtance fo the purposes of method dispatch. Try the following code:
sound <- function(x) UseMehod('sound', x)
sound.animal <- function(x) NextMethod()
sound.human <- function(x) 'conversation'
sound.cat <- function(x) 'meow'
sound.default <- function(x) 'grunt'
Cathy <- list(legs=4)
class(Cathy) <- c('animal', 'cat')
Harry <- list(legs=4)
class(Leroy) <- c('animal', 'llama')
sound(Cathy)
sound(Harry)
sound(Leroy)

Should I use S3 or S4 or R5?
S3: for amall/medium projects;
S4 for larger;
R5 if mutability is necessary

R5 Reference Classes

Summary of some key class mechanisms
1) create/get object-generator:
gen <- setRefClass('name', fields=, contains=, methods=, where,...)
enf <- getRefClass('name') - generator
gen$lock('fieldName') - lock a field
gen$help(topic) - get help on the class
gen$methods(...) - add methods to class
gen$methods() - get a list of methods
gen$fileds() - get a list of fields
gen$accessors(...) - create get/set fns

2) generator object used to get instance:
inst <- gen$new(...) - instantiation parameters passed to initialize(...)
inst$copy(shallow=F) - copy instance
inst$show() - called by print
inst$field(name, value) - set
inst$field(name) - get
is(inst 'envReflass') - is R5 test
[envRefClass is the super class for R5]

3) code from within your methods
initialize(...) instance initializer
finalize() - called by garbage collector
.self -reference to the self instance
.refClassDef -the class definition
methods::show() - call the show function
callSuper(...) - call the same method in the super class
.self$classVariable <- localVariable
classVariable <<- local Variable
globalVariable <<- localVariable
.self$classVariable <- localVariable
.self$field(clasVar, localVar) # set
.self$field(classVar, localVar) # get
Trap: very easy to confuse <- and <<-
Trap: if x is not a class field; x<<- var assigns to x in global environment

Field list - code sample
A <- setRefClas('A', 
fields = list(
# 1. typed, instance filed:
exampleVar1 = 'character',
# 2. instance file with accessor:
ev2.private='character',
exampleVar2 = function(x) {
if(!missing(x)) ev2.private <<- x
ev2.private
}
),methods = list(
initialize=function(c='default') {
exampleVar1 <<- c
exampleVar2 <<- c
}
)
)
instA <_ A$new('instnce of A');
str(instA)

Inheritance code sample
Animal <- setRefClass('Animal', 
# virtual super class
contains = list('VIRTUAL'),
fields = list(i.am = 'character', noiseMakes = 'character'),
methods = list(initialize=function(i.am='unknown', noiseMakes = 'unknown') {
.self$i.am <-i.am
.self$noiseMakes <- noiseMakes
},
show=function(){
cat('I am a');
cat(i.am)
cat('. I make this noise:')
cat(noiseMakes);
cat('\n')
}
)
)

Cat <- setRefCalss('Cat', 
contains = list('Aninmal'), 
methods = list(
initialize=function() callSuper('cat', 'meow'),
finalize=function() cat('Another cat passes.\n')
)
)

Dog <- setRefClass('Dog',
contains = list('Animal'),
methods = list(
initialize=function() callSuper('dog', 'woof'),
show = function() {
callSuper()
cat('I like to chew shoes. \n')
}
)
)

mongrel <- Animal$new()
fido = Dog$new();
felix =Cat$new()
print(fido);
print(felix)
felix <- NULL
gc()

What's neither C++ nor Java
1) No information hideing. Everthing is public and modifiable. (But the R package mechanism helps here).
2) No static class fields.
3) Not as developed or robust OOP space.

Tips(safer coding practices) and Traps
1) use named filed list to type variables
2) use accessor methods in the filed list to maintain class type & state validity
3) Trap:  methodName <- function() in methods list. Use = ( it's a named list!)
4) Trap: cant use enclosing environments within R5 classes as they are in one).

Wednesday, November 11, 2015

R Basics 10 - Avoiding For-Loops

What is wrong with using for-loops? 
Nothing! R's (for-while-repeat) loops are intuitive, and easy to code an maintain. Some tasks are best managed within loops. 

So why discourage the use of for-loops? 
1) Side effects and detritus from inline code. Replacing a loop with a function call means that what happened in the function stayed in the function. 
2) In some cases increased speed (especially so with nested loops and from poor loop-coding practice). 

How to make the paradigm shift? 
1) Use R's vectorization features. 
2) See if object indexing and subset assignment can replace the for-loop. 
3) If not, find an "apply" function that slices your object the way you need. 
4) Find (or write ) a function to do what you would have done in the body of the for-loop. Anonymous functions can be very useful for this task. 
5) if all else fails: move as much code as possible outside of the loop body 

Play data (for the examples following) 
requires('zoo') 
require('plyr') 
n <- 100 
u <- 1 
v <- rnow(n, 10, 10) + 1:n 
w <- round(runif(n, 0.6, 9.4)) 
df <- data.frame(month=u, x=u, y=v, z=w) 
l <- list(x = u, y = v, z = w, yz = v*w, xyz = u*v*w) 
trivial.add <= function(a, b) { a+b } 

Use R's vectorization features 
tot <- sum(log(u)) 
tot <- 0 for(i in seq_along(u)){ tot <- tot + log(u([i])) } 

Clever indexing and subset assignment 
df[df$z == 5, 'y'] <- -1 

The base apply family of functions
# l stands for list
# s stands for array
# d stands for data.frame
# t stands for array
# m is a special input type, which means that we provide multiple arguments in atabular format for the function
# r input type expects an integer, which specifies the number of times replicated
#_ is a special output type that does not return anything for the function
apply(x, margin, fun, ...) 
lapply(x, fun, ...) 
sapply(x, fun, ...) 
vapply(x, fun, fun.value, ...) 
tapply(x, index, fun = NULL, ...) 
mapply(fun, .., moreargs = NULL) 
eapply(env, fun, ...) 
replicate(n, expr, simplify = "array") 
by(data, indices, fun, ...) 
aggregated(x, by, fun, ...) 
rapply()

apply(by row/column on two+ dim object) 
# Object: m, t,df, a (has 2+ dimensions) 
# Returns: v, l, m (depends on input & fn) 
column.mean <- apply(df, 2, mean) 
row.product <- apply(df, 1, prod) 

lapply (on vecotr or list, return list) 
lapply(l, mean) 
unlist(lapply(u, trivial.add, 5)) 

sapply ( a simplified lapply on v or l) 
# object: v, l; # Returns: usually a vector 
sapply(l, mean) 
sapply(u, function(a) a*a) 
sapply(u, trivial.add, -1) 

Using sapply and lapply work in a similar way, traversing over a set of data like a list or vector, and calling the specified function for each item.

Sometimes we require traversal of our data in a less than linear way. Say we wanted to compare the current observation with the value 5 periods before it. Use can probably use rollapply for this (via quantmod), but a quick and dirty way is to run sapply or lapply passing a set of index values.

Here we will use sapply, which works on a list or vector of data.

sapply(1:3, function(x) x^2)
#[1] 1 4 9

lapply is very similar, however it will return a list rather than a vector:

lapply(1:3, function(x) x^2)
#[[1]]
#[1] 1
#
#[[2]]
#[1] 4
#
#[[3]]
#[1] 9

Passing simplify=FALSE to sapply will also give you a list:

sapply(1:3, function(x) x^2, simplify=F)
#[[1]]
#[1] 1
#
#[[2]]
#[1] 4
#
#[[3]]
#[1] 9

And you can use unlist with lapply to get a vector.

unlist(lapply(1:3, function(x) x^2))
#[1] 1 4 9

tapply ( group v/l by factor & apply fn) 
count.table <- tapply(v, w, length) 
min.1 <- with(df, tapply(y, z, min)) 

by (on l or v, returns "by" objects) 
min.2 <- by(df$y, df$z, min) 
min.3 <- by(df[, c('x', 'y'), df$z, min) 
# last one: finds min from two columns 

aggregte 
ag <- aggregate(df, by=list(df$z), mean) 
aggregate(df, by=list(w, 1+(u%%12)), mean) 
# Trap: variables must be in a list 

rollapply - from the zoo package 
# A 5-term, centred, rolling average 
v.maz <- rollapply(v, 5, mean, fill = NA) 
# Sum 3 months data for a quarterly total 
v.qtrly <- rollapply(v, 3, sum, fill=NA, align='right') 
# Note: zoo has rollmean(), rollmax() and rollmedian() functions 

inside a data.frame 
# Use transform() or within() to apply a function to a column in a data.frame 
df <- within(df, v.qtryly <- rollapply(v, 3,sum, fill=NA, align='right')) 
# use with() to simplify column access 

The plyr package 
Plyr is a fantastic family of apply like functions with a common naming system for the input-to and output-from split-apply-combine procedures. I use ddply() the most. 
ddply(df, .(x), summaise, min=min(y), max=max(y)) 
ddply(df, .(x), transform, span = x- y) 

Other packages worth looking at 3 
# foreach - a set of apply-like fns 
# snow - parallelised apply-like fns 
# snowfall - a usability wrapper for snow 

Abbreviation
v=vector l=list m=matrix df=data.frame a=array t=table f=factor d=dates


R Basics 9 - Writing Functions

Functions in R are called closures.
# Don't be deceived by the curly brackets;
# R is much more like Lisp than C or Java.
# Defining problems in the terms of function
# calls and their lazy, delayed evaluation
# (variable resolution) is R's big feature.

Standard form (for named functions)
plus <- function(x, y) {x+y}
plus(5,6)
# return() not needed - last value returned
# Optional curly brackets with 1-line fns:
x.to.y <- function(x,y) return (x^y)

Returning values
# return() - can use to aid readability and fro exit part way trhrough a function
# invisible() - return values thant do not print if not assigned.
# Traps: return() is a function, not a statement. The brackets are needed.

Anonymous fucntions
# Often used in arguments to fucntions:
v <- 1:9;
cube <- sapply(v, function(x) x^3)

Arguments are passed by value
# Effectively arguments are copied, and any changes made to the argument within the function do not affect the caller's copy.
# Trap: arguments are not typed and your function could be passed anything!
# Upfront argument checking advised!

Arguments passed by position or name
b <- function(cat, dog, cow) cat+dog+cow
b(1,2,3)
b(cow=3, cat =1, dog=2)
# Trap: not all arguments need to passed
f <- funciton(x) missing(x); f(); f('here')
# match.arg() - argument partial matching

Default arguments
# Default arguments can be specified 
x2y.1 <- function(x, y=2) {x^y}
x2y.2 <- function(x, y=x) {x^y}
x2y.2(3)
x2y.2(2,3)

The dots argument (...) is a catch - all
f <- function(...) {
# simle way to access dots arguments
dots <- list()
}
x <- f(5);
dput(x)
g <- function(...) {
dots <- substitute(list(,,,))[-1]
dots.names <- sapply(dots, deparse)
}

x <- g(a,b,c)
dput(x) -> c("a", "b", "c")
# dots can be passed to another function:
h <- function(x, ...) g(...)
x <- h(a, b, c);

Function environment
# When a function is called a new environment (frame) is created for it.
# There frames are found in the call stack. Fist frame is the global environment 
# Next Function reaches back into the call stack.
called.by <- function() {
# returns string 
if(length(sys.parents()) <=2) return('.GlobalEnv')
deparse(sys.call(sys.parent(2)))
}
g <- function(...) { called.by() }
f <- fucntion(...) g(...);
f(a,2)

Variable scope and unbound variables
# Within a function, variables are resoved in the local frame first, 
# then in terms of super-functions (when a functions defined inside a function), then in terms of the global environment.
h <- fucntion(x) { x+a }
a <- 5
h(5)
k <- function(x) { a<- 100; h(x) }
k(10)

Super assignment
# x <<- y ignores the local x, and looks up the super-environments for a x to replace 
accumulator <- fucntion() {
a <- 0
function(x) {
a <<- a +x
 }
}
acc <- accumulator()
acc(1)
acc(5)
acc(2)

Operator and replacement functions
`+`(4,5) # -> 9 
- operators are just fns
`%plus%` <- function(a,b) {a+b}
# "FUN(x) <- v is parsed as: x <- FUN(x, v)
"cap<-" <- function(x, value) 
ifelse(x>value, value, x)
x <- c(1,10,100);
cap(x) <- 9

Exeptions
tryCatch(print('pass'), error=fucntion(e) print('bad'), finally=print('done'))
tryCatch(stop('fail'), error=function(e) print('bad'), finally=print('done'))

Useful language reflection functions
exists(); 
get();
assign() - for variabels
substitute()
bquote()
eval()
do.call()
parse()
deparse()
quote()
enquote()

Thursday, November 5, 2015

R Basics 8 - Tips and Traps

General
Trap: R error message are not helpful
Tip: use traceback() to understand errors

Object coercion
Trap: R objects are often silently coerced to another class/type as/when needed.
Examples: 
c(1, TURE)
Tip: inspect objects with 
str(x)
model(x)
class(x)
typeof(x)
dput(x)
attributeds(x)

Factors(special case of coercion)
Trap: Factors cause more bug-hunting grief than just about anything else in R, especially when strig and integer vectors and data.frame cols are coerced to factor.
Tip: learn about factor and using them
Tip: explicitly test with is.factor(df$col)
Tip: use stringsAsFactors=FALSE argument when you create a data frame from file

Trap: maths doesn't work on numeric factors and they are tricky to convert back.
Tip: try as.numeric(as.character(factor))

Trap: appending rows to a data frame with factor columns is tricky.
Tip: make sure the row to be append is a presented to rbind() as a data.frame, and not as a vector or a list (which works sometimes)

Trap: the combine function c() will let you combine different factors into a vecotr of integer codes (probably garbage).
Tip: convert factors to string or integers (as appropriate) before combining.

Garbage in the workspace
Trap: R saves your workspace at the end of each session and reloads the saved workspace at the start of the next session.
Before you know it, you can have heaps of variables lurking in your workspace that are impacting on your calculations.
Tip: use ls() to check on lurking variables
Tip: clean up with rm(list=ls(all=TRUE))
Tip: library() to check on loaded packages
Tip: avoid savign workspaces, start R with the --no-save -- no-restore arguments

The 1:0 sequence in for-loops
Trap: for(x in 1:length(y)) fails on the zero length vector. IT will loop twice: first setting x to 1, then to 0.
Tip: use for(s in seq_len(y))
Trap: for ( x in y)
Tip: for x( in seq_along(y))

Space out your code and use brackets
Trap: x<-5
Tip: x<- -5
Trap: 1:n-1
Tip: 1:(n-1)
Trap: 2^2:9
Tip: 2^(2:9)

Vectors and vector recycling
Trap: most objects in R are vectors. R does not have scalars (jsut length=1 vectors).
Many Fns work on entire vectors at one.

Tip: In R, for-loop are often the inefficient and inelegant solution. Take the time to learn the various 'apply' family of functions and plyr package. 
Trap: Math with different length vectors will work with the shortest vector recycled
c(1,2,3) + c(10,20)

Vectors need the c() operator
wrong: mean(1,2,3)
correct: mean(c(1,2,3))

Use the correct Boolean operator
Tip: | and & are vectorise - use ifelse() (| and & also used with indexes to subset)
Tip: || and && are not vectorised - use if
Trap: || && lazy evaluation, | and & full evaluation
Trap: == (Boolean equality) = (assignment)

Equality testing with numbers
Trap: == and != test for near in/equality
as.double(8) = as.integer(8)
isTRUE(all.equal(x,y)) tests near equality
Tip: identical(x,y) is more fussy

Think hard about NA, NaN and NULL
Trap: NA and NaN are valid values.
Trap: many Fns fail by default on NA input
Tip: many functions take na.rm=TRUE
Tip: vector test for NA
Trap: x == NA is not the same as is.na(x)
Trap: x == NULL not he same as is.null(x)
Trap: is.numeic(NaN) returns TRUE

Indexing ([], [[]]. $)
Tip: Objects are indexed from 1 to N.
Trap: many subtle differences in indexing for vectors, lists, matrices, arrays and data.frames.
Return types vary depending on object being indexed and idexation method.
Tip: take the time to learn the differences
Trap: the zero-index fails silently
Trap: negative indexes return all but those
Trap: NA is a valid Boolean index
Trap: mismatched Boolean indexes work

Coding Practice
Tip: liberally use stopifnot() on function entry to verify argument validity
Tip: <- for assignment; = for list names

R Basics 7 - Factors

Factors
- A one-dimensional array of categorical (unordered) or ordinal (ordered) data.
- Indexed from 1 to N. Not fixed length.
- Named factors are possible (but rare).
- The hidden/unexpected coercion of object of a factor is a key source of bugs.

Why use Factors
- Specifying a non-alphabetical order
- Some statistical functions treat cat/ordinal data differently from continuous data.
- Deep ggplot2 code depends on it

Create 
Example 1 - unordered
> sex.v <- c('M', 'F', 'F', 'M', 'F'); sex.v
[1] "M" "F" "F" "M" "F"
> sex.f <- factor(sex.v); sex.f
[1] M F F M F
Levels: F M
> sex.w <- as.character(sex.f); sex.w
[1] "M" "F" "F" "M" "F"

Example 2 - ordered (small, medium, large)
> size.v <- c('S','L', 'M', 'L', 'S', 'M'); size.v
[1] "S" "L" "M" "L" "S" "M"
> size1.f <- factor(size.v, ordered = TRUE); size1.f
[1] S L M L S M
Levels: L < M < S
Example 3 - ordered, where we set the order
> size.lvls <- c('S','M','L')
> size2.f <- factor(size.v, levels=size.lvls); size2.f
[1] S L M L S M
Levels: S M L
 
Example 4 - ordered with levels and labels
> levels <- c(1,2,3,99)
> labels <- c('love', 'neutral', 'hate', NA); labels
[1] "love"    "neutral" "hate"    NA       
> data.v <- c(1,2,3,99,1,2,1,2,99); data.v
[1]  1  2  3 99  1  2  1  2 99
> data.f <- factor(data.v, levels=levels, labels=labels); data.f
[1] love    neutral hate    <NA>    love    neutral love    neutral <NA>   
Levels: love neutral hate <NA>
Example 5 - using the cut function to group
> i <- 1:50 + rnorm(50,0,5)
> k <- cut(i,5); k
 [1] (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (10.2,21.1]  
 [9] (-0.859,10.2] (10.2,21.1]   (-0.859,10.2] (-0.859,10.2] (-0.859,10.2] (10.2,21.1]   (-0.859,10.2] (10.2,21.1]  
[17] (10.2,21.1]   (21.1,32.1]   (21.1,32.1]   (10.2,21.1]   (-0.859,10.2] (10.2,21.1]   (10.2,21.1]   (21.1,32.1]  
[25] (21.1,32.1]   (10.2,21.1]   (32.1,43]     (21.1,32.1]   (21.1,32.1]   (21.1,32.1]   (32.1,43]     (32.1,43]    
[33] (32.1,43]     (21.1,32.1]   (21.1,32.1]   (32.1,43]     (32.1,43]     (43,54.1]     (32.1,43]     (43,54.1]    
[41] (32.1,43]     (32.1,43]     (32.1,43]     (43,54.1]     (43,54.1]     (43,54.1]     (43,54.1]     (43,54.1]    
[49] (43,54.1]     (43,54.1]    
Levels: (-0.859,10.2] (10.2,21.1] (21.1,32.1] (32.1,43] (43,54.1]
> 

Basic information about a factor
> dim(f)
NULL
> is.factor(f)
[1] TRUE
> is.atomic(f)
[1] TRUE
> is.vector(f)
[1] FALSE
> is.list(f)
[1] FALSE
> is.recursive(f)
[1] FALSE
> length(f)
[1] 24
> names(f)
NULL
> mode(f)
[1] "numeric"
> class(f)
[1] "factor"
> typeof(f)
[1] "integer"
> is.ordered(f)
[1] FALSE
> unclass(f)
 [1] 4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1
attr(,"levels")
[1] "4" "3" "2" "1"
> cat(f)
4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1 4 3 2 1
> print(f)
 [1] 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4
Levels: 4 3 2 1
> str(f)
 Factor w/ 4 levels "4","3","2","1": 4 3 2 1 4 3 2 1 4 3 ...
> dput(f)
structure(c(4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 
3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L), .Label = c("4", 
"3", "2", "1"), class = "factor")
> head(f)
[1] 1 2 3 4 1 2
Levels: 4 3 2 1

Indexing: much like atomic vectors
- [x] selects a factor for the cell/range x
- [[x]] selects a length=1 factor fro the single cell index x (rarely used)
- The $ operator is invalid with actors

Factor arithmetic & Boolean comparisons
- factors cannot be added, multiple, etc.
- same-type factors are equality testable
> x <- sex.f[1] == sex.f[2];x
[1] FALSE
-- order factors can be order compared
> z <- size1.f[1] < size1.f[2]; z
[1] FALSE
Managing the enumeration (levels) > f <- factor(letters[1:3]);f [1] a b c Levels: a b c > levels(f) [1] "a" "b" "c" > levels(f)[1] [1] "a" > any(levels(f) %in% c('a', 'b', 'c')) [1] TRUE # add new levels > levels(f)[length(levels(f))+1] <-'z'; f [1] a b c Levels: a b c z AA > levels(f) <- c(levels(f), 'AA');f [1] a b c Levels: a b c z AA # reorder levels > f <- factor(f, levels(f)[c(4,1:3,5)]);f [1] xx b c Levels: c z xx b BB # change/rename levels > levels(f)[1] <- 'xx';f [1] xx b c Levels: xx b c z BB > levels(f)[levels(f) %in% 'AA'] <- 'BB';f [1] xx b c Levels: xx b c z BB # delete(drop) unused levels > f <- f[drop=TRUE] > f [1] xx b c Levels: c xx b Adding an element to a factor > f <- factor(letters[1:10]); f [1] a b c d e f g h i j Levels: a b c d e f g h i j > f[length(f) + 1] <- 'a'; f [1] a b c d e f g h i j a Levels: a b c d e f g h i j > f <- factor(c(as.character(f), 'zz')); f [1] a b c d e f g h i j a zz Levels: a b c d e f g h i j zz Merging/combining factors > a <- factor(1:10);a [1] 1 2 3 4 5 6 7 8 9 10 Levels: 1 2 3 4 5 6 7 8 9 10 > b <- factor(letters[a]);b [1] a b c d e f g h i j Levels: a b c d e f g h i j > union <- factor(c(as.character(a), as.character(b))); union [1] 1 2 3 4 5 6 7 8 9 10 a b c d e f g h i j Levels: 1 10 2 3 4 5 6 7 8 9 a b c d e f g h i j > cross <- interaction(a,b); cross [1] 1.a 2.b 3.c 4.d 5.e 6.f 7.g 8.h 9.i 10.j 100 Levels: 1.a 2.a 3.a 4.a 5.a 6.a 7.a 8.a 9.a 10.a 1.b 2.b 3.b 4.b 5.b 6.b 7.b 8.b 9.b 10.b 1.c 2.c 3.c 4.c 5.c ... 10.j
Using factors within data frames
df$x <- reorder(df$f, df$x, F, order=T)
by(df$x, df$f, F)
Traps
1 Strings loaded from a file converted to factors (read.table or read.csv stringASFactors=FALSE)
2 Numbers from a file factorised. as.numeric(levels(f))[as.integer(f)]
3 One factor (enumeration) cannot be meaningfully compared with another
4 NA's missing data in factors and levels can cause problems
5 Adding a row to a data frame, which adds a new level to a column factor.

R Basics 6 - Matrices and Arrays

Context

Matices and arrays are an extension on R's atomic vecotrs.
Atomic vectors contain values (not objects).
They hold a contiguous selt of values, all of which are the same basic type. There are six types of atomic vecotr: logical, integer, numeric, complex, caracter and raw.

Importantly: atomic vectors have no dimension attribute.
Matrices and arrays are effectively vectors with a dimension attribute. 
Matrices are two-dimensional(tabular) objects, containing values all of the same type (unlike data frames).
Arrays are multi-dimensional objects(typically with three plus dimensions), with values all of the same type.

Matrix versus data.frame
In a matrix, every column, and every cell is of the same basic atomic type. 
In a data.frame each column can be of a different type(eg. numeric, character, factor). Data frames are best with messy data, and for variables of mixed models.

Matrix creation
## generalCase <- matrix(data=NA, nrow=1, ncol=1, byrow=FALSE, dimnames=NULL)
> M <- matrix(c(2,1,3,4,5,6), nrow=3, byrow=TRUE); M
     [,1] [,2]
[1,]    2    1
[2,]    3    4
[3,]    5    6
> b <- matrix(c(0, -1, 4)); b
     [,1]
[1,]    0
[2,]   -1
[3,]    4
> I <- diag(3); I
     [,1] [,2] [,3]
[1,]    1    0    0
[2,]    0    1    0
[3,]    0    0    1
> D <- diag(c(1,2,3)); D
     [,1] [,2] [,3]
[1,]    1    0    0
[2,]    0    2    0
[3,]    0    0    3
> d <- diag(M); d
[1] 2 4

Basic information about a matrix
> dim(M)
[1] 3 2
> class(M)
[1] "matrix"
> is.matrix(M)
[1] TRUE
> is.array(M)
[1] TRUE
> is.atomic(M)
[1] TRUE
> is.vector(M)
[1] FALSE
> is.list(M)
[1] FALSE
> is.factor(M)
[1] FALSE
> is.recursive(M)
[1] FALSE
> nrow(M)
[1] 3
> ncol(M)
[1] 2
> length(M)
[1] 6
> rownames(M)
NULL
> colnames(M)
NULL

Matrix manipulation
> M <- matrix(c(2,1,3,4,5,6), nrow=3, byrow=TRUE); M
     [,1] [,2]
[1,]    2    1
[2,]    3    4
[3,]    5    6
> N <- matrix(c(6,5,4,3,2,1), nrow=3, byrow=TRUE); N
     [,1] [,2]
[1,]    6    5
[2,]    4    3
[3,]    2    1
> newM <- cbind(M, N); newM
     [,1] [,2] [,3] [,4]
[1,]    2    1    6    5
[2,]    3    4    4    3
[3,]    5    6    2    1
> newM <- rbind(M, N); newM
     [,1] [,2]
[1,]    2    1
[2,]    3    4
[3,]    5    6
[4,]    6    5
[5,]    4    3
[6,]    2    1
> v <- c(M); v
[1] 2 3 5 1 4 6
> df <- data.frame(M); df
  X1 X2
1  2  1
2  3  4
3  5  6

Matrix multiplication
> M
     [,1] [,2]
[1,]    2    1
[2,]    3    4
[3,]    5    6
> N
     [,1] [,2]
[1,]    6    5
[2,]    4    3
[3,]    2    1
> InnerProduct <- M %*% t(N); InnerProduct
     [,1] [,2] [,3]
[1,]   17   11    5
[2,]   38   24   10
[3,]   60   38   16
> OuterProduct <- M %o% N; OuterProduct
, , 1, 1

     [,1] [,2]
[1,]   12    6
[2,]   18   24
[3,]   30   36

, , 2, 1

     [,1] [,2]
[1,]    8    4
[2,]   12   16
[3,]   20   24

, , 3, 1

     [,1] [,2]
[1,]    4    2
[2,]    6    8
[3,]   10   12

, , 1, 2

     [,1] [,2]
[1,]   10    5
[2,]   15   20
[3,]   25   30

, , 2, 2

     [,1] [,2]
[1,]    6    3
[2,]    9   12
[3,]   15   18

, , 3, 2

     [,1] [,2]
[1,]    2    1
[2,]    3    4
[3,]    5    6

> CrossProduct <- crossprod(M, N); CrossProduct
     [,1] [,2]
[1,]   34   24
[2,]   34   23
> M * N
     [,1] [,2]
[1,]   12    5
[2,]   12   12
[3,]   10    6

Matrix maths
> rowMeans(M)
[1] 1.5 3.5 5.5
> colMeans(M)
[1] 3.333333 3.666667
> rowSums(M)
[1]  3  7 11
> colSums(M)
[1] 10 11
> t <- t(M);t
     [,1] [,2] [,3]
[1,]    2    3    5
[2,]    1    4    6
> inverse <- solve(diag(c(1,2,3))); inverse
     [,1] [,2]      [,3]
[1,]    1  0.0 0.0000000
[2,]    0  0.5 0.0000000
[3,]    0  0.0 0.3333333
> e <- eigen(diag(c(1,2,3))); e
$values
[1] 3 2 1

$vectors
     [,1] [,2] [,3]
[1,]    0    0    1
[2,]    0    1    0
[3,]    1    0    0

> d <- det(diag(c(1,2,3))); d
[1] 6

Matrix indexing [row, col] [[row, col]]
# [[ for single cell selection; [ for multi cell selection
# indexed by positive numbers: these ones
# indexed by negative numbers: not these
# indexed by logical atomic vector: in/out
# named rows/cols can be indexed by name
# M[i] or M[[i]] is vector-like indexing
# $ operator is invalid for atomic vectors
# M[r,]
# M[,c]

Arrays
A three dimensional array created in two steps:
> A <- 1:8;A
[1] 1 2 3 4 5 6 7 8
> dim(A) <- c(2,2,2);A
, , 1

     [,1] [,2]
[1,]    1    3
[2,]    2    4

, , 2

     [,1] [,2]
[1,]    5    7
[2,]    6    8
A matrix is a special case of array. Matrices are arrays with two dimensions.
> M <- array(1:9, dim=c(3,3));M
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9

R Basics 5 - Data Frames

Create data frame
- The R way of doing spreadsheets
- Internally, a data.frame is a list of equal length vectors or factors.
- Observations in rows; Variables in cols
empty <-data.frame()
> empty <-data.frame()
> c1 <- 1:10
> c2 <- letters[1:10]
> df <- data.frame(col1=c1, col2=c2)
> df
   col1 col2
1     1    a
2     2    b
3     3    c
4     4    d
5     5    e
6     6    f
7     7    g
8     8    h
9     9    i
10   10    j

Import from and export to file
d2 <- read.csv('fileName.csv', header = TRUE)
library(gdata);
d2 <- read.xls('file.xls')
write.csv(df, file='fileName.csv')
print(xtable(df), type='html')

Basic infomrmation about the data frame
> is.data.frame(df)
[1] TRUE
> class(df)
[1] "data.frame"
> nrow(df)
[1] 10
> ncol(df)
[1] 2
> colnames(df);
[1] "col1" "col2"
> rownames(df);
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

Referencing cells [row, col] [[r, c]]
## [[ for single cell selection;
# [ for multi cell selection;
> vec <- df[[5,2]]; vec
[1] e
Levels: a b c d e f g h i j
> newDF <- df[1:5, 1:2]; newDF
  col1 col2
1    1    a
2    2    b
3    3    c
4    4    d
5    5    e
> df[[2, 'col1']]
[1] 2
> df[3:5, c('col1', 'col2')]
  col1 col2
3    3    c
4    4    d
5    5    e

Referencing rows [r, ]
# returns a data frame ( and not a vecotr! )
> row.1 <- df[1,]; row.1
  col1 col2
1    1    a
> row.n <- df[nrow(df),]; row.n
   col1 col2
10   10    j
> vrow <- as.numeric(as.vector(df[1,])); vrow
[1] 1 1
> vrow <- as.character(as.vector(df[1,])); vrow
[1] "1" "1"

Referencing columns [,c] [d] [[d]] $col
> names(df) <- c('num','cats')
> col.vec <- df$cats; col.vec
 [1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns vector
> col.vec <- df[, 'cats'] ; col.vec
 [1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # a is int or string
> col.vec <- df[ , 2]; col.vec
 [1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns a vector
> col.vec <- df[['cats']]; col.vec
 [1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns 1 col df
> frog.df <- df['cats']
> # returns 1 col df
> first.df <- df[1]; first.df
   num
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
10  10
> first.col <- df[,1]; first.col
 [1]  1  2  3  4  5  6  7  8  9 10
> # returns a vector
> last.col <- df[,ncol(df)]; last.col
 [1] a b c d e f g h i j
Levels: a b c d e f g h i j

Adding rows
# The right way ... (both args are DFs)
df <- rbind(df, data.frame(num=1, cats='A')); df

Adding columns
> df$newCol <- rep(NA, nrow(df)); df
   col1 col2 newCol
1     1    a     NA
2     2    b     NA
3     3    c     NA
4     4    d     NA
5     5    e     NA
6     6    f     NA
7     7    g     NA
8     8    h     NA
9     9    i     NA
10   10    j     NA
> #Copy a column
> df[, 'copyofCol'] <- 1:nrow(df); df
   col1 col2 newCol copyofCol
1     1    a     NA         1
2     2    b     NA         2
3     3    c     NA         3
4     4    d     NA         4
5     5    e     NA         5
6     6    f     NA         6
7     7    g     NA         7
8     8    h     NA         8
9     9    i     NA         9
10   10    j     NA        10
> names(df) <- c('x','cats','newCol','y')
> df$y.percent.pf.x <- df$y/sum(df$x)*100; df
    x cats newCol  y y.percent.pf.x
1   1    a     NA  1       1.818182
2   2    b     NA  2       3.636364
3   3    c     NA  3       5.454545
4   4    d     NA  4       7.272727
5   5    e     NA  5       9.090909
6   6    f     NA  6      10.909091
7   7    g     NA  7      12.727273
8   8    h     NA  8      14.545455
9   9    i     NA  9      16.363636
10 10    j     NA 10      18.181818
> df <-cbind(col=rep('a',nrow(df)), df); df
   col  x cats newCol  y y.percent.pf.x
1    a  1    a     NA  1       1.818182
2    a  2    b     NA  2       3.636364
3    a  3    c     NA  3       5.454545
4    a  4    d     NA  4       7.272727
5    a  5    e     NA  5       9.090909
6    a  6    f     NA  6      10.909091
7    a  7    g     NA  7      12.727273
8    a  8    h     NA  8      14.545455
9    a  9    i     NA  9      16.363636
10   a 10    j     NA 10      18.181818
> df <- cbind(df,col=rep('b',nrow(df))); df
   col  x cats newCol  y y.percent.pf.x col
1    a  1    a     NA  1       1.818182   b
2    a  2    b     NA  2       3.636364   b
3    a  3    c     NA  3       5.454545   b
4    a  4    d     NA  4       7.272727   b
5    a  5    e     NA  5       9.090909   b
6    a  6    f     NA  6      10.909091   b
7    a  7    g     NA  7      12.727273   b
8    a  8    h     NA  8      14.545455   b
9    a  9    i     NA  9      16.363636   b
10   a 10    j     NA 10      18.181818   b
> df$c3 <- with(df, col3 <- x*y); df
   col  x cats newCol  y y.percent.pf.x col  c3
1    a  1    a     NA  1       1.818182   b   1
2    a  2    b     NA  2       3.636364   b   4
3    a  3    c     NA  3       5.454545   b   9
4    a  4    d     NA  4       7.272727   b  16
5    a  5    e     NA  5       9.090909   b  25
6    a  6    f     NA  6      10.909091   b  36
7    a  7    g     NA  7      12.727273   b  49
8    a  8    h     NA  8      14.545455   b  64
9    a  9    i     NA  9      16.363636   b  81
10   a 10    j     NA 10      18.181818   b 100
> transform(df, col4 <- x+y)
   col  x cats newCol  y y.percent.pf.x col  c3
1    a  1    a     NA  1       1.818182   b   1
2    a  2    b     NA  2       3.636364   b   4
3    a  3    c     NA  3       5.454545   b   9
4    a  4    d     NA  4       7.272727   b  16
5    a  5    e     NA  5       9.090909   b  25
6    a  6    f     NA  6      10.909091   b  36
7    a  7    g     NA  7      12.727273   b  49
8    a  8    h     NA  8      14.545455   b  64
9    a  9    i     NA  9      16.363636   b  81
10   a 10    j     NA 10      18.181818   b 100

Set column names # same for rownames()
> colnames(df) <- c('date', 'alpha', 'beta'); df
   date alpha beta NA NA        NA NA  NA
1     a     1    a NA  1  1.818182  b   1
2     a     2    b NA  2  3.636364  b   4
3     a     3    c NA  3  5.454545  b   9
4     a     4    d NA  4  7.272727  b  16
5     a     5    e NA  5  9.090909  b  25
6     a     6    f NA  6 10.909091  b  36
7     a     7    g NA  7 12.727273  b  49
8     a     8    h NA  8 14.545455  b  64
9     a     9    i NA  9 16.363636  b  81
10    a    10    j NA 10 18.181818  b 100
> colnames(df)[1] <- 'new.name'; df
   new.name alpha beta NA NA        NA NA  NA
1         a     1    a NA  1  1.818182  b   1
2         a     2    b NA  2  3.636364  b   4
3         a     3    c NA  3  5.454545  b   9
4         a     4    d NA  4  7.272727  b  16
5         a     5    e NA  5  9.090909  b  25
6         a     6    f NA  6 10.909091  b  36
7         a     7    g NA  7 12.727273  b  49
8         a     8    h NA  8 14.545455  b  64
9         a     9    i NA  9 16.363636  b  81
10        a    10    j NA 10 18.181818  b 100
> colnames(df)[colnames(df) %in% c('a', 'b')] <- c('x', 'y'); df
   new.name alpha beta NA NA        NA NA  NA
1         a     1    a NA  1  1.818182  b   1
2         a     2    b NA  2  3.636364  b   4
3         a     3    c NA  3  5.454545  b   9
4         a     4    d NA  4  7.272727  b  16
5         a     5    e NA  5  9.090909  b  25
6         a     6    f NA  6 10.909091  b  36
7         a     7    g NA  7 12.727273  b  49
8         a     8    h NA  8 14.545455  b  64
9         a     9    i NA  9 16.363636  b  81
10        a    10    j NA 10 18.181818  b 100

Selecting Multiple Rows
> firstTenRows <- df[1:10,]; firstTenRows
   new.name alpha beta NA NA        NA NA  NA
1         a     1    a NA  1  1.818182  b   1
2         a     2    b NA  2  3.636364  b   4
3         a     3    c NA  3  5.454545  b   9
4         a     4    d NA  4  7.272727  b  16
5         a     5    e NA  5  9.090909  b  25
6         a     6    f NA  6 10.909091  b  36
7         a     7    g NA  7 12.727273  b  49
8         a     8    h NA  8 14.545455  b  64
9         a     9    i NA  9 16.363636  b  81
10        a    10    j NA 10 18.181818  b 100
> everthingButRowTwo <- df[-2,]; everthingButRowTwo
   new.name alpha beta NA NA        NA NA  NA
1         a     1    a NA  1  1.818182  b   1
3         a     3    c NA  3  5.454545  b   9
4         a     4    d NA  4  7.272727  b  16
5         a     5    e NA  5  9.090909  b  25
6         a     6    f NA  6 10.909091  b  36
7         a     7    g NA  7 12.727273  b  49
8         a     8    h NA  8 14.545455  b  64
9         a     9    i NA  9 16.363636  b  81
10        a    10    j NA 10 18.181818  b 100
> sub <- df[(df$x >5 & y<5), ]; sub
[1] new.name alpha    beta     <NA>     <NA>     <NA>     <NA>     <NA>  
<0 rows> (or 0-length row.names)
> sub <- subset(df, x>5 & y<5); sub
[1] new.name alpha    beta     <NA>     NA.1     NA.2     NA.3     NA.4  
<0 rows> (or 0-length row.names)
> notLastRow <- head(df, -1); notLastRow
  new.name alpha beta NA NA        NA NA NA
1        a     1    a NA  1  1.818182  b  1
2        a     2    b NA  2  3.636364  b  4
3        a     3    c NA  3  5.454545  b  9
4        a     4    d NA  4  7.272727  b 16
5        a     5    e NA  5  9.090909  b 25
6        a     6    f NA  6 10.909091  b 36
7        a     7    g NA  7 12.727273  b 49
8        a     8    h NA  8 14.545455  b 64
9        a     9    i NA  9 16.363636  b 81
> df[-nrow(df),]
  new.name alpha beta NA NA        NA NA NA
1        a     1    a NA  1  1.818182  b  1
2        a     2    b NA  2  3.636364  b  4
3        a     3    c NA  3  5.454545  b  9
4        a     4    d NA  4  7.272727  b 16
5        a     5    e NA  5  9.090909  b 25
6        a     6    f NA  6 10.909091  b 36
7        a     7    g NA  7 12.727273  b 49
8        a     8    h NA  8 14.545455  b 64
9        a     9    i NA  9 16.363636  b 81

Selecting multiple columns
> df <- df[,c(1,2,3,4,5)]; df
   col  x cats newCol  y
1    a  1    a     NA  1
2    a  2    b     NA  2
3    a  3    c     NA  3
4    a  4    d     NA  4
5    a  5    e     NA  5
6    a  6    f     NA  6
7    a  7    g     NA  7
8    a  8    h     NA  8
9    a  9    i     NA  9
10   a 10    j     NA 10
> names(df) <- c('col1', 'col2', 'col3')
> df <- df[,c('col1','col2')];df
   col1 col2
1     a    1
2     a    2
3     a    3
4     a    4
5     a    5
6     a    6
7     a    7
8     a    8
9     a    9
10    a   10
df <- df[,-1]; df
# drop col1 and col3
df <- df[,-c(1,3)]
  could not find function "colnmaes"
> df <- df[,!(colnames(df) %in% c('notThis','norThis'))]
> df
   col1 col2
1     a    1
2     a    2
3     a    3
4     a    4
5     a    5
6     a    6
7     a    7
8     a    8
9     a    9
10    a   10

Replace column elements by row selection
> df
   col1 col2
1     a    1
2     a    2
3     a    3
4     a    4
5     a    5
6     a    6
7     a    7
8     a    8
9     a    9
10    a   10
> df[df$col31 == 'a', 'col2'] <- 1
> df
   col1 col2
1     a    1
2     a    2
3     a    3
4     a    4
5     a    5
6     a    6
7     a    7
8     a    8
9     a    9
10    a   10
> df[df$col1 == 'a', 'col2'] <- 1
> df
   col1 col2
1     a    1
2     a    1
3     a    1
4     a    1
5     a    1
6     a    1
7     a    1
8     a    1
9     a    1
10    a    1

Missing data(NA)
# detect anywhere in df
> any(is.na(df))
[1] TRUE
> # anywhere in col
> any(is.na(df$newCol))
[1] FALSE
> # deleting selecting missing row
> df2 <- df[!is.na(df$newCol),]; df2
   col1 col2 newCol col
1     a   NA      0   0
2     a   NA      0   0
3     a   NA      0   0
4     a   NA      0   0
5     a   NA      0   0
6     a   NA      0   0
7     a   NA      0   0
8     a   NA      0   0
9     a   NA      0   0
10    a   NA      0   0
> # replacing NAs with somthing else
> df[is.na(df)] <- 0; df
   col1 col2 newCol col
1     a    0      0   0
2     a    0      0   0
3     a    0      0   0
4     a    0      0   0
5     a    0      0   0
6     a    0      0   0
7     a    0      0   0
8     a    0      0   0
9     a    0      0   0
10    a    0      0   0
> df$col[is.na(df$col2)] <- 0; df
   col1 col2 newCol col
1     a    0      0   0
2     a    0      0   0
3     a    0      0   0
4     a    0      0   0
5     a    0      0   0
6     a    0      0   0
7     a    0      0   0
8     a    0      0   0
9     a    0      0   0
10    a    0      0   0
> df$col2 <- ifelse(is.na(df$col2), 0, df$col); df
   col1 col2 newCol col
1     a    0      0   0
2     a    0      0   0
3     a    0      0   0
4     a    0      0   0
5     a    0      0   0
6     a    0      0   0
7     a    0      0   0
8     a    0      0   0
9     a    0      0   0
10    a    0      0   0
df <- orig[!is.na(orig$series), c('Date, series')]

Traps
1 for loops on possibly empty df's, use: for( in in seq_len(nrow(df))
2 columns coerced to factors, avoid with the argument stringsAsFactor=FALSE
3 confusing row numbers and rows with numbered names(hint: avoid row names)
4 although rbind() accepts vectors and lists; this can fail with factor cols

Wednesday, November 4, 2015

R Basics 4 - Lists

Context: R has two types of vector

Atomic vectors contain values
These values are all of the same type.
They are arranged contiguously.
Atomic vectors cannot contain objects. 
There are six types of atomic vector: raw, logical, integer, numeic, complex and character.

Recursive vectors contain objects
R has two types of recursive vector: : list, expression.

Lists
- At top level: 1-dim indexed object that contains objects (not values)
- Indexed from 1 to length(list)
- Contents can be of different types
- Lists can contain the NULL object
- Deeply nested listed of lists possible
- Can be arbitrarily extended (not fixed)

List creation: usually using list()
> l1 <- list('cat', 5, 1:10, FALSE);l1
[[1]]
[1] "cat"

[[2]]
[1] 5

[[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[4]]
[1] FALSE

> l2 <- list ( x='dog', y=5+2i, z=3:8 );l2
$x
[1] "dog"

$y
[1] 5+2i

$z
[1] 3 4 5 6 7 8

> l3 <- c(l1, l2);l3
[[1]]
[1] "cat"

[[2]]
[1] 5

[[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[4]]
[1] FALSE

$x
[1] "dog"

$y
[1] 5+2i

$z
[1] 3 4 5 6 7 8

> l4 <- list(l1, l2);l4
[[1]]
[[1]][[1]]
[1] "cat"

[[1]][[2]]
[1] 5

[[1]][[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[1]][[4]]
[1] FALSE


[[2]]
[[2]]$x
[1] "dog"

[[2]]$y
[1] 5+2i

[[2]]$z
[1] 3 4 5 6 7 8


> l5 <- as.list( c(1,2,3));l5
[[1]]
[1] 1

[[2]]
[1] 2

[[3]]
[1] 3

> origL <- l4
> inserVorL <- l5
> position <- 3
> l6 <- append(origL, inserVorL, position);l6
[[1]]
[[1]][[1]]
[1] "cat"

[[1]][[2]]
[1] 5

[[1]][[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[1]][[4]]
[1] FALSE


[[2]]
[[2]]$x
[1] "dog"

[[2]]$y
[1] 5+2i

[[2]]$z
[1] 3 4 5 6 7 8


[[3]]
[1] 1

[[4]]
[1] 2

[[5]]
[1] 3

Basic information about lists
> dim(l)
NULL
> is.list(l)
[1] TRUE
> is.vector(l)
[1] TRUE
> is.recursive(l)
[1] TRUE
> is.atomic(l)
[1] FALSE
> is.factor(l)
[1] FALSE
> length(l)
[1] 5
> names(l)
NULL
> mode(l)
[1] "list"
> class(l)
[1] "list"
> typeof(l)
[1] "list"
> attributes(l)
NULL

The contents of a list
> print(l)
[[1]]
[[1]][[1]]
[1] "cat"

[[1]][[2]]
[1] 5

[[1]][[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[1]][[4]]
[1] FALSE


[[2]]
[[2]]$x
[1] "dog"

[[2]]$y
[1] 5+2i

[[2]]$z
[1] 3 4 5 6 7 8


[[3]]
[1] 1

[[4]]
[1] 2

[[5]]
[1] 3

> str(l)
List of 5
 $ :List of 4
  ..$ : chr "cat"
  ..$ : num 5
  ..$ : int [1:10] 1 2 3 4 5 6 7 8 9 10
  ..$ : logi FALSE
 $ :List of 3
  ..$ x: chr "dog"
  ..$ y: cplx 5+2i
  ..$ z: int [1:6] 3 4 5 6 7 8
 $ : num 1
 $ : num 2
 $ : num 3
> dput(l)
list(list("cat", 5, 1:10, FALSE), structure(list(x = "dog", y = 5+2i, 
    z = 3:8), .Names = c("x", "y", "z")), 1, 2, 3)
> head(l)
[[1]]
[[1]][[1]]
[1] "cat"

[[1]][[2]]
[1] 5

[[1]][[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[1]][[4]]
[1] FALSE


[[2]]
[[2]]$x
[1] "dog"

[[2]]$y
[1] 5+2i

[[2]]$z
[1] 3 4 5 6 7 8


[[3]]
[1] 1

[[4]]
[1] 2

[[5]]
[1] 3

> tail(l)
[[1]]
[[1]][[1]]
[1] "cat"

[[1]][[2]]
[1] 5

[[1]][[3]]
 [1]  1  2  3  4  5  6  7  8  9 10

[[1]][[4]]
[1] FALSE


[[2]]
[[2]]$x
[1] "dog"

[[2]]$y
[1] 5+2i

[[2]]$z
[1] 3 4 5 6 7 8


[[3]]
[1] 1

[[4]]
[1] 2

[[5]]
[1] 3

Trap: cat(x) does not work with lists

Indexing [ versus [[ versus $
- Use [ to get/set multiple items at once
Note: [ always returns a list
- Use [[ and $ to get/set a specific item
- $ only works with named list items
all same: $name $"name" $'name' $`name`
- indexed by positive numbers: these ones
- indexed by negative numbers: not these
- indexed by logical atomic vector: in/out
an empty index l[] returns the list
Tip: When using lists, most of the time you wnat ot index with [[ or $; and avoid [

Indexing examples: one-dimension get
> j <- list(a='cat', b=5, c=FALSE)
> x <- j$a;x
[1] "cat"
> x <- j[['a']];x
[1] "cat"
> x <- j['a'];x
$a
[1] "cat"
> x <- j[[1]];x
[1] "cat"
> x <- j[1];x
$a
[1] "cat"

Indexing examples: set operations
- Start with example data
l <- list(x='a', y='b', z='c', t='d')
- Next use [[ or $ because specific selection
> l[[6]] <- 'new';
> names(l)[5] <- 'w'
> l$w <- 'new-W'
> l[['w']] <- 'dog'
- Change named values: note order ignored
> l[names(l) %in% c('t', 'x')] <- c(1,2)
> l
$x
[1] 1

$y
[1] "b"

$z
[1] "c"

$t
[1] 2

$w
[1] "dog"

[[6]]
[1] "new"

Indexing example: multi-dimension get
- Indexing evaluated from left to right
- Let's start with some example data...
> i <- c('aa', 'bb', 'cc')
> j <- list(a='cat', b=5, c=FALSE)
> k <- list(i, j);k
[[1]]
[1] "aa" "bb" "cc"

[[2]]
[[2]]$a
[1] "cat"

[[2]]$b
[1] 5

[[2]]$c
[1] FALSE


> k[[1]]
[1] "aa" "bb" "cc"
> k[[2]]
$a
[1] "cat"

$b
[1] 5

$c
[1] FALSE

> k[1]
[[1]]
[1] "aa" "bb" "cc"

> k[2]
[[1]]
[[1]]$a
[1] "cat"

[[1]]$b
[1] 5

[[1]]$c
[1] FALSE

> x <- k[[1]][[1]];x
[1] "aa"
> x <- k[[1]][[2]];x
[1] "bb"
> x <- k[1][1][1][1][1];x
[[1]]
[1] "aa" "bb" "cc"

> x <- k[1][2];x
[[1]]
NULL

> x <- k[[2]][1];x
$a
[1] "cat"

List manipulation
1 Arithmetic operators cannot be applied to lists (as content types can vary)
2 Use the apply() functions to apply a function of each element in a list:
> x <- list(a=1, b=month.abb, c=letters)
> lapply(x, FUN=length)
$a
[1] 1

$b
[1] 12

$c
[1] 26

> sapply(x, FUN=length)
 a  b  c 
 1 12 26 
y <- list(a=1, b=3, c=3, c=4)
sapply(y, FUN=function=(x,p) x^p, p=2)
sapply(y, FUN=function=(x,p) x^p, p=2:3)
3 Use unlist to convert list ot vector
> unlist(x)
    a    b1    b2    b3    b4    b5    b6    b7    b8    b9   b10   b11   b12    c1    c2    c3    c4    c5    c6    c7 
  "1" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"   "a"   "b"   "c"   "d"   "e"   "f"   "g" 
   c8    c9   c10   c11   c12   c13   c14   c15   c16   c17   c18   c19   c20   c21   c22   c23   c24   c25   c26 
  "h"   "i"   "j"   "k"   "l"   "m"   "n"   "o"   "p"   "q"   "r"   "s"   "t"   "u"   "v"   "w"   "x"   "y"   "z" 

unlist wont unlist non-atomic 
4 Remove NULL objects from a list
> z <- list(a=1:9, b=letters, c=NULL)
> zNoNull <- Filter(Negate(is.null), z)
> zNoNull
$a
[1] 1 2 3 4 5 6 7 8 9

$b
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"

5 Use named lists to return multiple values

6 Factor index treated as integer
decode with v[as.character(f)]

Blog Archive