require(data.table)
# Making a data.table from vectors
dt.data <- data.table(a = 1:5, b = 1:5)

# Start from a clean workspace. NOTE: this also removes `dt.data` created just
# above, together with every other object in the calling environment.
rm(list = ls(all.names = TRUE))
# The input files read below are addressed relative to this directory.
setwd("DSP data")

# Load the log file: the first column is read as character, the remaining four
# as integers, and the columns are then given their working names.
data <- fread(
  "out1000.csv",
  colClasses = c("character", "integer", "integer", "integer", "integer")
)
setnames(data, c("user_id", "date", "hour", "click", "action"))

# automatically sort by key
# setkey() reorders the rows of `data` by reference, so it is done before any
# row-aligned vector is derived from the table.
setkey(data, user_id, date, hour)

# regular expression
# `month` holds the first run of digits found in each `date` value, as an
# integer (NA where there is no digit run). Base regex functions are used
# because only data.table is attached by this script.
# NOTE(review): for a purely numeric `date` this returns the whole value, not
# a month -- confirm the intended pattern.
regexp <- "([[:digit:]]+)"
month <- local({
  date_text <- as.character(data$date)
  digit_match <- regexpr(regexp, date_text)
  has_digits <- !is.na(digit_match) & digit_match > 0
  extracted <- rep(NA_integer_, length(date_text))
  extracted[has_digits] <- as.integer(regmatches(date_text, digit_match))
  extracted
})

# add cols
# `.I` is the row number, which stays valid for a zero-row table
# (unlike 1:length(...), which yields c(1, 0)).
data[, id := .I]
data[, bcook := as.factor(user_id)]
# Three equivalent ways of adding derived columns to a data.table, shown on
# the `hflights` data set (expected to be available in the session).
hflights_dt <- data.table(hflights)

# 1) Add one column by reference, using the functional form of `:=`.
hflights_dt[, `:=`(DistanceKMs = Distance / 0.62137)]

# 2) Base-R style replacement of the same column.
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137

# 3) Add several columns by reference in a single call.
hflights_dt[, `:=`(
  DistanceKMs = Distance / 0.62137,
  DistanceFeets = Distance * 5280
)]
# Filter rows
# Distinct user ids that appear on any of the three listed dates.
users <- with(data, unique(user_id[date %in% c(16103, 16104, 16105)]))

# Random sample of rows: draw row numbers, tag every row of `results` with its
# position, then keep the rows whose position was drawn.
# (`n` and `results` are expected to exist already.)
rows <- sample(1:n, n * 0.0001)
results[, rowid := 1:n]
samples <- results[results[["rowid"]] %in% rows]

# All rows that belong to the first user found above.
test <- data[user_id == users[1]]

# filter cols
# Keep columns 2 to 6 by position (this overwrites `test`).
test <- data[, 2:6, with = FALSE]

# Drop a column by reference.
train3 <- train3[, `:=`(ScoreLevel = NULL)]
# Basic Stat
# Per-date totals of impressions, clicks and actions, computed twice so the
# run times of plyr and data.table can be compared. Both results are stored in
# `stats`, so the data.table result replaces the plyr one.

# plyr version. An impression is one row, so the count is taken as the number
# of `click` values per date (the table has no `imp` column to measure).
# The functions are namespace-qualified because plyr is not attached here.
system.time(
  stats <- plyr::ddply(
    data, ~ date, plyr::summarize,
    imp = length(click), click = sum(click), action = sum(action)
  )
)

# data.table version: `.N` is the number of rows in each `date` group.
# No `;` may appear inside the call parentheses -- it is a syntax error.
system.time(
  stats <- data[
    , list(imps = .N, clicks = sum(click), actions = sum(action)),
    by = "date"
  ]
)
# Drop outliers
# Reads the extracted data set and removes, for each of columns 2 to 7, the
# rows whose value lies above that column's 95th percentile.
data2 <- read.csv(
  file = "data_extracted_filtered.txt", header = TRUE, sep = ",",
  row.names = NULL, as.is = TRUE
)

# Step 1: compute one upper cutoff per column on the unfiltered data. This has
# to happen before any row is dropped; slot i - 1 holds the cutoff of column i.
cutoffs <- rep(0, 6)
for (i in 2:7) {
  print('----------------------------------------------------------')
  print(i)
  cutoffs[i - 1] <- stats::quantile(
    data2[, i], probs = 0.95, na.rm = TRUE, names = FALSE
  )
}

# Step 2: apply the cutoffs one column at a time. Rows with a missing value in
# the column are kept (a missing value is not treated as an outlier), which
# also avoids the all-NA rows that logical indexing with NA would create.
for (i in 2:7) {
  print('----------------------------------------------------------')
  print(i)
  col_values <- data2[, i]
  data2 <- data2[is.na(col_values) | col_values <= cutoffs[i - 1], ]
}
# Apply same function to all cols
# func1(x): log-scale a numeric vector.
# Returns log(x + 1) divided by log(max(x)), element by element, so the
# result has the same length as `x`.
# NOTE(review): the numerator is shifted by 1 but the denominator is not, so
# the largest element maps to slightly more than 1 and max(x) == 1 divides by
# zero -- confirm whether log(max(x) + 1) was intended.
func1 <- function(x) {
  peak <- max(x)
  shifted_log <- log(x + 1)
  shifted_log / log(peak)
}
# Convert the filtered data frame to a data.table and drop its `id` column in
# the same expression.
data3 <- data.table(data2)[, id := NULL]
# Overwrite the first six columns, by reference, with func1 applied to each of
# them (.SD is restricted to those six columns through .SDcols).
data3[, (1:6) := lapply(.SD, func1), .SDcols = 1:6]
# Join the two long-format top-10 tables on their shared key columns.
keycols <- c("start_dt", "variable")
setkeyv(top10thisyearlong, keycols)
setkeyv(top10lastyearlong, keycols)
top10all <- merge(top10thisyearlong, top10lastyearlong, by = keycols)

# Distinct user ids seen on any of the three listed dates.
users <- with(data, unique(user_id[date %in% c(16103, 16104, 16105)]))

# Re-key the log table on the factor version of the user id.
setkeyv(data, "bcook")
# f_freq(dd): frequency of recent earlier rows per row of the global `data`.
#
# Joins the global `data` table to `dd` on `bcook`, keeps the pairs in which
# the `data` row has a larger `id` than the `dd` row, and counts, for each
# `data` row id, how many of those pairs are less than 24 apart when
# measured as (date difference) * 24 + (hour difference).
# NOTE(review): this reads as a gap in hours, which presumes `date` is a day
# count -- confirm.
#
# Args:
#   dd: a data.table with at least the columns bcook, id, date and hour.
#       It is keyed on `bcook` by reference, so the caller's table is
#       re-sorted as a side effect.
# Returns:
#   A numeric vector: a leading 0 followed by the per-id counts, in the order
#   the ids appear in the grouped result.
f_freq <- function(dd) {
  setkey(dd, bcook)
  pairs <- merge(data, dd, by = "bcook")
  earlier <- pairs[id.x > id.y]
  earlier[, gap := (date.x - date.y) * 24 + (hour.x - hour.y)]
  per_row <- earlier[, list(freq1d = sum(gap < 24)), by = id.x]
  c(0, per_row$freq1d)
}
# Write a per-column click-through report to the file "myfile".
# Console output is diverted with sink(); the diversion is undone in `finally`
# so that an error inside the loop cannot leave output redirected.
sink("myfile", append = FALSE, split = FALSE)
tryCatch(
  {
    # Recode the -1 marker in HasClick to 0 so the column can be summed.
    train1[HasClick == -1, HasClick := 0]
    idx <- names(train1)
    # Report on at most the first 36 columns; the bound is capped at the real
    # column count so a narrower table does not index past the end of `idx`.
    for (i in seq_len(min(36L, length(idx)))) {
      print('----------------------------------------------------------')
      print(i)
      # Impressions, clicks and CTR per level of column i, highest CTR first.
      print(
        train1[
          ,
          list(
            Impressions = .N,
            Clicks = sum(HasClick),
            CTR = sum(HasClick) / .N
          ),
          by = eval(idx[i])
        ][order(-CTR)]
      )
    }
  },
  # return output to the terminal
  finally = sink()
)
## Merge multiple data frames
# Fold the list left to right with a full outer merge (all = TRUE keeps the
# unmatched rows from both sides at every step). TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
list.of.data.frames <- list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
merged.data.frame <- Reduce(
  function(left, right) merge(left, right, all = TRUE),
  list.of.data.frames
)
tail(merged.data.frame)
# (Footer text from the blog page this snippet was copied from; not R code.)
# No comments:
# Post a Comment