Thursday, March 6, 2014

data.table in R

require(plyr)
require(data.table)

# Making a data.table from vectors
# Each named argument becomes one column; both columns hold the integers 1..5.
dt.data <- data.table(a = seq_len(5), b = seq_len(5))

# Clear the workspace, switch to the data directory, and load the CSV.
# NOTE: rm() removes every object in the calling environment (hidden ones
# included), so anything created earlier in the session is gone after this.
rm(list = ls(all.names = TRUE))
setwd("DSP data")
# First column is read as character, the remaining four as integer.
data <- fread(
  "out1000.csv",
  colClasses = c("character", rep("integer", 4))
)
# Rename the five columns in place (by reference).
setnames(data, c("user_id", "date", "hour", "click", "action"))

# regular expression
# Pull the first run of digits out of each date value and convert it to
# integer. Base regexpr()/regmatches() are used so no extra package is needed
# (the previous str_extract() call came from stringr, which is never loaded).
regexp <- "([[:digit:]]+)"
date_text <- as.character(data$date)
first_hit <- regexpr(regexp, date_text)
# Entries with no match (or NA input) stay NA, matching str_extract semantics.
digits <- rep(NA_character_, length(date_text))
has_hit <- !is.na(first_hit) & first_hit != -1L
digits[has_hit] <- regmatches(date_text, first_hit)
month <- as.integer(digits)

# automatically sort by key
# Setting the key reorders `data` by reference on user_id, then date, then hour.
setkeyv(data, c("user_id", "date", "hour"))

# add cols
# `:=` adds or overwrites columns by reference (no copy of the table).
# Row id 1..nrow; seq_len(.N) stays correct for a zero-row table, where
# 1:length(...) would produce c(1, 0).
data[, id := seq_len(.N)]
# Factor version of user_id; columns are referenced directly inside `[`.
data[, bcook := as.factor(user_id)]
# NOTE(review): `hflights` is not created or loaded in the visible code
# (presumably the data set from the hflights package) - confirm it is available.
hflights_dt <- data.table(hflights)
# Three ways to add derived columns: `:=` for one column, `$<-` assignment,
# and `:=` with a character vector of names plus a list of values.
# NOTE(review): the factors assume Distance is in miles - confirm.
hflights_dt[, DistanceKMs := Distance / 0.62137]
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
hflights_dt[, c('DistanceKMs', 'DistanceFeets') := list(Distance / 0.62137, Distance * 5280)]

# filter rows
# Distinct users that have at least one row on the three listed dates.
users <- unique(data$user_id[data$date %in% c(16103, 16104, 16105)])

# Random sample of row numbers, sized at 0.01% of n.
# NOTE(review): `n` and `results` are not defined in the visible code
# (presumably n is the row count of `results`) - confirm.
rows <- sample(1:n, n * .0001)
results[, rowid := 1:n]
samples <- results[results$rowid %in% rows]

# All rows belonging to the first user found above.
test <- data[user_id == users[1], ]

# filter cols
# Select columns 2 through 6 by position; with = FALSE makes j a column selector.
test <- data[, c(2, 3, 4, 5, 6), with = FALSE]
# drop cols: assigning NULL with `:=` removes ScoreLevel by reference.
# NOTE(review): `train3` is not defined in the visible code.
train3 <- train3[, ScoreLevel := NULL]

# Basic Stat
# Per-date row count, click total and action total, computed twice and timed
# with system.time(): first with plyr::ddply, then with data.table. The second
# result overwrites `stats`.
system.time(
  # `data` has no `imp` column, so rows are counted via an existing column.
  stats <- ddply(data, ~ date,
                 summarize, imp = length(click), click = sum(click), action = sum(action))
)
system.time(
  # No trailing `;` here: a semicolon inside a call's parentheses is a parse error.
  stats <- data[, list(imps = .N,
                       clicks = sum(click),
                       actions = sum(action)), by = 'date']
)

#Drop outliers
# For columns 2..7, compute the 95th percentile on the full data first, then
# drop every row that exceeds the cutoff in any of those columns.
data2 <- read.csv(file = "data_extracted_filtered.txt", header = TRUE, sep = ",",
                  row.names = NULL, as.is = TRUE)

# Cutoffs must exist before the filter loop runs. The vector keeps the name
# `quantile`; the function is therefore called as stats::quantile() to make
# clear which one is meant. stats::quantile() stops on NA values (na.rm is
# left at its default), so missing data fails loudly rather than silently.
quantile <- numeric(6)
for (i in 2:7) {
  print('----------------------------------------------------------')
  print(i)
  quantile[i - 1] <- stats::quantile(data2[, i], 0.95, names = FALSE)
}

# Apply the cutoffs one column at a time; each pass narrows data2 further.
for (i in 2:7) {
  print('----------------------------------------------------------')
  print(i)
  data2 <- data2[data2[, i] <= quantile[i - 1], ]
}

# Apply same function to all cols
# Log-scale a numeric vector: log(x + 1) divided by the log of the vector's
# maximum. Returns a vector the same length as x.
# NOTE(review): the denominator is log(max(x)), not log(max(x) + 1), so the
# largest element maps to slightly above 1 - confirm this is intended.
func1 <- function(x) {
  log_peak <- log(max(x))
  log(x + 1) / log_peak
}
# Build a data.table from data2 and remove its id column.
data3 <- data.table(data2)
data3 <- data3[, id := NULL]
# Overwrite columns 1..6 in place: .SD holds the columns named by .SDcols,
# and lapply runs func1 on each of them.
data3[
  ,
  1:6 := lapply(.SD, func1),
  .SDcols = 1:6
]

# Basic Merge
# Key both long tables on the same two columns, then merge on those columns.
# NOTE(review): top10thisyearlong / top10lastyearlong are not defined in the
# visible code.
keycols <- c("start_dt", "variable")
setkeyv(top10thisyearlong, keycols)
setkeyv(top10lastyearlong, keycols)
top10all <- merge(
  top10thisyearlong,
  top10lastyearlong,
  by = keycols
)

# compute freq1, and add a new column with name of 'freq1' to the original table
# Distinct users that have at least one row on the three listed dates.
users <- unique(
  data$user_id[data$date %in% c(16103, 16104, 16105)]
)
# Re-key `data` on bcook so it can be joined on that column.
setkey(data, bcook)
# For each row of the global `data`, count earlier rows of `dd` (smaller id,
# same bcook) that lie less than 24 gap units before it.
#
# dd: a data.table with bcook, id, date and hour columns; it is keyed on
#     bcook by reference as a side effect.
# Returns: a numeric vector - a leading 0 followed by one count per id.x group.
# NOTE(review): the gap formula treats `date` as a day number and `hour` as
# hour-of-day, giving a gap in hours - confirm against the data.
f_freq <- function(dd) {
  setkey(dd, bcook)
  # Pair every row of `data` with every row of `dd` sharing the same bcook;
  # shared column names get .x (data) and .y (dd) suffixes.
  pairs <- merge(data, dd, by = c('bcook'))
  # Keep only pairs where the `data` row comes after the `dd` row.
  pairs <- pairs[id.x > id.y, ]
  day_part <- (pairs$date.x - pairs$date.y) * 24
  hour_part <- pairs$hour.x - pairs$hour.y
  pairs$gap <- day_part + hour_part
  freq <- pairs[, list(freq1d = sum(gap < 24)), by = id.x]
  c(0, freq$freq1d)
}

# Write a per-column click-through report to the file "myfile".
# Output is diverted with sink(); the tryCatch/finally guarantees the
# diversion is removed even if a step inside fails, so the console is never
# left silently redirected.
sink("myfile", append = FALSE, split = FALSE)
tryCatch({
  # Recode HasClick == -1 to 0 (by reference) so sum(HasClick) counts clicks.
  train1[HasClick == -1, HasClick := 0]
  idx <- names(train1)
  # NOTE(review): the loop is fixed at the first 36 columns - confirm train1
  # has at least that many.
  for (i in 1:36) {
    print('----------------------------------------------------------')
    print(i)
    # Group by the i-th column; report rows, clicks and CTR, highest CTR first.
    print(train1[, list(Impressions = .N,
                        Clicks = sum(HasClick),
                        CTR = sum(HasClick) / .N),
                 by = eval(idx[i])][order(-CTR)])
  }
}, finally = {
  # return output to the terminal
  sink()
})

## Merge multiple data frames
# Fold the list left to right, merging two frames at a time with all = TRUE so
# rows without a match on either side are kept.
# NOTE(review): dat1..dat7 are not defined in the visible code.
list.of.data.frames <- list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
merged.data.frame <- Reduce(
  function(left, right) merge(left, right, all = TRUE),
  list.of.data.frames
)
tail(merged.data.frame)

No comments:

Post a Comment