Thursday, December 10, 2015

Master R 3 - Filter and Summarize Data

# Drop needless data
library(sqldf)
sqldf("select * from mtcars where am=1 and vs=1")
sqldf("select * from mtcars limit 10")
subset(mtcars, am==1 & vs ==1)
identical(
  sqldf("select * from mtcars where am=1 and vs=1", row.names = TRUE),
  subset(mtcars, am==1 & vs==1)
          )
subset(mtcars, am ==1 & vs ==1, select = hp:wt)
subset(mtcars, am ==1 & vs ==1, select = -c(hp,wt))

# Drop needless data in an efficient way
library(hflights)
system.time(sqldf("select * from hflights where Dest=='BNA'", row.names = TRUE))
system.time(subset(hflights, Dest == 'BNA'))

install.packages("dplyr")
library(dplyr)
system.time(filter(hflights, Dest == 'BNA'))

# Drop needless data in another efficient way
library(data.table)
hflights_dt <- data.table(hflights)
hflights_dt[, rownames := rownames(hflights)]
system.time(hflights_dt[Dest == 'BNA'])

/*
  DT[i, j, ..., drop = TRUE]
  DT[where, select | update, group by][having][order by][]...[]
  */
str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
tail(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
hflights_dt[Dest=='BNA', .(DepTime, ArrTime)]
hflights_dt[Dest=='BNA', c('DepTime', 'ArrTime'), with= FALSE]

## Aggregation
aggregate(hflights$Diverted, by=list(hflights$DayOfWeek), FUN=mean)
aggregate(Diverted ~ DayOfWeek, data=hflights, FUN=mean)
with(hflights, aggregate(Diverted, by=list(DayOfWeek), FUN=mean))

## Quicker aggregation with Base R commands
# tapply return array
# d stands for data.frame
# s stands for array
# l stands for list
# m is a special input type, which means that we provide multiple arguments in a tabular format for the function
# r input type expects an integer, which specifies the number of times replicated
#_ is a special output type that does not return anything for the function

tapply(hflights$Diverted, hflights$DayOfWeek, mean)

## Convenient helper functions
library(plyr)
ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))
ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))

library(dplyr)
hflights_DayOfWeek <- group_by(hflights, DayOfWeek)
dplyr::summarise(hflights_DayOfWeek, mean(Diverted))

## Aggregate with data.table
setkey(hflights_dt, 'DayOfWeek')
hflights_dt[, mean(Diverted), by = DayOfWeek]
#hflights_dt[, list('mean(Diverted') = mean(Diverted)), by=DayOfWeek]

# Summary functions
ddply(hflights, .(DayOfWeek), summarise, n=length(Diverted))
ddply(hflights, .(DayOfWeek), nrow)
table(hflights$DayOfWeek)
count(hflights, 'DayOfWeek')
attr(hflights_DayOfWeek, 'group_sizes')
hflights_dt[, .N, by = list(DayOfWeek)]

No comments:

Post a Comment

Blog Archive