# Drop needless data
library(sqldf)
sqldf("select * from mtcars where am=1 and vs=1")
sqldf("select * from mtcars limit 10")
subset(mtcars, am==1 & vs ==1)
identical(
sqldf("select * from mtcars where am=1 and vs=1", row.names = TRUE),
subset(mtcars, am==1 & vs==1)
)
subset(mtcars, am ==1 & vs ==1, select = hp:wt)
subset(mtcars, am ==1 & vs ==1, select = -c(hp,wt))
# Drop needless data in an efficient way
library(hflights)
system.time(sqldf("select * from hflights where Dest=='BNA'", row.names = TRUE))
system.time(subset(hflights, Dest == 'BNA'))
install.packages("dplyr")
library(dplyr)
system.time(filter(hflights, Dest == 'BNA'))
# Drop needless data in another efficient way
library(data.table)
hflights_dt <- data.table(hflights)
hflights_dt[, rownames := rownames(hflights)]
system.time(hflights_dt[Dest == 'BNA'])
/*
DT[i, j, ..., drop = TRUE]
DT[where, select | update, group by][having][order by][]...[]
*/
str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
tail(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
hflights_dt[Dest=='BNA', .(DepTime, ArrTime)]
hflights_dt[Dest=='BNA', c('DepTime', 'ArrTime'), with= FALSE]
## Aggregation
aggregate(hflights$Diverted, by=list(hflights$DayOfWeek), FUN=mean)
aggregate(Diverted ~ DayOfWeek, data=hflights, FUN=mean)
with(hflights, aggregate(Diverted, by=list(DayOfWeek), FUN=mean))
## Quicker aggregation with Base R commands
# tapply return array
# d stands for data.frame
# s stands for array
# l stands for list
# m is a special input type, which means that we provide multiple arguments in a tabular format for the function
# r input type expects an integer, which specifies the number of times replicated
#_ is a special output type that does not return anything for the function
tapply(hflights$Diverted, hflights$DayOfWeek, mean)
## Convenient helper functions
library(plyr)
ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))
ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))
library(dplyr)
hflights_DayOfWeek <- group_by(hflights, DayOfWeek)
dplyr::summarise(hflights_DayOfWeek, mean(Diverted))
## Aggregate with data.table
setkey(hflights_dt, 'DayOfWeek')
hflights_dt[, mean(Diverted), by = DayOfWeek]
#hflights_dt[, list('mean(Diverted') = mean(Diverted)), by=DayOfWeek]
# Summary functions
ddply(hflights, .(DayOfWeek), summarise, n=length(Diverted))
ddply(hflights, .(DayOfWeek), nrow)
table(hflights$DayOfWeek)
count(hflights, 'DayOfWeek')
attr(hflights_DayOfWeek, 'group_sizes')
hflights_dt[, .N, by = list(DayOfWeek)]
Thursday, December 10, 2015
Subscribe to:
Post Comments (Atom)
Blog Archive
-
▼
2015
(43)
-
▼
December
(14)
- Master R 4 - Restruct Data
- Master R 3 - Filter and Summarize Data
- Master R 2 - Fetch Web Data
- Master R 1 - Readin Data
- Visual 10 - Bipartite Network Plot in R
- Visual 9 - Heat Map in R
- Visual 8 - Dot Plot and Bubble Chart
- Visual 7 - Network Graph
- Visual 6 - Density
- Visual 5 - Bar Chart and Stacked Bar Chart
- Visual 4 - Bloxplot and Donut Chart
- Visual 3 - Line Plot
- Visual 2 - Histogram
- Visual 1 - Corrlation
-
▼
December
(14)
No comments:
Post a Comment