rate1 <- read.table(text = "
rank rank.1 numberc lift
1 0.000028 0.000008 1 2284.907818
2 0.000036 0.000012 2 4483.611567
3 0.000278 0.000081 3 5073.756150
4 0.000636 0.000155 4 5354.705083
5 0.000706 0.000168 5 6457.000354
6 0.000942 0.000203 6 7075.458949
7 0.054982 0.003164 7 988.025500
8 0.055152 0.003198 8 1117.996648
9 0.075488 0.006454 9 641.864357
10 6.618558 0.239828 10 18.801391
11 10.047460 0.366477 11 13.258208
12 11.604464 0.443113 12 11.865461
13 13.818272 0.560422 13 10.021142
14 14.624862 0.658144 14 9.107123
15 18.287068 0.824674 15 7.642814
16 20.369262 1.046954 16 6.262066
17 21.784700 1.195478 17 5.757480
18 23.421508 1.453525 18 4.884909
19 23.512264 1.472164 19 5.133203
20 23.751196 1.509041 20 5.298250",
header = TRUE,sep = "",row.names = 1)
write.csv(rate1, "tmp.csv")
rate9 <- read.table(text = "
rank rank.1 numberc lift
1 0.002650 0.001006 1 393.25276
2 0.014656 0.005053 2 179.98946
3 0.023782 0.007999 3 172.92444
4 0.045300 0.014959 4 124.42221
5 0.060070 0.019574 5 119.18530
6 0.121250 0.037631 6 74.38297
7 0.121330 0.037635 7 86.93661
8 0.126386 0.039033 8 95.91715
9 0.250570 0.059789 9 70.30673
10 0.482642 0.119722 10 38.63316
11 0.710086 0.176313 11 28.61911
12 0.790464 0.198090 12 27.76305
13 1.886340 0.365539 13 15.89373
14 1.894954 0.367702 14 17.08624
15 2.930882 0.558398 15 11.76272
16 3.221640 0.619175 16 11.27770
17 3.428724 0.654649 17 11.33837
18 4.072348 0.752404 18 10.36725
19 9.085542 1.513426 19 4.96601
20 13.597818 2.305283 20 3.12303",
header = TRUE,sep = "",row.names = 1)
write.csv(rate9, "tmp.csv")
## Create a formula for a model with a large number of variables:
xname <- paste("x", 1:25, sep=" ")
fmla <- as.formula(paste("y ~ ", paste(xname, collapse= "+")))
## 1 Loading text files of a reasonable size
library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)
str(hflights)
system.time(read.csv('hflights.csv'))
colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))
system.time(read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ') )
install.packages('microbenchmark')
library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ')
res <- microbenchmark(f(), g())
res
boxplot(res, xlab = '', main = expression(paste('Benchmarking'), italic('read.table')))
## Data files larger than the physical memory
install.packages('sqldf')
library(sqldf)
system.time(read.csv.sql('hflights.csv'))
install.packages('ff')
library(ff)
system.time(read.csv.ffdf(file='hflights.csv'))
install.packages('bigmemory')
library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))
## Benchmarking text file parsers
install.packages('data.table')
library(data.table)
system.time(dt <- fread('hflights.csv'))
df <- as.data.frame(dt)
is.data.frame(dt)
## Loading a subset of text files
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = '\"BNA\"'")
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')
## Filtering flat files before loading to R
system.time(system('cat hflights.csv | grep BNA ', intern = TRUE))
sqldf("attach 'hflights_db' as new" )
read.csv.sql('hflights.csv', sql='create table hflights1 as select * from file', dbname = 'hflights_db')
system.time(df <- sqldf(sql = "select * from hflights1 where Dest = '\"BNA\"' ", dbname = "hflights_db"))
xname <- paste("x", 1:25, sep=" ")
fmla <- as.formula(paste("y ~ ", paste(xname, collapse= "+")))
## 1 Loading text files of a reasonable size
library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)
str(hflights)
system.time(read.csv('hflights.csv'))
colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))
system.time(read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ') )
install.packages('microbenchmark')
library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ')
res <- microbenchmark(f(), g())
res
boxplot(res, xlab = '', main = expression(paste('Benchmarking'), italic('read.table')))
## Data files larger than the physical memory
install.packages('sqldf')
library(sqldf)
system.time(read.csv.sql('hflights.csv'))
install.packages('ff')
library(ff)
system.time(read.csv.ffdf(file='hflights.csv'))
install.packages('bigmemory')
library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))
## Benchmarking text file parsers
install.packages('data.table')
library(data.table)
system.time(dt <- fread('hflights.csv'))
df <- as.data.frame(dt)
is.data.frame(dt)
## Loading a subset of text files
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = '\"BNA\"'")
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')
## Filtering flat files before loading to R
system.time(system('cat hflights.csv | grep BNA ', intern = TRUE))
sqldf("attach 'hflights_db' as new" )
read.csv.sql('hflights.csv', sql='create table hflights1 as select * from file', dbname = 'hflights_db')
system.time(df <- sqldf(sql = "select * from hflights1 where Dest = '\"BNA\"' ", dbname = "hflights_db"))
No comments:
Post a Comment