My DS Coding Bolg: Master R 1

# Readin txt
rate1 <- read.table(text = "
     rank   rank.1 numberc        lift
     1   0.000028 0.000008       1 2284.907818
     2   0.000036 0.000012       2 4483.611567
     3   0.000278 0.000081       3 5073.756150
     4   0.000636 0.000155       4 5354.705083
     5   0.000706 0.000168       5 6457.000354
     6   0.000942 0.000203       6 7075.458949
     7   0.054982 0.003164       7 988.025500
     8   0.055152 0.003198       8 1117.996648
     9   0.075488 0.006454       9 641.864357
     10 6.618558 0.239828      10   18.801391
     11 10.047460 0.366477      11   13.258208
     12 11.604464 0.443113      12   11.865461
     13 13.818272 0.560422      13   10.021142
     14 14.624862 0.658144      14    9.107123
     15 18.287068 0.824674      15    7.642814
     16 20.369262 1.046954      16    6.262066
     17 21.784700 1.195478      17    5.757480
     18 23.421508 1.453525      18    4.884909
     19 23.512264 1.472164      19    5.133203
     20 23.751196 1.509041      20    5.298250",
header = TRUE,sep = "",row.names = 1)
write.csv(rate1, "tmp.csv")

rate9 <- read.table(text = "
     rank   rank.1 numberc      lift
     1   0.002650 0.001006       1 393.25276
     2   0.014656 0.005053       2 179.98946
     3   0.023782 0.007999       3 172.92444
     4   0.045300 0.014959       4 124.42221
     5   0.060070 0.019574       5 119.18530
     6   0.121250 0.037631       6 74.38297
     7   0.121330 0.037635       7 86.93661
     8   0.126386 0.039033       8 95.91715
     9   0.250570 0.059789       9 70.30673
     10 0.482642 0.119722      10 38.63316
     11 0.710086 0.176313      11 28.61911
     12 0.790464 0.198090      12 27.76305
     13 1.886340 0.365539      13 15.89373
     14 1.894954 0.367702      14 17.08624
     15 2.930882 0.558398      15 11.76272
     16 3.221640 0.619175      16 11.27770
     17 3.428724 0.654649      17 11.33837
     18 4.072348 0.752404      18 10.36725
     19 9.085542 1.513426      19   4.96601
     20 13.597818 2.305283      20   3.12303",
header = TRUE,sep = "",row.names = 1)
write.csv(rate9, "tmp.csv")

## Create a formula for a model with a large number of variables:
xname <- paste("x", 1:25, sep=" ")
fmla <- as.formula(paste("y ~ ", paste(xname, collapse= "+")))

## 1 Loading text files of a reasonable size
library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)
str(hflights)
system.time(read.csv('hflights.csv'))

colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))
system.time(read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ') )

install.packages('microbenchmark')
library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ')
res <- microbenchmark(f(), g())
res
boxplot(res, xlab = '', main = expression(paste('Benchmarking'), italic('read.table')))

## Data files larger than the physical memory
install.packages('sqldf')
library(sqldf)
system.time(read.csv.sql('hflights.csv'))

install.packages('ff')
library(ff)
system.time(read.csv.ffdf(file='hflights.csv'))

install.packages('bigmemory')
library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))

## Benchmarking text file parsers
install.packages('data.table')
library(data.table)
system.time(dt <- fread('hflights.csv'))
df <- as.data.frame(dt)
is.data.frame(dt)

## Loading a subset of text files
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = '\"BNA\"'")
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')

## Filtering flat files before loading to R
system.time(system('cat hflights.csv | grep BNA ', intern = TRUE))
sqldf("attach 'hflights_db' as new" )
read.csv.sql('hflights.csv', sql='create table hflights1 as select * from file', dbname = 'hflights_db')
system.time(df <- sqldf(sql = "select * from hflights1 where Dest = '\"BNA\"' ", dbname = "hflights_db"))

My DS Coding Bolg

Monday, December 7, 2015

Master R 1 - Readin Data

No comments:

Post a Comment

Blog Archive