Create data frame
- The R way of doing spreadsheets
- Internally, a data.frame is a list of equal-length vectors or factors.
- Observations in rows; Variables in cols
empty <-data.frame()
> empty <-data.frame()
> c1 <- 1:10
> c2 <- letters[1:10]
> df <- data.frame(col1=c1, col2=c2)
> df
col1 col2
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 f
7 7 g
8 8 h
9 9 i
10 10 j
Import from and export to file
d2 <- read.csv('fileName.csv', header = TRUE)
library(gdata);
d2 <- read.xls('file.xls')
write.csv(df, file='fileName.csv')
print(xtable(df), type='html')
Basic information about the data frame
> is.data.frame(df)
[1] TRUE
> class(df)
[1] "data.frame"
> nrow(df)
[1] 10
> ncol(df)
[1] 2
> colnames(df);
[1] "col1" "col2"
> rownames(df);
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
Referencing cells [row, col] [[r, c]]
## [[ for single cell selection;
# [ for multi cell selection;
> vec <- df[[5,2]]; vec
[1] e
Levels: a b c d e f g h i j
> newDF <- df[1:5, 1:2]; newDF
col1 col2
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
> df[[2, 'col1']]
[1] 2
> df[3:5, c('col1', 'col2')]
col1 col2
3 3 c
4 4 d
5 5 e
Referencing rows [r, ]
# returns a data frame (and not a vector!)
> row.1 <- df[1,]; row.1
col1 col2
1 1 a
> row.n <- df[nrow(df),]; row.n
col1 col2
10 10 j
> vrow <- as.numeric(as.vector(df[1,])); vrow
[1] 1 1
> vrow <- as.character(as.vector(df[1,])); vrow
[1] "1" "1"
Referencing columns [,c] [d] [[d]] $col
> names(df) <- c('num','cats')
> col.vec <- df$cats; col.vec
[1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns vector
> col.vec <- df[, 'cats'] ; col.vec
[1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # the column index can be an int or a string
> col.vec <- df[ , 2]; col.vec
[1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns a vector
> col.vec <- df[['cats']]; col.vec
[1] a b c d e f g h i j
Levels: a b c d e f g h i j
> # returns 1 col df
> frog.df <- df['cats']
> # returns 1 col df
> first.df <- df[1]; first.df
num
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
> first.col <- df[,1]; first.col
[1] 1 2 3 4 5 6 7 8 9 10
> # returns a vector
> last.col <- df[,ncol(df)]; last.col
[1] a b c d e f g h i j
Levels: a b c d e f g h i j
Adding rows
# The right way ... (both args are DFs)
df <- rbind(df, data.frame(num=1, cats='A')); df
Adding columns
> df$newCol <- rep(NA, nrow(df)); df
col1 col2 newCol
1 1 a NA
2 2 b NA
3 3 c NA
4 4 d NA
5 5 e NA
6 6 f NA
7 7 g NA
8 8 h NA
9 9 i NA
10 10 j NA
> #Copy a column
> df[, 'copyofCol'] <- 1:nrow(df); df
col1 col2 newCol copyofCol
1 1 a NA 1
2 2 b NA 2
3 3 c NA 3
4 4 d NA 4
5 5 e NA 5
6 6 f NA 6
7 7 g NA 7
8 8 h NA 8
9 9 i NA 9
10 10 j NA 10
> names(df) <- c('x','cats','newCol','y')
> df$y.percent.pf.x <- df$y/sum(df$x)*100; df
x cats newCol y y.percent.pf.x
1 1 a NA 1 1.818182
2 2 b NA 2 3.636364
3 3 c NA 3 5.454545
4 4 d NA 4 7.272727
5 5 e NA 5 9.090909
6 6 f NA 6 10.909091
7 7 g NA 7 12.727273
8 8 h NA 8 14.545455
9 9 i NA 9 16.363636
10 10 j NA 10 18.181818
> df <-cbind(col=rep('a',nrow(df)), df); df
col x cats newCol y y.percent.pf.x
1 a 1 a NA 1 1.818182
2 a 2 b NA 2 3.636364
3 a 3 c NA 3 5.454545
4 a 4 d NA 4 7.272727
5 a 5 e NA 5 9.090909
6 a 6 f NA 6 10.909091
7 a 7 g NA 7 12.727273
8 a 8 h NA 8 14.545455
9 a 9 i NA 9 16.363636
10 a 10 j NA 10 18.181818
> df <- cbind(df,col=rep('b',nrow(df))); df
col x cats newCol y y.percent.pf.x col
1 a 1 a NA 1 1.818182 b
2 a 2 b NA 2 3.636364 b
3 a 3 c NA 3 5.454545 b
4 a 4 d NA 4 7.272727 b
5 a 5 e NA 5 9.090909 b
6 a 6 f NA 6 10.909091 b
7 a 7 g NA 7 12.727273 b
8 a 8 h NA 8 14.545455 b
9 a 9 i NA 9 16.363636 b
10 a 10 j NA 10 18.181818 b
> df$c3 <- with(df, col3 <- x*y); df
col x cats newCol y y.percent.pf.x col c3
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
> transform(df, col4 <- x+y)
col x cats newCol y y.percent.pf.x col c3
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
Set column names # same for rownames()
> colnames(df) <- c('date', 'alpha', 'beta'); df
date alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
> colnames(df)[1] <- 'new.name'; df
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
> colnames(df)[colnames(df) %in% c('a', 'b')] <- c('x', 'y'); df
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
Selecting Multiple Rows
> firstTenRows <- df[1:10,]; firstTenRows
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
> everthingButRowTwo <- df[-2,]; everthingButRowTwo
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
10 a 10 j NA 10 18.181818 b 100
> sub <- df[(df$x >5 & y<5), ]; sub
[1] new.name alpha beta <NA> <NA> <NA> <NA> <NA>
<0 rows> (or 0-length row.names)
> sub <- subset(df, x>5 & y<5); sub
[1] new.name alpha beta <NA> NA.1 NA.2 NA.3 NA.4
<0 rows> (or 0-length row.names)
> notLastRow <- head(df, -1); notLastRow
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
> df[-nrow(df),]
new.name alpha beta NA NA NA NA NA
1 a 1 a NA 1 1.818182 b 1
2 a 2 b NA 2 3.636364 b 4
3 a 3 c NA 3 5.454545 b 9
4 a 4 d NA 4 7.272727 b 16
5 a 5 e NA 5 9.090909 b 25
6 a 6 f NA 6 10.909091 b 36
7 a 7 g NA 7 12.727273 b 49
8 a 8 h NA 8 14.545455 b 64
9 a 9 i NA 9 16.363636 b 81
Selecting multiple columns
> df <- df[,c(1,2,3,4,5)]; df
col x cats newCol y
1 a 1 a NA 1
2 a 2 b NA 2
3 a 3 c NA 3
4 a 4 d NA 4
5 a 5 e NA 5
6 a 6 f NA 6
7 a 7 g NA 7
8 a 8 h NA 8
9 a 9 i NA 9
10 a 10 j NA 10
> names(df) <- c('col1', 'col2', 'col3')
> df <- df[,c('col1','col2')];df
col1 col2
1 a 1
2 a 2
3 a 3
4 a 4
5 a 5
6 a 6
7 a 7
8 a 8
9 a 9
10 a 10
df <- df[,-1]; df
# drop col1 and col3
df <- df[,-c(1,3)]
could not find function "colnmaes"
> df <- df[,!(colnames(df) %in% c('notThis','norThis'))]
> df
col1 col2
1 a 1
2 a 2
3 a 3
4 a 4
5 a 5
6 a 6
7 a 7
8 a 8
9 a 9
10 a 10
Replace column elements by row selection
> df
col1 col2
1 a 1
2 a 2
3 a 3
4 a 4
5 a 5
6 a 6
7 a 7
8 a 8
9 a 9
10 a 10
> df[df$col31 == 'a', 'col2'] <- 1
> df
col1 col2
1 a 1
2 a 2
3 a 3
4 a 4
5 a 5
6 a 6
7 a 7
8 a 8
9 a 9
10 a 10
> df[df$col1 == 'a', 'col2'] <- 1
> df
col1 col2
1 a 1
2 a 1
3 a 1
4 a 1
5 a 1
6 a 1
7 a 1
8 a 1
9 a 1
10 a 1
Missing data (NA)
# detect anywhere in df
> any(is.na(df))
[1] TRUE
> # anywhere in col
> any(is.na(df$newCol))
[1] FALSE
> # dropping rows with a missing value in a column
> df2 <- df[!is.na(df$newCol),]; df2
col1 col2 newCol col
1 a NA 0 0
2 a NA 0 0
3 a NA 0 0
4 a NA 0 0
5 a NA 0 0
6 a NA 0 0
7 a NA 0 0
8 a NA 0 0
9 a NA 0 0
10 a NA 0 0
> # replacing NAs with something else
> df[is.na(df)] <- 0; df
col1 col2 newCol col
1 a 0 0 0
2 a 0 0 0
3 a 0 0 0
4 a 0 0 0
5 a 0 0 0
6 a 0 0 0
7 a 0 0 0
8 a 0 0 0
9 a 0 0 0
10 a 0 0 0
> df$col[is.na(df$col2)] <- 0; df
col1 col2 newCol col
1 a 0 0 0
2 a 0 0 0
3 a 0 0 0
4 a 0 0 0
5 a 0 0 0
6 a 0 0 0
7 a 0 0 0
8 a 0 0 0
9 a 0 0 0
10 a 0 0 0
> df$col2 <- ifelse(is.na(df$col2), 0, df$col); df
col1 col2 newCol col
1 a 0 0 0
2 a 0 0 0
3 a 0 0 0
4 a 0 0 0
5 a 0 0 0
6 a 0 0 0
7 a 0 0 0
8 a 0 0 0
9 a 0 0 0
10 a 0 0 0
df <- orig[!is.na(orig$series), c('Date', 'series')]
Traps
1 for loops on possibly empty df's, use: for (i in seq_len(nrow(df)))
2 columns coerced to factors, avoid with the argument stringsAsFactors = FALSE (the default since R 4.0)
3 confusing row numbers and rows with numbered names (hint: avoid row names)
4 although rbind() accepts vectors and lists, this can fail with factor cols
No comments:
Post a Comment