https://haoeric.gitbooks.io/r-advanced/content/qu_zi_ji.html
http://adv-r.had.co.nz/Subsetting.html
取子集
测试
概要
索引类型
原子向量
# Atomic vectors: the kinds of index that `[` accepts.
# Note: the digit after the decimal point gives each element's original
# position in the vector.
x <- c(2.1, 4.2, 3.3, 5.4)

# Positive integers return the elements at the given positions.
x[c(3, 1)]
x[order(x)]
# Duplicated indices yield duplicated values.
x[c(1, 1)]
# Real numbers are silently truncated to integers.
x[c(2.1, 2.9)]

# Negative integers omit the elements at the given positions.
x[-c(3, 1)]
# Positive and negative integers cannot be mixed in a single subset. The call
# signals an error, so it is wrapped in try() to let the script keep running.
try(x[c(-1, 2)])

# Logical vectors select the elements whose index value is TRUE.
x[c(TRUE, TRUE, FALSE, FALSE)]
x[x > 3]
# A logical index shorter than the vector is recycled ...
x[c(TRUE, FALSE)]
# ... so the call above is equivalent to
x[c(TRUE, FALSE, TRUE, FALSE)]
# A missing value in the index yields a missing value in the output.
x[c(TRUE, TRUE, NA, FALSE)]

# Nothing returns the original vector.
x[]
# Zero returns a zero-length vector.
x[0]

# Character vectors return the elements with matching names.
(y <- setNames(x, letters[1:4]))
y[c("d", "c", "a")]
# Like integer indices, names can be repeated.
y[c("a", "a", "a")]
# When subsetting with `[`, names must match exactly (no partial matching).
z <- c(abc = 1, def = 2)
z[c("a", "d")]
列表
矩阵和数组
# Matrices and arrays: subset with one index per dimension, with a single
# vector, or with an index matrix.
a <- matrix(1:9, nrow = 3)
colnames(a) <- c("A", "B", "C")
# One index per dimension; a blank index keeps that whole dimension.
a[1:2, ]
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
a[c(TRUE, FALSE, TRUE), c("B", "A")]
a[0, -2]

# A matrix can also be indexed with a single vector, as if it were a plain
# vector laid out column by column.
vals <- outer(1:5, 1:5, FUN = "paste", sep = ",")
vals[c(4, 15)]

# Integer matrix index: each row holds the (row, column) of one element.
select <- matrix(ncol = 2, byrow = TRUE, c(
  1, 1,
  3, 1,
  2, 4
))
vals[select]
数据框
# Data frames: a single index behaves like list subsetting, two indices behave
# like matrix subsetting.
df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df[df$x == 2, ]
df[c(1, 3), ]

# There are two ways to select columns from a data frame.
# Like a list:
df[c("x", "z")]
# Like a matrix:
df[, c("x", "z")]

# When a single column is selected, matrix-style subsetting simplifies the
# result to a vector, while list-style subsetting keeps a data frame.
str(df["x"])
str(df[, "x"])
S3对象
S4对象
练习
# Exercise material: the next four calls sit under the exercise heading and
# are presumably deliberately faulty data frame subsets for the reader to
# repair -- NOTE(review): confirm before "fixing" any of them.
mtcars[mtcars$cyl = 4, ]
mtcars[-1:4, ]
mtcars[mtcars$cyl <= 5]
mtcars[mtcars$cyl == 4 | 6, ]
# Exercise material: builds a 5 x 5 multiplication table with outer() and
# subsets it with the logical matrix returned by upper.tri().
x <- outer(1:5, 1:5, FUN = "*")
x[upper.tri(x)]
取子集操作符
# `[[` extracts a single element from a list, by position or by name.
a <- list(a = 1, b = 2)
a[[1]]
a[["a"]]

# If a vector is supplied to `[[`, it indexes recursively.
b <- list(a = list(b = list(c = list(d = 1))))
b[[c("a", "b", "c", "d")]]
# Same as
b[["a"]][["b"]][["c"]][["d"]]
简化与保留
|  | 简化 | 保留 |
|---|---|---|
| 向量 | `x[[1]]` | `x[1]` |
| 列表 | `x[[1]]` | `x[1]` |
| 因子 | `x[1:4, drop = T]` | `x[1:4]` |
| 数组 | `x[1, ]` 或 `x[, 1]` | `x[1, , drop = F]` 或 `x[, 1, drop = F]` |
| 数据框 | `x[, 1]` 或 `x[[1]]` | `x[, 1, drop = F]` 或 `x[1]` |
# Simplifying vs. preserving subsetting, one data type at a time.

# Atomic vector: `[[` drops the name, `[` keeps it.
x <- c(a = 1, b = 2)
x[1]
x[[1]]

# List: `[` returns a list, `[[` returns the element inside it.
y <- list(a = 1, b = 2)
str(y[1])
str(y[[1]])

# Factor: drop = TRUE discards unused levels.
z <- factor(c("a", "b"))
z[1]
z[1, drop = TRUE]

# Matrix or array: dimensions of length 1 are dropped unless drop = FALSE.
a <- matrix(1:4, nrow = 2)
a[1, , drop = FALSE]
a[1, ]

# Data frame: a single-column result is returned as a vector rather than a
# data frame by `[[` and by matrix-style `[` without drop = FALSE.
df <- data.frame(a = 1:2, b = 1:2)
str(df[1])
str(df[[1]])
str(df[, "a", drop = FALSE])
str(df[, "a"])
$
# `$` takes its right-hand side literally, so it cannot be used with a column
# name stored in a variable.
var <- "cyl"
# mtcars$var is equivalent to mtcars[["var"]]; there is no such column, so
# this returns NULL.
mtcars$var
# Use [[ instead.
mtcars[[var]]

# `$` does partial matching on names; `[[` does not.
x <- list(abc = 1)
x$a
x[["a"]]
缺失索引与出界索引
# `[` with an out-of-bounds, missing, or NULL index on an atomic vector.
x <- 1:4
# Out of bounds: returns NA.
str(x[5])
# Missing index: returns NA.
str(x[NA_real_])
# NULL index: returns a zero-length vector.
str(x[NULL])
| 操作符 | 索引 | 原子向量 | 列表 |
|---|---|---|---|
| `[` | OOB | `NA` | `list(NULL)` |
| `[` | `NA_real_` | `NA` | `list(NULL)` |
| `[` | `NULL` | `x[0]` | `list(NULL)` |
| `[[` | OOB | Error | Error |
| `[[` | `NA_real_` | Error | `NULL` |
| `[[` | `NULL` | Error | Error |
# `[` and `[[` applied to a zero-length atomic vector and a zero-length list,
# with an out-of-bounds, a missing, and a NULL index.
# Several of the `[[` calls signal an error, so each one is wrapped in try()
# to report the error without stopping the script.
numeric()[1]
numeric()[NA_real_]
numeric()[NULL]
try(numeric()[[1]])
try(numeric()[[NA_real_]])
try(numeric()[[NULL]])

list()[1]
list()[NA_real_]
list()[NULL]
try(list()[[1]])
try(list()[[NA_real_]])
try(list()[[NULL]])
练习
比如一个线性模型 `mod <- lm(mpg ~ wt, data = mtcars)`,如何提取该模型的残差自由度?又如何提取 `summary(mod)` 中的 R 平方值?
取子集与赋值
2
# Subsetting combined with assignment modifies selected elements in place.
x <- 1:5
x[c(1, 2)] <- 2:3
x
# The length of the LHS must match the length of the RHS.
x[-1] <- 4:1
x
# Duplicated indices are not removed; the later assignment overwrites the
# earlier one.
x[c(1, 1)] <- 2:3
x
# Integer indices cannot be combined with NA. The call signals an error, so it
# is wrapped in try() to let the script keep running.
try(x[c(1, NA)] <- c(1, 2))
# Logical indices can be combined with NA (the NA is treated as FALSE).
x[c(TRUE, FALSE, NA)] <- 1
x
# This is mostly useful for conditionally modifying a vector.
df <- data.frame(a = c(1, 10, NA))
df$a[df$a < 5] <- 0
df$a

# Assigning into an empty subset keeps the object's structure: mtcars stays a
# data frame here ...
mtcars[] <- lapply(mtcars, as.integer)
# ... whereas plain assignment replaces it with the list lapply() returns.
mtcars <- lapply(mtcars, as.integer)
# Remove the modified copy so that later examples see the original
# datasets::mtcars data frame again.
rm(mtcars)

# With lists, `[[<-` plus NULL removes a component.
x <- list(a = 1, b = 2)
x[["b"]] <- NULL
str(x)
# To store a literal NULL in a list, use `[<-` with list(NULL).
y <- list(a = 1)
y["b"] <- list(NULL)
str(y)
实例运用
查寻表 (字符串取子集)
# Lookup table: a named character vector indexed by the codes to translate.
x <- c("m", "f", "u", "f", "f", "m", "m")
lookup <- c(m = "Male", f = "Female", u = NA)
lookup[x]
# Drop the names when only the translated values are wanted.
unname(lookup[x])
# Or map the codes onto fewer output values.
c(m = "Known", f = "Known", u = "Unknown")[x]
手动匹配和融合 (整型取子集)
# Matching and merging by hand: attach the rows of `info` to each grade.
grades <- c(1, 2, 2, 3, 1)
info <- data.frame(
  grade = 3:1,
  desc = c("Excellent", "Good", "Poor"),
  fail = c(FALSE, FALSE, TRUE)
)

# Using match(): look up the row position of every grade, then subset by it.
id <- match(grades, info$grade)
info[id, ]

# Using rownames: name the rows by grade, then subset by character index.
rownames(info) <- info$grade
info[as.character(grades), ]
随机取样/自助法 (整型取子集)
# Random samples and bootstrap: integer indices from sample() pick the rows.
df <- data.frame(x = rep(1:3, each = 2), y = 6:1, z = letters[1:6])

# Set the seed for reproducibility.
set.seed(10)

# Randomly reorder the rows.
df[sample(nrow(df)), ]
# Select 3 random rows.
df[sample(nrow(df), 3), ]
# Select 6 bootstrap replicates. The argument is spelled out in full because
# `rep = T` only worked through partial matching of `replace`.
df[sample(nrow(df), 6, replace = TRUE), ]
排序 (整型取子集)
# Ordering: order() returns the integer positions that sort a vector.
x <- c("b", "c", "a")
order(x)
x[order(x)]

# Randomly reorder the rows of df and reverse its first three columns.
# `df` is the data frame created by the preceding example.
df2 <- df[sample(nrow(df)), 3:1]
df2
# Sort the rows by column x.
df2[order(df2$x), ]
# Sort the columns by name.
df2[, order(names(df2))]
展开汇总计数 (整型取子集)
# Expanding aggregated counts: column n says how many times each row occurs,
# so repeating the row indices n times rebuilds the uncollapsed data.
df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
# seq_len() is used instead of 1:nrow(df) so that a zero-row data frame
# yields an empty index rather than c(1, 0).
rep(seq_len(nrow(df)), df$n)
df[rep(seq_len(nrow(df)), df$n), ]
去除数据框中的某列 (字符串取子集)
# Remove column z by assigning NULL to it.
df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df$z <- NULL
# Recreate df, then subset by name to return only the wanted columns.
df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df[c("x", "y")]
# setdiff() computes the names to keep from the one name to exclude.
df[setdiff(names(df), "z")]
有条件的行筛选 (逻辑型取子集)
# Select rows with a logical vector built from a column comparison; the second
# call combines two comparisons element-wise with &.
mtcars[mtcars$gear == 5, ]
mtcars[mtcars$gear == 5 & mtcars$cyl == 4, ]
!(X & Y)
等同于 !X | !Y
!(X | Y)
等同于 !X & !Y
# subset() takes the data frame first and a condition in which the columns
# gear and cyl are named directly, without the mtcars$ prefix.
subset(mtcars, gear == 5)
subset(mtcars, gear == 5 & cyl == 4)
逻辑运算 vs. 集合运算 (逻辑型 & 整型取子集)
# A random logical vector of length 10 with exactly three TRUE values.
x <- sample(10) < 4
# which() converts the logical vector into the integer positions of its TRUEs.
which(x)
# Inverse of which(): turn a vector of positions back into a logical vector.
#
# x: positions to mark as TRUE.
# n: length of the all-FALSE vector that the positions are written into.
# Returns the logical vector with TRUE at the positions given in x.
unwhich <- function(x, n) {
  replace(rep_len(FALSE, n), x, TRUE)
}
# Round trip: which() gives the TRUE positions of x, and unwhich() writes them
# back into a logical vector of length 10.
unwhich(which(x), 10)
# Boolean algebra on logical vectors vs. set operations on integer positions.
# x1/y1 are logical masks; x2/y2 are the positions of their TRUE values.
(x1 <- 1:10 %% 2 == 0)
(x2 <- which(x1))
(y1 <- 1:10 %% 5 == 0)
(y2 <- which(y1))

# X & Y <-> intersect(x, y)
x1 & y1
intersect(x2, y2)

# X | Y <-> union(x, y)
x1 | y1
union(x2, y2)

# X & !Y <-> setdiff(x, y)
x1 & !y1
setdiff(x2, y2)

# xor(X, Y) <-> setdiff(union(x, y), intersect(x, y))
xor(x1, y1)
setdiff(union(x2, y2), intersect(x2, y2))
练习
参考答案