我有一个大数据集,如下所示:
> str(citibank)
spec_tbl_df [33,912,996 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ tripduration : num [1:33912996] 680 1282 648 631 621 ...
$ starttime : POSIXct[1:33912996], format: "2017-01-01 00:00:21" "2017-01-01 00:00:45" "2017-01-01 00:00:57" "2017-01-01 00:01:10" ...
$ stoptime : POSIXct[1:33912996], format: "2017-01-01 00:11:41" "2017-01-01 00:22:08" "2017-01-01 00:11:46" "2017-01-01 00:11:42" ...
$ start.station.id : num [1:33912996] 3226 3263 3143 3143 3143 ...
$ start.station.name : chr [1:33912996] "W 82 St & Central Park West" "Cooper Square & E 7 St" "5 Ave & E 78 St" "5 Ave & E 78 St" ...
$ start.station.latitude : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ start.station.longitude: num [1:33912996] -74 -74 -74 -74 -74 ...
$ end.station.id : num [1:33912996] 3165 498 3152 3152 3152 ...
$ end.station.name : chr [1:33912996] "Central Park West & W 72 St" "Broadway & W 32 St" "3 Ave & E 71 St" "3 Ave & E 71 St" ...
$ end.station.latitude : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ end.station.longitude : num [1:33912996] -74 -74 -74 -74 -74 ...
$ bikeid : num [1:33912996] 25542 21136 18147 21211 26819 ...
$ usertype : chr [1:33912996] "Subscriber" "Subscriber" "Customer" "Customer" ...
$ birth.year : int [1:33912996] 1965 1987 NA NA NA 2000 1973 1977 1989 1980 ...
$ gender : num [1:33912996] 2 2 0 0 0 1 1 2 1 1 ...
我使用以下dplyr
函数来计算通勤者在两个位置start.station.id
和end.station.id
之间的旅行次数:
grouped = citibank %>%
group_by(start.station.id, end.station.id) %>%
summarise(count = n())
并得到如下结果表:
> grouped
# A tibble: 407,790 × 3
# Groups: start.station.id [866]
start.station.id end.station.id count
<dbl> <dbl> <int>
1 72 72 1621
2 72 79 94
3 72 82 6
4 72 116 49
5 72 127 455
6 72 128 121
7 72 143 5
8 72 144 1
9 72 146 57
10 72 147 7
# … with 407,780 more rows
现在,我想将citibank
数据集中的列start.station.latitude
和end.station.latitude
添加到grouped
tibble中,以便它们对应于citibank
数据集中各自的start.station.id
end.station.id
值。
有人能帮我做这个吗?提前谢谢。
只需选择相关列,使用add_count
,并获取唯一的行:
citibank %>%
select(start.station.id, end.station.id, start.station.latitude, end.station.latitude) %>%
add_count(start.station.id, end.station.id) %>%
distinct()
另一个选项是将left_join
添加到管道中,以将其他列添加回.
library(tidyverse)
citibank %>%
group_by(start.station.id, end.station.id) %>%
summarise(count = n()) %>%
left_join(., citibank %>% select(start.station.id, end.station.id, start.station.latitude, end.station.latitude), by = c("start.station.id", "end.station.id"))
输出
start.station.id end.station.id count start.station.latitude end.station.latitude
<dbl> <dbl> <int> <dbl> <dbl>
1 3143 3152 3 40.8 40.8
2 3143 3152 3 40.8 40.8
3 3143 3152 3 40.8 40.8
4 3163 3163 1 40.8 40.8
5 3226 3165 1 40.8 40.8
6 3263 498 1 40.7 40.7
数据
citibank <- structure(list(tripduration = c(680, 1282, 648, 631, 621, 666
), starttime = structure(c(1483228821, 1483228845, 1483228857,
1483228870, 1483228885, 1483228911), tzone = "UTC", class = c("POSIXct",
"POSIXt")), stoptime = structure(c(1483229501, 1483230128, 1483229506,
1483229502, 1483229507, 1483229577), tzone = "UTC", class = c("POSIXct",
"POSIXt")), start.station.id = c(3226, 3263, 3143, 3143, 3143,
3163), start.station.name = c("W 82 St & Central Park West",
"Cooper Square & E 7 St", "5 Ave & E 78 St", "5 Ave & E 78 St",
"5 Ave & E 78 St", "Central Park West & W 68 St"), start.station.latitude = c(40.78275,
40.7292364991001, 40.7768286343997, 40.7768286343997, 40.7768286343997,
40.7734066), start.station.longitude = c(-73.97137, -73.9908680319786,
-73.9638876914978, -73.9638876914978, -73.9638876914978, -73.97782542
), end.station.id = c(3165, 498, 3152, 3152, 3152, 3163), end.station.name = c("Central Park West & W 72 St",
"Broadway & W 32 St", "3 Ave & E 71 St", "3 Ave & E 71 St", "3 Ave & E 71 St",
"Central Park West & W 68 St"), end.station.latitude = c(40.7757937668367,
40.74854862, 40.76873687, 40.76873687, 40.76873687, 40.7734066
), end.station.longitude = c(-73.9762057363987, -73.98808416,
-73.96119945, -73.96119945, -73.96119945, -73.97782542), bikeid = c(25542,
21136, 18147, 21211, 26819, 16050), usertype = c("Subscriber",
"Subscriber", "Customer", "Customer", "Customer", "Subscriber"
), birth.year = c(1965L, 1987L, NA, NA, NA, 2000L), gender = c(2,
2, 0, 0, 0, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
第三个解决方案应该有效-将所有四个值添加到count
:D
grouped <- citibank %>%
count(start.station.id,
end.station.id,
start.station.latitude,
end.station.latitude)