R将大型数据帧中的列合并到摘要数据帧中



我有一个大数据集,如下所示:

> str(citibank)
spec_tbl_df [33,912,996 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ tripduration           : num [1:33912996] 680 1282 648 631 621 ...
$ starttime              : POSIXct[1:33912996], format: "2017-01-01 00:00:21" "2017-01-01 00:00:45" "2017-01-01 00:00:57" "2017-01-01 00:01:10" ...
$ stoptime               : POSIXct[1:33912996], format: "2017-01-01 00:11:41" "2017-01-01 00:22:08" "2017-01-01 00:11:46" "2017-01-01 00:11:42" ...
$ start.station.id       : num [1:33912996] 3226 3263 3143 3143 3143 ...
$ start.station.name     : chr [1:33912996] "W 82 St & Central Park West" "Cooper Square & E 7 St" "5 Ave & E 78 St" "5 Ave & E 78 St" ...
$ start.station.latitude : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ start.station.longitude: num [1:33912996] -74 -74 -74 -74 -74 ...
$ end.station.id         : num [1:33912996] 3165 498 3152 3152 3152 ...
$ end.station.name       : chr [1:33912996] "Central Park West & W 72 St" "Broadway & W 32 St" "3 Ave & E 71 St" "3 Ave & E 71 St" ...
$ end.station.latitude   : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ end.station.longitude  : num [1:33912996] -74 -74 -74 -74 -74 ...
$ bikeid                 : num [1:33912996] 25542 21136 18147 21211 26819 ...
$ usertype               : chr [1:33912996] "Subscriber" "Subscriber" "Customer" "Customer" ...
$ birth.year             : int [1:33912996] 1965 1987 NA NA NA 2000 1973 1977 1989 1980 ...
$ gender                 : num [1:33912996] 2 2 0 0 0 1 1 2 1 1 ...

我使用以下dplyr函数来计算通勤者在两个位置start.station.idend.station.id之间的旅行次数:

grouped = citibank %>% 
group_by(start.station.id, end.station.id) %>% 
summarise(count = n())

并得到如下结果表:

> grouped
# A tibble: 407,790 × 3
# Groups:   start.station.id [866]
start.station.id end.station.id count
<dbl>          <dbl> <int>
1               72             72  1621
2               72             79    94
3               72             82     6
4               72            116    49
5               72            127   455
6               72            128   121
7               72            143     5
8               72            144     1
9               72            146    57
10               72            147     7
# … with 407,780 more rows

现在,我想将citibank数据集中的列start.station.latitudeend.station.latitude添加到groupedtibble中,以便它们对应于citibank数据集中各自的start.station.idend.station.id值。

有人能帮我做这个吗?提前谢谢。

只需选择相关列,使用add_count,并获取唯一的行:

citibank %>% 
select(start.station.id, end.station.id, start.station.latitude, end.station.latitude) %>% 
add_count(start.station.id, end.station.id) %>% 
distinct()

另一个选项是将left_join添加到管道中,以将其他列添加回.

library(tidyverse)
citibank %>% 
group_by(start.station.id, end.station.id) %>% 
summarise(count = n()) %>% 
left_join(., citibank %>% select(start.station.id, end.station.id, start.station.latitude, end.station.latitude), by = c("start.station.id", "end.station.id"))

输出

start.station.id end.station.id count start.station.latitude end.station.latitude
<dbl>          <dbl> <int>                  <dbl>                <dbl>
1             3143           3152     3                   40.8                 40.8
2             3143           3152     3                   40.8                 40.8
3             3143           3152     3                   40.8                 40.8
4             3163           3163     1                   40.8                 40.8
5             3226           3165     1                   40.8                 40.8
6             3263            498     1                   40.7                 40.7

数据

citibank <- structure(list(tripduration = c(680, 1282, 648, 631, 621, 666
), starttime = structure(c(1483228821, 1483228845, 1483228857, 
1483228870, 1483228885, 1483228911), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), stoptime = structure(c(1483229501, 1483230128, 1483229506, 
1483229502, 1483229507, 1483229577), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), start.station.id = c(3226, 3263, 3143, 3143, 3143, 
3163), start.station.name = c("W 82 St & Central Park West", 
"Cooper Square & E 7 St", "5 Ave & E 78 St", "5 Ave & E 78 St", 
"5 Ave & E 78 St", "Central Park West & W 68 St"), start.station.latitude = c(40.78275, 
40.7292364991001, 40.7768286343997, 40.7768286343997, 40.7768286343997, 
40.7734066), start.station.longitude = c(-73.97137, -73.9908680319786, 
-73.9638876914978, -73.9638876914978, -73.9638876914978, -73.97782542
), end.station.id = c(3165, 498, 3152, 3152, 3152, 3163), end.station.name = c("Central Park West & W 72 St", 
"Broadway & W 32 St", "3 Ave & E 71 St", "3 Ave & E 71 St", "3 Ave & E 71 St", 
"Central Park West & W 68 St"), end.station.latitude = c(40.7757937668367, 
40.74854862, 40.76873687, 40.76873687, 40.76873687, 40.7734066
), end.station.longitude = c(-73.9762057363987, -73.98808416, 
-73.96119945, -73.96119945, -73.96119945, -73.97782542), bikeid = c(25542, 
21136, 18147, 21211, 26819, 16050), usertype = c("Subscriber", 
"Subscriber", "Customer", "Customer", "Customer", "Subscriber"
), birth.year = c(1965L, 1987L, NA, NA, NA, 2000L), gender = c(2, 
2, 0, 0, 0, 1)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

第三个解决方案应该有效-将所有四个值添加到count:D

grouped <-  citibank %>%
count(start.station.id,
end.station.id,
start.station.latitude,
end.station.latitude)

最新更新