我有一个数据帧,每一行都是唯一的观察结果。
如果观测结果位于指定的时间距离内(例如30天(,则观测结果在时间上重叠。
如果观测结果位于指定的空间距离(例如20公里(内,则它们在空间中重叠。
我正在收集在时间和空间上重叠的观察结果。我想制作一个列(重叠(,其中包含向量,这些向量的观测值与观测值重叠。我已经尝试了下面的解决方案,但运行时间太差,该解决方案不适用。
library(dplyr)
library(lubridate)
library(purrr)
library(geosphere)
spat_proximity <- function(x, y, z) {
return(which(map_dbl(y, ~ distGeo(., x)) <= z))}
temp_proximity <- function(x, y, z) {
return(which(map_dbl(y, ~ abs(x - .)) <= z))}
test %>%
mutate(overlaps = map2(map(place, ~ spat_proximity(., place, 20000)),
map(time, ~ temp_proximity(., time, 30)),
~ intersect(.x, .y)))
关于如何加快速度的想法将不胜感激。
所需输出
structure(list(id = 1:42, time = structure(c(1478601762, 1475170279,
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558,
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588,
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220,
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844,
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870,
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543,
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581,
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544,
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555,
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948,
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442,
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036,
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961,
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941,
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721,
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524,
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685,
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284,
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424,
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468,
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919,
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519,
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256,
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284,
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173,
53.6107027769073), c(7.55411005022882, 54.1905803935834)), overlaps = list(
1L, 2L, 3L, 4L, c(5L, 7L), 6L, c(5L, 7L), 8L, 9L, 10L, 11L,
12L, 13L, c(14L, 16L), 15L, c(14L, 16L), 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L)), row.names = c(NA,
-42L), class = c("tbl_df", "tbl", "data.frame"))
数据
structure(list(id = 1:42, time = structure(c(1478601762, 1475170279,
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558,
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588,
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220,
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844,
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870,
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543,
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581,
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544,
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555,
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948,
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442,
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036,
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961,
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941,
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721,
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524,
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685,
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284,
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424,
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468,
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919,
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519,
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256,
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284,
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173,
53.6107027769073), c(7.55411005022882, 54.1905803935834))), row.names = c(NA,
-42L), class = c("tbl_df", "tbl", "data.frame"))
如果你真的想要速度,你可以编写自己的C++代码来计算距离(因为geosphere
很慢(和时间比较
示例
将此代码保存在文件中,例如"~/Desktop/find_overlaps.cpp"
您需要安装Rcpp
-install.packages("Rcpp")
#include "Rcpp.h"
#include <math.h>
static const double earth = 6378137.0; // WSG-84 definition
// haversine formula taken from the geodist library
// - https://github.com/hypertidy/geodist
double distance_haversine(double x1, double y1, double x2, double y2) {
double cosy1 = cos( y1 * M_PI / 180.0 );
double cosy2 = cos( y2 * M_PI / 180.0 );
double sxd = sin ((x2 - x1) * M_PI / 360.0);
double syd = sin ((y2 - y1) * M_PI / 360.0);
double d = syd * syd + cosy1 * cosy2 * sxd * sxd;
d = 2.0 * earth * asin (sqrt (d));
return (d);
}
// returns true if second date is within 30 days of the first
bool within_days( int first_date, int second_date ) {
int days = 30 * 24 * 60 * 60;
int lower_bound = first_date - days;
int upper_bound = first_date + days;
return lower_bound <= second_date && second_date <= upper_bound;
}
bool within_distance( Rcpp::NumericVector start_place, Rcpp::NumericVector end_place, double distance_limit = 20000.0 ) {
double x1 = start_place[0];
double y1 = start_place[1];
double x2 = end_place[0];
double y2 = end_place[1];
return distance_haversine(x1, y1, x2, y2) <= distance_limit;
}
// [[Rcpp::export]]
SEXP find_overlaps( Rcpp::NumericVector ids, Rcpp::IntegerVector dates, Rcpp::List place ) {
R_xlen_t n = dates.length();
R_xlen_t i, j;
Rcpp::List res( n );
R_xlen_t result_counter;
for( i = 0; i < n; ++i ) {
Rcpp::IntegerVector overlaps( n ); // initialise vector to store results
result_counter = 0;
for( j = 0; j < n; ++j ) {
// ignore self-comparisons
if( i != j ) {
int first_date = dates[ i ];
int second_date = dates[ j ];
Rcpp::NumericVector first_place = place[ i ];
Rcpp::NumericVector second_place = place[ j ];
// check the place values exist
if( first_place.length() != 2 || second_place.length() != 2 ) {
continue;
}
if( within_days( first_date, second_date) && within_distance( first_place, second_place ) ) {
overlaps[ result_counter ] = j;
result_counter++;
}
}
if( result_counter > 0 ) {
Rcpp::IntegerVector id_idx = overlaps[ Rcpp::Range( 0, result_counter - 1 ) ];
res[ i ] = ids[ id_idx ];
}
}
}
return res;
}
然后在R 中获取
library(Rcpp)
Rcpp::sourceCpp(file = "~/Desktop/find_overlaps.cpp")
res <- find_overlaps( df$id, df$time, df$place )
df$overlaps <- res
df
# id time place overlaps
# 1 1 2016-11-08 10:42:42 7.597294, 52.605228 NULL
# 2 2 2016-09-29 17:31:19 9.997284, 53.437737 NULL
# 3 3 2016-07-29 05:30:19 10.11145, 53.12959 NULL
# 4 4 2016-05-05 09:42:16 7.741152, 53.555355 NULL
# 5 5 2016-09-24 17:51:09 9.828951, 53.100932 7
# 6 6 2017-02-27 17:28:27 10.06111, 53.19088 NULL
# 7 7 2016-09-30 02:48:41 10.11344, 53.14506 5
# 8 8 2016-07-16 21:45:58 8.590017, 53.176780 NULL
# 9 9 2016-12-14 13:38:38 6.439392, 52.552093 NULL
# 10 10 2017-01-31 21:13:17 8.388111, 53.904306 NULL
# 11 11 2017-03-04 23:19:36 6.200619, 52.462037 NULL
# 12 12 2017-07-29 00:36:58 8.666563, 52.826970 NULL
# 13 13 2017-11-09 22:29:55 9.921275, 53.124005 NULL
# 14 14 2018-01-24 21:16:28 9.77811, 53.14458 16
# 15 15 2017-06-09 22:42:55 10.09724, 53.16043 NULL
# 16 16 2018-01-21 14:49:04 10.04740, 53.16981 14
# 17 17 2017-10-09 19:10:42 9.237734, 53.212038 NULL
# 18 18 2018-02-03 10:39:23 8.295242, 52.822333 NULL
# 19 19 2017-05-29 15:04:58 6.636907, 53.443673 NULL
# 20 20 2018-02-27 21:00:20 6.898393, 53.947454 NULL
# ...
备注
- 我运行了一个快速基准测试,它在微秒内运行在您的示例上
- 我故意忽略了自重叠(因此
NULL
值(
使用spatialrisk
和dplyr
的可能解决方案。spatialrisk
中的关键函数是用C++(Rcpp(编写的,因此速度非常快。
spatialrisk::points_in_circle()
计算从中心点算起的半径范围内的观测值。请注意,距离是使用Haversine公式计算的。
library(spatialrisk)
library(dplyr)
# Find rows within time period
overlap_time_fn <- function(df, days, time_ref){
seconds <- days * 24 * 3600
df %>%
filter(time < time_ref + seconds, time > time_ref - seconds)
}
# Define radius in meters
dist <- 20000
# Define number of days
days <- 30
df %>%
mutate(lon = place[[1]][1]) %>% # add lon column
mutate(lat = place[[1]][2]) %>% # add lat column
rowwise() %>%
mutate(overlap_spat = toString(points_in_circle(., lon_center = lon,
lat_center = lat,
radius = dist)$id)) %>%
mutate(overlap_time = toString(overlap_time_fn(., days = days,
time_ref = time)$id))
data.table
:的可能解决方案
- 使用非等联接来减少空间和时间上匹配对的数量
- 仅计算所选配对的距离和时间差
- 保持结果与预期标准完全对应
- 按
id
分组
在示例数据集中,性能比较显示了x20
的改进,请参见下文。
library(dplyr)
library(purrr)
library(geosphere)
library(data.table)
maxtime <- 30
maxdist <- 20000
setDT(test)
microbenchmark::microbenchmark( datatable = {# Calculate box around each point
test[,c('x','y','boxxmin','boxxmax','boxymin','boxymax','timemin','timemax'):=.(
sapply(place,function(x) x[1]),
sapply(place,function(x) x[2]),
sapply(place,function(x) x[1] - maxdist),
sapply(place,function(x) x[1] + maxdist),
sapply(place,function(x) x[2] - maxdist),
sapply(place,function(x) x[2] + maxdist),
sapply(time,function(t) t - maxtime * 86400),
sapply(time,function(t) t + maxtime * 86400))]
# Select near enough places
result <- test[test,.(place,i.id,i.time,i.place,i.x,i.y,id,x,y,time, place),on=.(x>=boxxmin, y>=boxymin, x <= boxxmax, y<= boxymax , time >= timemin, time <= timemax)]
# Calculate spatio-temporal distances
result[,c('dist','dt'):=.(distGeo(matrix(unlist(i.place),ncol=2,byrow=T),matrix(unlist(place),ncol=2,byrow=T)),
abs(as.numeric(difftime(i.time,time,unit = 'days')))) ]
# Calculate overlaps
overlaps <- result[dt < 30 & dist < 20000 ][,.(overlap = list(i.id)), by = .(id)]
test[overlaps,.(id,time,place,overlap),on=.(id=id)]
id time place overlap
1: 1 2016-11-08 10:42:42 7.597294,52.605228 1
2: 2 2016-09-29 17:31:19 9.997284,53.437737 2
3: 3 2016-07-29 05:30:19 10.11145,53.12959 3
4: 4 2016-05-05 09:42:16 7.741152,53.555355 4
5: 5 2016-09-24 17:51:09 9.828951,53.100932 5,7
6: 7 2016-09-30 02:48:41 10.11344,53.14506 5,7
7: 6 2017-02-27 17:28:27 10.06111,53.19088 6
8: 8 2016-07-16 21:45:58 8.590017,53.176780 8
9: 9 2016-12-14 13:38:38 6.439392,52.552093 9
10: 10 2017-01-31 21:13:17 8.388111,53.904306 10
11: 11 2017-03-04 23:19:36 6.200619,52.462037 11
12: 12 2017-07-29 00:36:58 8.666563,52.826970 12
13: 13 2017-11-09 22:29:55 9.921275,53.124005 13
14: 14 2018-01-24 21:16:28 9.77811,53.14458 14,16
15: 16 2018-01-21 14:49:04 10.04740,53.16981 14,16
16: 15 2017-06-09 22:42:55 10.09724,53.16043 15
17: 17 2017-10-09 19:10:42 9.237734,53.212038 17
...
性能比较:
Unit: milliseconds
expr min lq mean median uq max neval cld
datatable 9.0788 10.21605 11.75099 10.7112 11.7880 24.6698 100 a
purrr 176.7696 199.85610 209.79916 209.2215 218.3165 253.2139 100 b
尝试lapply
函数,大多数时候它更高效,有时用~
定义函数不是一个好主意。试试这个:
p <- do.call(rbind,data$place)
t <- data$time
y <- data %>%
mutate(overlaps = map2(lapply(place,function(x)which(distGeo(x,p2=p)<=20000)),
map(time,function(x)which(abs(as.numeric(difftime(x,t,units = "days")))<=30)),
intersect))