我有两个空间点数据集,一个是起点(出发地),另一个是终点(目的地)。
我想从这些坐标中提取出现次数最多的轨迹。
> salidas
class : SpatialPointsDataFrame
features : 4385
extent : -8.694846, -8.339238, 41.00827, 41.25749 (xmin, xmax, ymin, ymax)
crs : +init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
variables : 3
names : cod, duracion, franja_h
min values : 1.37263685362e+18, 315, 1
max values : 1.37274729362e+18, 13830, 96
> llegadas
class : SpatialPointsDataFrame
features : 4385
extent : -8.756604, -7.739523, 40.48858, 41.4262 (xmin, xmax, ymin, ymax)
crs : +init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
variables : 3
names : cod, duracion, franja_h
min values : 1.37263685362e+18, 315, 1
max values : 1.37274729362e+18, 13830, 96
我认为应该先对这些点进行聚合(降低坐标精度),因为原始坐标过于精细,直接统计得不到多少信息,所以我分别为它们建立了一个 X、Y 坐标网格。
> GridSalidas
X Y Count
1 -8.3375 41.1975 1
2 -8.5125 41.2025 1
3 -8.5325 41.1425 1
4 -8.5325 41.2075 1
5 -8.5325 41.2225 1
6 -8.5475 41.2025 1
7 -8.5475 41.2075 1
8 -8.5475 41.2325 1
9 -8.5525 41.2075 1
10 -8.5525 41.2175 1
> GridLlegadas
X Y Count
1 -7.7375 41.2975 1
2 -7.8625 40.4875 1
3 -8.1475 41.1875 1
4 -8.3075 41.1975 1
5 -8.4725 41.3225 1
6 -8.4875 41.1875 1
7 -8.4925 41.1925 1
8 -8.4975 41.1875 2
9 -8.5025 41.0425 1
10 -8.5025 41.1925 1
因此,我想根据起点和终点找出哪些轨迹最常见。
谢谢!
您需要的其实就是多维分箱(multidimensional binning)。
我生成了一个随机数据集 dt 用于演示。输出结果是一个 data.table,它给出了关于最频繁轨迹的以下信息:
- 定义源网格的x-y坐标的下限和上限
- 定义目标网格的x-y坐标的下限和上限
- 计数
library(data.table)
library(magrittr)
# Demo parameters ----
# N  : number of simulated trajectories (rows of `dt`).
# gp : grid precision, i.e. the step used between consecutive grid breaks.
N <- 5000
set.seed(123)
gp <- 0.1 # grid precision

# Generate an example dataset -----
# One row per trajectory: an origin (x, y) and a destination (x, y), each
# coordinate drawn from a normal distribution. The draw order (origin_x,
# origin_y, destination_x, destination_y) is kept so that the seeded values
# stay reproducible.
dt <- data.table(
  origin_x = rnorm(n = N, mean = 1, sd = 0.1),
  origin_y = rnorm(n = N, mean = 2, sd = 0.1),
  destination_x = rnorm(n = N, mean = 11, sd = 0.1),
  destination_y = rnorm(n = N, mean = 12, sd = 0.1)
)
# Grid formation ----
# For each of the four coordinate columns of `dt`, snap the observed range
# outwards to multiples of `gp`, build equally spaced breaks over that range,
# and assign every point to the index of the interval it falls in.
{
  ## Helpers ----
  # Largest multiple of `step` that is not above `value`.
  grid_floor <- function(value, step) floor(value / step) * step
  # Smallest multiple of `step` that is not below `value`.
  grid_ceiling <- function(value, step) ceiling(value / step) * step

  ## Defining the ranges (LL and UL stand for lower and upper limits, respectively) ----
  {
    origin_x_LL <- grid_floor(min(dt[["origin_x"]]), gp)
    origin_x_UL <- grid_ceiling(max(dt[["origin_x"]]), gp)
    origin_y_LL <- grid_floor(min(dt[["origin_y"]]), gp)
    origin_y_UL <- grid_ceiling(max(dt[["origin_y"]]), gp)
    destination_x_LL <- grid_floor(min(dt[["destination_x"]]), gp)
    destination_x_UL <- grid_ceiling(max(dt[["destination_x"]]), gp)
    destination_y_LL <- grid_floor(min(dt[["destination_y"]]), gp)
    destination_y_UL <- grid_ceiling(max(dt[["destination_y"]]), gp)
  }
  ## Forming the breaks for binning ----
  {
    origin_x_brks <- seq(origin_x_LL, origin_x_UL, by = gp)
    origin_y_brks <- seq(origin_y_LL, origin_y_UL, by = gp)
    destination_x_brks <- seq(destination_x_LL, destination_x_UL, by = gp)
    destination_y_brks <- seq(destination_y_LL, destination_y_UL, by = gp)
  }
  ## Computing the number of bins ----
  # k breaks delimit k - 1 intervals.
  {
    origin_x_Nbin <- length(origin_x_brks) - 1L
    origin_y_Nbin <- length(origin_y_brks) - 1L
    destination_x_Nbin <- length(destination_x_brks) - 1L
    destination_y_Nbin <- length(destination_y_brks) - 1L
  }
  ## Binning ----
  # `.bincode()` returns, per point, the integer index of its interval (NA if
  # the point lies outside the breaks). Intervals are right-closed by default;
  # `include.lowest = TRUE` also closes the first interval on the left so a
  # point sitting exactly on the lowest break is not dropped. `TRUE` is
  # spelled out because the shorthand `T` is an ordinary, reassignable variable.
  {
    origin_x_bin <- .bincode(dt[["origin_x"]], origin_x_brks, include.lowest = TRUE)
    origin_y_bin <- .bincode(dt[["origin_y"]], origin_y_brks, include.lowest = TRUE)
    destination_x_bin <- .bincode(dt[["destination_x"]], destination_x_brks, include.lowest = TRUE)
    destination_y_bin <- .bincode(dt[["destination_y"]], destination_y_brks, include.lowest = TRUE)
  }
}
# Counting grid frequency ----
# Builds `grid_count`: one row per 4-D grid cell (origin x/y bin crossed with
# destination x/y bin) holding the cell's lower/upper limits on each axis and
# the number of trajectories whose origin and destination fall in that cell.
# Cells are listed with the origin-x bin varying slowest and the destination-y
# bin varying fastest; empty cells are kept with a count of 0.
{
  # Tally trajectories per occupied cell in one grouped pass over the points,
  # instead of re-scanning every point once for every cell of the grid.
  point_bins <- data.table(
    ox_bin = origin_x_bin,
    oy_bin = origin_y_bin,
    dx_bin = destination_x_bin,
    dy_bin = destination_y_bin
  )
  occupied_cells <- point_bins[, .(count = .N), by = .(ox_bin, oy_bin, dx_bin, dy_bin)]

  # Enumerate every cell of the grid. `seq_len()` (unlike `seq()`) yields an
  # empty sequence when a dimension has zero bins. `sorted = FALSE` keeps the
  # given order and sets no key; the first column varies slowest.
  all_cells <- CJ(
    ox_bin = seq_len(origin_x_Nbin),
    oy_bin = seq_len(origin_y_Nbin),
    dx_bin = seq_len(destination_x_Nbin),
    dy_bin = seq_len(destination_y_Nbin),
    sorted = FALSE
  )

  # Look up the tally of each cell; rows follow the order of `all_cells`.
  # Cells with no matching tally come back as NA and are set to 0. Points
  # whose bin index is NA match no cell and are therefore not counted.
  cell_count <- occupied_cells[all_cells, on = c("ox_bin", "oy_bin", "dx_bin", "dy_bin")]
  cell_count[is.na(count), count := 0L]

  # Translate bin indices into the limits of the corresponding interval.
  grid_count <- cell_count[, .(
    origin_x_LL = origin_x_brks[ox_bin],
    origin_x_UL = origin_x_brks[ox_bin + 1L],
    origin_y_LL = origin_y_brks[oy_bin],
    origin_y_UL = origin_y_brks[oy_bin + 1L],
    destination_x_LL = destination_x_brks[dx_bin],
    destination_x_UL = destination_x_brks[dx_bin + 1L],
    destination_y_LL = destination_y_brks[dy_bin],
    destination_y_UL = destination_y_brks[dy_bin + 1L],
    count = count
  )]
}
# Getting the most frequent grid ----
# Show every cell of `grid_count` whose count equals the maximum count
# (several rows are printed when cells tie for the maximum).
grid_count[count == max(count)] %>%
  print