r-优化嵌套列表的子集操作



是否可以提高这段代码中最后一个子集(subset)操作的速度?这段代码获取一小部分 OpenStreetMap 数据,查找所有带有名称的道路,并创建一个只包含这些道路的新对象。我不知道该如何优化代码的最后一部分:

# Final step of the example: subset muc by the ids in highway_subset_ids.
highway_subset <- subset(muc, ids = highway_subset_ids)

class(muc)

[1] "osmar" "list"

muc是列表的列表,列表中的每个元素都有一个用于创建子集的id。

以下是完整的示例:

# Download a small OpenStreetMap extract and reduce it to the ways that carry
# both a "highway" tag and a "name" tag, plus whatever find_down() resolves
# for those ways.
library(osmar)

api_url <- "https://api.openstreetmap.org/api/0.6/"
src <- osmsource_api(url = api_url)

# Bounding box built from a centre point and a 1000 x 1000 extent.
muc_bbox <- center_bbox(11.575278, 48.137222, 1000, 1000)
muc <- get_osm(muc_bbox, src)

# Step 1: osmar object restricted to the ways tagged "highway".
highway_way_ids <- find(muc, way(tags(k == "highway")))
highway_ways <- subset(muc, way_ids = highway_way_ids)

# Step 2: ids of those highway ways that also have a "name" tag.
named_way_ids <- find(highway_ways, way(tags(k == "name")))

# Step 3: expand the way ids with find_down() on the full data set.
highway_subset_ids <- find_down(muc, way(named_way_ids))

# Step 4: the subset operation the question wants to speed up.
highway_subset <- subset(muc, ids = highway_subset_ids)

事先非常感谢。

更新

如果您在使用 SSL 时遇到问题,请尝试复制并粘贴下面的代码示例。这是我能给出的最小可复现示例。

我想优化的是这一行:

final_subset <- subset(highway_subset, ids = highway_subset_ids)

# Self-contained example data: a literal "osmar" object (class c("osmar",
# "list")) with three components -- nodes, ways and relations. The nodes and
# relations components hold only zero-length columns; the ways component holds
# two ways together with their tags and refs.
library("osmar")
highway_subset <-
structure(list(nodes = structure(list(
attrs = structure(
list(
id = numeric(0),
visible = character(0),
timestamp = structure(
list(
sec = numeric(0),
min = integer(0),
hour = integer(0),
mday = integer(0),
mon = integer(0),
year = integer(0),
wday = integer(0),
yday = integer(0),
isdst = integer(0),
zone = character(0),
gmtoff = integer(0)
),
class = c("POSIXlt", "POSIXt")
),
version = numeric(0),
changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(
integer(0),
.Label = c("2455020", "2590140", "367380"),
class = "factor"
),
lat = numeric(0),
lon = numeric(0)
),
row.names = integer(0),
class = "data.frame"
),
tags = structure(
list(
id = numeric(0),
k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")
),
row.names = integer(0),
class = "data.frame"
)
),
class = c("nodes", "osmar_element", "list")
),
# ways: two rows (ids 105071009 and 366457476) with attrs, tags and refs.
ways = structure(
list(
attrs = structure(
list(
id = c(105071009, 366457476),
visible = c("true", "true"),
timestamp = structure(
list(
sec = c(10, 48),
min = c(54L, 15L),
hour = c(13L, 20L),
mday = c(4L, 15L),
mon = c(2L, 4L),
year = 117:116,
wday = c(6L, 0L),
yday = c(62L, 135L),
isdst = 0:1,
zone = c("CET", "CEST"),
gmtoff = c(NA_integer_, NA_integer_)
),
class = c("POSIXlt", "POSIXt")
),
version = c(15, 5),
changeset = c(46573027, 39338422),
user = structure(
2:1,
.Label = c("bjoern262", "saerdnaer"),
class = "factor"
),
uid = structure(
4:3,
.Label = c("367380",
"64536", "651621", "6998"),
class = "factor"
)
),
row.names = c(2L,
4L),
class = "data.frame"
),
# tags: one row per (way id, key, value); k and v are stored as factors.
tags = structure(
list(
id = c(
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
105071009,
366457476,
366457476,
366457476,
366457476,
366457476
),
k = structure(
c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 3L, 5L, 6L, 7L, 11L),
.Label = c(
"conveying",
"description",
"highway",
"incline",
"indoor",
"layer",
"level",
"oneway",
"operator",
"ref",
"tunnel"
),
class = "factor"
),
v = structure(
c(6L,
9L, 10L, 4L, 11L, 3L, 2L, 11L, 8L, 7L, 11L, 5L, 11L, 1L, 1L,
11L),
.Label = c(
"-3",
"-3;-4",
"-4",
"down",
"footway",
"forward",
"MP19",
"MVG",
"Rolltreppe MP19",
"steps",
"yes"
),
class = "factor"
)
),
row.names = 4:19,
class = "data.frame"
),
# refs: pairs of way id and referenced id (presumably node ids -- the nodes
# table in this fixture is empty, so this cannot be confirmed here).
refs = structure(
list(
id = c(105071009, 105071009, 366457476,
366457476, 366457476),
ref = c(3270556979, 1211172719, 3270556979,
3704371485, 3704371444)
),
row.names = c(20L, 21L, 68L, 69L,
70L),
class = "data.frame"
)
),
class = c("ways", "osmar_element",
"list")
),
# relations: attrs, tags and refs are all zero-row data frames.
relations = structure(
list(
attrs = structure(
list(
id = numeric(0),
visible = character(0),
timestamp = structure(
list(
sec = numeric(0),
min = integer(0),
hour = integer(0),
mday = integer(0),
mon = integer(0),
year = integer(0),
wday = integer(0),
yday = integer(0),
isdst = integer(0),
zone = character(0),
gmtoff = integer(0)
),
class = c("POSIXlt", "POSIXt")
),
version = numeric(0),
changeset = numeric(0),
user = structure(integer(0), .Label = character(0), class = "factor"),
uid = structure(
integer(0),
.Label = c(
"137242",
"161619",
"2455020",
"2590140",
"531886",
"72235",
"8748",
"9451067"
),
class = "factor"
)
),
row.names = integer(0),
class = "data.frame"
),
tags = structure(
list(
id = numeric(0),
k = structure(integer(0), .Label = character(0), class = "factor"),
v = structure(integer(0), .Label = character(0), class = "factor")
),
row.names = integer(0),
class = "data.frame"
),
refs = structure(
list(
id = numeric(0),
type = structure(integer(0), .Label = character(0), class = "factor"),
ref = numeric(0),
role = structure(integer(0), .Label = character(0), class = "factor")
),
row.names = integer(0),
class = "data.frame"
)
),
class = c("relations",
"osmar_element", "list")
)
),
class = c("osmar", "list")
)
# Take the ids of every way in the example object, expand them with
# find_down(), and subset the object by the resulting ids.
example_way_ids <- highway_subset$ways$attrs$id
highway_subset_ids <- find_down(highway_subset, way(example_way_ids))

# This is the call whose run time the question is about.
final_subset <- subset(highway_subset, ids = highway_subset_ids)

谢谢!

我对您的代码做了性能分析(profiling):

# Re-run the question's example, wrapping each of the four steps in
# system.time() so their costs can be compared. The number after each call is
# the timing the answer's author recorded for that step.
library("osmar")
src <- osmsource_api(url = "https://api.openstreetmap.org/api/0.6/")
muc_bbox <- center_bbox(11.575278, 48.137222, 1000, 1000)
muc <- get_osm(muc_bbox, src)
# Step 1: subset muc to the ways tagged "highway".
system.time(
highway_subset_ids <- subset(muc, way_ids = find(muc, way(tags(k == "highway"))))
)
# 0.157  (largest of the four timings)
# Step 2: among those, find the ways that also have a "name" tag.
system.time(
highway_subset_ids <- find(highway_subset_ids, way(tags(k == "name")))
)
# 0.001
# Step 3: expand the way ids with find_down() on the full data set.
system.time(
highway_subset_ids <- find_down(muc, way(highway_subset_ids))
)
# 0.008
# Step 4: the final subset the question asks about.
system.time(
highway_subset <- subset(muc, ids = highway_subset_ids)
)
# 0.025

正如你所看到的,对我来说,最后一个 subset 并不是瓶颈,第一个才是(耗时约为最后一个的 6 倍)。

内部数据并不是很大:

  • nodes:15157 行
  • ways:2938 行
  • tags:11966 行
  • relations:350 行
  • 另一个 tags:3270 行

您提到您需要多次执行子集操作。解决办法可能是尝试将代码“矢量化”。我指的不是显而易见的 lapply,而是提取内部的数据框,用 rbind 把它们合并,然后只做一次子集操作,必要时再把结果拆分回去。这里可以使用 data.table 来获得额外的速度。这样做比在循环中对仅约 15000 行的数据使用 data.table 取子集更有益,因为在循环中的收益要小得多。

为了理解如何“矢量化”,您需要了解 osmar 的 subset 是如何工作的。如果看一下源代码,这并不难: https://github.com/cran/osmar/blob/master/R/osmar-subsetting.R

  • 从所有要取子集的对象中取出数据框
  • 用 rbindlist 合并它们
  • 使用 `[.data.table` 取子集
  • 根据需要再拆分
  • 如果需要,转换回原始对象

还请注意,osmar 包相当旧(发布于 2013 年),并且间接依赖于 sp 等仍在积极开发的包。过去 7 年中这些依赖包可能引入了不兼容的更改,因此您可能会遇到一些由此导致的问题。

是的,这应该是可行的。您可以在控制台中输入 str(muc) 来查看 osmar 对象的结构,也可以运行 osmar:::subset.osmar(它由 osmar:::subset_ways 等组件函数构成)来查看执行子集操作的代码。这些代码似乎都是用 base R 编写的,可以用例如 data.table 来加速。

策略可能是找到一种更有效的方法来一次性完成这整套操作:

# The four steps from the question, which the answer suggests replacing with
# a single, more efficient operation. Note that highway_subset_ids is reused:
# it first holds the result of subset(), then the results of find() and
# find_down().
highway_subset_ids <- subset(muc, way_ids = find(muc, way(tags(k == "highway"))))
highway_subset_ids <- find(highway_subset_ids, way(tags(k == "name")))
highway_subset_ids <- find_down(muc, way(highway_subset_ids))
highway_subset <- subset(muc, ids = highway_subset_ids)

你的重点和方法取决于你项目其余部分的细节以及你实际想做的事情

最新更新