我有数据的格式为:
have <- structure(list(V1 = c(4L, 28L, 2L),
V2 = c("[{"group":1,"topic":"A"},{"group":1,"topic":"B"},{"group":2,"topic":"C"},{"group":2,"topic":"T"},{"group":2,"topic":"U"},{"group":3,"topic":"V"},{"group":3,"topic":"D"},{"group":3,"topic":"R"},{"group":4,"topic":"A"},{"group":4,"topic":"Q"},{"group":4,"topic":"S"},{"group":4,"topic":"W"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"F"},{"group":6,"topic":"G"},{"group":6,"topic":"H"},{"group":6,"topic":"I"},{"group":6,"topic":"J"},{"group":6,"topic":"K"},{"group":6,"topic":"L"},{"group":6,"topic":"M"},{"group":6,"topic":"N"}]",
"[]",
"[{"group":2,"topic":"C"},{"group":3,"topic":"D"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"G"},{"group":6,"topic":"M"}]")
),
row.names = c(NA, 3L),
class = "data.frame")
V2
的内容是每一行的嵌套分组,如[{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]
我想得到一个宽的数据框,有一个指标(1/0)为组+主题(见also_have
)的每一行的每个组合。像这样:
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
第一步是解析json.
我可以使用purrr::map(have$V2, jsonlite::fromJSON)
将其解嵌到列表中,但是我不确定如何将V1
列(我们可能将其重命名为id
)绑定到结果列表的每个元素(注意列表元素2是空的,因为V1==28
是空的)。下面是第一个元素的代码片段,其中id
(V1
)添加了。
[[1]]
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
...
或者,我认为purrr::map_df(have$V2, jsonlite::fromJSON)
会让我更接近我最终需要的,但在这里我也不确定如何添加行id
(V1
)。
df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df
What I get:
group topic
1 1 A
2 1 B
3 2 C
4 2 T
...
What I want (notice `V1==28` does not appear):
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
5 2 U 4
6 3 V 4
7 3 D 4
8 3 R 4
9 4 A 4
10 4 Q 4
11 4 S 4
12 4 W 4
13 6 O 4
14 6 P 4
15 6 E 4
16 6 F 4
17 6 G 4
18 6 H 4
19 6 I 4
20 6 J 4
21 6 K 4
22 6 L 4
23 6 M 4
24 6 N 4
25 2 C 2
26 3 D 2
27 6 O 2
28 6 P 2
29 6 E 2
30 6 G 2
31 6 M 2
。
我想如果我能得到上面的数据框与id
我可以得到剩下的方式。最终目标是将此信息与also_have
连接起来,然后进行宽枢轴。
# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
mutate(topic_id = 1:n()) %>%
magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
select(topic_id, group, topic)
# pivot wide
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
更新:
应用@akrun的解决方案:
purrr::map_dfr(setNames(have$V2, have$V1),
jsonlite::fromJSON,
.id = 'V1') %>%
rename(id = V1) %>%
left_join(also_have, by=c("group", "topic")) %>%
select(-group, -topic) %>%
mutate(value = 1) %>%
pivot_wider(id_cols = id,
names_from = topic_id,
names_prefix = "topic_id",
values_from = value,
values_fill = 0
) %>%
full_join(tibble(id = as.character(have$V1))) %>%
replace(is.na(.), 0)
# A tibble: 3 x 25
id topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 1 1 1 1 1 1 1
2 2 0 0 1 0 0 0 1
3 28 0 0 0 0 0 0 0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
# topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
# topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
# topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
# topic_id143 <dbl>, topic_id144 <dbl>
我们可以传递一个命名向量,然后在map_dfr
中使用.id
purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')
与产出
id group topic
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
11 4 4 S
12 4 4 W
...
或者在使用rowwise
后在dplyr
框架内完成
library(tidyr)
have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>%
select(-V2)
# A tibble: 32 x 3
ID group topic
<int> <int> <chr>
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
# … with 22 more rows
第二步执行join
out <- have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>%
select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
ID group topic topic_id
<int> <int> <chr> <int>
1 4 1 A 1
2 4 1 B 2
3 4 2 C 29
4 4 2 T 46
5 4 2 U 47
6 4 3 V 74
7 4 3 D 56
8 4 3 R 70
9 4 4 A 79
10 4 4 Q 95
# … with 153 more rows