r语言 - 解析json的一列并与其他列绑定以生成数据框架



我有数据的格式为:

have <- structure(list(V1 = c(4L, 28L, 2L), 
V2 = c("[{"group":1,"topic":"A"},{"group":1,"topic":"B"},{"group":2,"topic":"C"},{"group":2,"topic":"T"},{"group":2,"topic":"U"},{"group":3,"topic":"V"},{"group":3,"topic":"D"},{"group":3,"topic":"R"},{"group":4,"topic":"A"},{"group":4,"topic":"Q"},{"group":4,"topic":"S"},{"group":4,"topic":"W"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"F"},{"group":6,"topic":"G"},{"group":6,"topic":"H"},{"group":6,"topic":"I"},{"group":6,"topic":"J"},{"group":6,"topic":"K"},{"group":6,"topic":"L"},{"group":6,"topic":"M"},{"group":6,"topic":"N"}]",
"[]", 
"[{"group":2,"topic":"C"},{"group":3,"topic":"D"},{"group":6,"topic":"O"},{"group":6,"topic":"P"},{"group":6,"topic":"E"},{"group":6,"topic":"G"},{"group":6,"topic":"M"}]")
), 
row.names = c(NA, 3L), 
class = "data.frame")

V2的内容是每一行的嵌套分组,如[{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]

我想得到一个宽的数据框,有一个指标(1/0)为组+主题(见also_have)的每一行的每个组合。像这样:

# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

第一步是解析json.

我可以使用purrr::map(have$V2, jsonlite::fromJSON)将其解嵌到列表中,但是我不确定如何将V1列(我们可能将其重命名为id)绑定到结果列表的每个元素(注意列表元素2是空的,因为V1==28是空的)。下面是第一个元素的代码片段,其中id(V1)添加了。

[[1]]
group topic  id
1      1     A   4
2      1     B   4
3      2     C   4
4      2     T   4
...

或者,我认为purrr::map_df(have$V2, jsonlite::fromJSON)会让我更接近我最终需要的,但在这里我也不确定如何添加行id(V1)。

df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df
What I get:
group topic
1      1     A
2      1     B
3      2     C
4      2     T
...
What I want (notice `V1==28` does not appear): 
group topic id
1      1     A  4
2      1     B  4
3      2     C  4
4      2     T  4
5      2     U  4
6      3     V  4
7      3     D  4
8      3     R  4
9      4     A  4
10     4     Q  4
11     4     S  4
12     4     W  4
13     6     O  4
14     6     P  4
15     6     E  4
16     6     F  4
17     6     G  4
18     6     H  4
19     6     I  4
20     6     J  4
21     6     K  4
22     6     L  4
23     6     M  4
24     6     N  4
25     2     C  2
26     3     D  2
27     6     O  2
28     6     P  2
29     6     E  2
30     6     G  2
31     6     M  2

我想如果我能得到上面的数据框与id我可以得到剩下的方式。最终目标是将此信息与also_have连接起来,然后进行宽枢轴。

# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
mutate(topic_id = 1:n()) %>%
magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
select(topic_id, group, topic)
# pivot wide
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

更新:

应用@akrun的解决方案:

purrr::map_dfr(setNames(have$V2, have$V1), 
jsonlite::fromJSON, 
.id = 'V1') %>%
rename(id = V1) %>%
left_join(also_have, by=c("group", "topic")) %>%
select(-group, -topic) %>%
mutate(value = 1) %>%
pivot_wider(id_cols = id, 
names_from = topic_id,
names_prefix = "topic_id",
values_from = value,
values_fill = 0
) %>% 
full_join(tibble(id = as.character(have$V1))) %>%
replace(is.na(.), 0)
# A tibble: 3 x 25
id    topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
<chr>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1 4             1         1          1          1          1          1          1
2 2             0         0          1          0          0          0          1
3 28            0         0          0          0          0          0          0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
#   topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
#   topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
#   topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
#   topic_id143 <dbl>, topic_id144 <dbl>

我们可以传递一个命名向量,然后在map_dfr中使用.id

purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')

与产出

id group topic
1   4     1     A
2   4     1     B
3   4     2     C
4   4     2     T
5   4     2     U
6   4     3     V
7   4     3     D
8   4     3     R
9   4     4     A
10  4     4     Q
11  4     4     S
12  4     4     W
...

或者在使用rowwise后在dplyr框架内完成

library(tidyr)
have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>% 
select(-V2)
# A tibble: 32 x 3
ID group topic
<int> <int> <chr>
1     4     1 A    
2     4     1 B    
3     4     2 C    
4     4     2 T    
5     4     2 U    
6     4     3 V    
7     4     3 D    
8     4     3 R    
9     4     4 A    
10     4     4 Q    
# … with 22 more rows

第二步执行join

out <-  have %>%
rowwise %>%
transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
ungroup %>%
unnest(c(V2), keep_empty = TRUE) %>% 
select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
ID group topic topic_id
<int> <int> <chr>    <int>
1     4     1 A            1
2     4     1 B            2
3     4     2 C           29
4     4     2 T           46
5     4     2 U           47
6     4     3 V           74
7     4     3 D           56
8     4     3 R           70
9     4     4 A           79
10     4     4 Q           95
# … with 153 more rows

相关内容

  • 没有找到相关文章

最新更新