r-数据透视更宽的自定义格式



我有以下数据:

ROWID
CIRCUMSTANCE_CODE
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE
17089   0.00    5324477 Person  00002644-b0bd-4e7d-9538-537467845979    ADJ1390021  0.00
26657   0.00    4863804 Person  0000a06c-de25-4b2d-b8f9-b21acfa5f163    ADG2930017  0.00
12863   0.00    5423146 Person  00030e8a-2de0-471f-ad37-0a4cfb4fee30    ADH321001K  0.00
21534   0.00    4979253 Person  00040afb-47dc-419a-8406-1b02d2573d54    ADH3930007  0.00
2338    0.00    5333458 Person  0004c541-f6e9-4466-9af2-b7b0905b5ee5    ADJ864000Z  0.00
11035   0.00    5108294 Person  0007eafb-8bbb-40c1-adab-57110e28398f    ADH271000R  0.00
21865   11.00   5750342 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  11.00
21866   7.00    5750341 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  7.00
21867   25.00   5750343 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  25.88
21868   28.00   5750344 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  28.00

我想重新塑造它,这样对于每个PERSONID,CIRCURMSTANCE CODE的值都是宽的。例如,ID000de731-540f-48e1-84f4-8048a5b3cfdc

将有一行,有四个新列,每个情况代码一个(11.00,7.00,5.88,28.00(

ROWID
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE_1
CODE_2
CODE_3
CODE_4
21865   5750342 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  11.00  7.00  25.88  28.00

如果一个personID没有足够的代码来填充CODE_x列,那么NA就可以了。

这里有一个可能的解决方案。但是,请注意,我将第一个ROWIDCIRCUMSTANCE_ID分配给了每个PERSON_ID,因为否则这些行就不包含相同的信息:

data <- read.table(text = "17089   0.00    5324477 Person  00002644-b0bd-4e7d-9538-537467845979    ADJ1390021  0.00
26657   0.00    4863804 Person  0000a06c-de25-4b2d-b8f9-b21acfa5f163    ADG2930017  0.00
12863   0.00    5423146 Person  00030e8a-2de0-471f-ad37-0a4cfb4fee30    ADH321001K  0.00
21534   0.00    4979253 Person  00040afb-47dc-419a-8406-1b02d2573d54    ADH3930007  0.00
2338    0.00    5333458 Person  0004c541-f6e9-4466-9af2-b7b0905b5ee5    ADJ864000Z  0.00
11035   0.00    5108294 Person  0007eafb-8bbb-40c1-adab-57110e28398f    ADH271000R  0.00
21865   11.00   5750342 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  11.00
21866   7.00    5750341 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  7.00
21867   25.00   5750343 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  25.88
21868   28.00   5750344 Person  000de731-540f-48e1-84f4-8048a5b3cfdc    ADJ960000C  28.00")
col_names <- read.table(text = "ROWID
CIRCUMSTANCE_CODE
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE")
col_names <- col_names$V1
colnames(data) <- col_names
library(tidyr)
library(dplyr)
data %>% 
select(-CIRCUMSTANCE_CODE) %>% 
group_by(PERSON_ID) %>% 
mutate(group_id = seq_len(n()),
ROWID = ROWID[1],
CIRCUMSTANCE_ID = CIRCUMSTANCE_ID[1]) %>% 
ungroup() %>% 
pivot_wider(
names_from = group_id,
values_from = CODE,
names_prefix = "CODE_"
)
# A tibble: 7 x 9
ROWID CIRCUMSTANCE_ID CIRCUMSTANCE_TYPE PERSON_ID                            REPORT_NUMBER CODE_1 CODE_2 CODE_3 CODE_4
<int>           <int> <chr>             <chr>                                <chr>          <dbl>  <dbl>  <dbl>  <dbl>
1 17089         5324477 Person            00002644-b0bd-4e7d-9538-537467845979 ADJ1390021         0     NA   NA       NA
2 26657         4863804 Person            0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017         0     NA   NA       NA
3 12863         5423146 Person            00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K         0     NA   NA       NA
4 21534         4979253 Person            00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007         0     NA   NA       NA
5  2338         5333458 Person            0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z         0     NA   NA       NA
6 11035         5108294 Person            0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R         0     NA   NA       NA
7 21865         5750342 Person            000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C        11      7   25.9     28

您可以使用tidyverse中的包执行以下操作。

library(dplyr)
library(tidyr)
df %>%
group_by(PERSON_ID) %>%
mutate(pivotName = str_c('CODE_', row_number())) %>%
pivot_wider(names_from = pivotName, values_from = CIRCUMSTANCE_CODE) %>%
mutate(across(starts_with('CODE_'), ~ max(.x, na.rm = TRUE)),
across(starts_with('CODE_'), ~ na_if(.x, -Inf))) %>%
slice(1)
#   ROWID CIRCUMSTANCE_ID CIRCUMSTANCE_TYPE PERSON_ID                            # REPORT_NUMBER  CODE CODE_1 CODE_2 CODE_3 CODE_4
#   <dbl>           <dbl> <chr>             <chr>                                # <chr>         <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
# 1 17089         5324477 Person            00002644-b0bd-4e7d-9538-537467845979 ADJ1390021        0      0     NA     NA     NA
# 2 26657         4863804 Person            0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017        0      0     NA     NA     NA
# 3 12863         5423146 Person            00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K        0      0     NA     NA     NA
# 4 21534         4979253 Person            00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007        0      0     NA     NA     NA
# 5  2338         5333458 Person            0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z        0      0     NA     NA     NA
# 6 11035         5108294 Person            0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R        0      0     NA     NA     NA
# 7 21865         5750342 Person            000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C       11     11      7     25     28

数据

df <- structure(list(ROWID = c(17089, 26657, 12863, 21534, 2338, 11035, 
21865, 21866, 21867, 21868), CIRCUMSTANCE_CODE = c(0, 0, 0, 0, 
0, 0, 11, 7, 25, 28), CIRCUMSTANCE_ID = c(5324477, 4863804, 5423146, 
4979253, 5333458, 5108294, 5750342, 5750341, 5750343, 5750344
), CIRCUMSTANCE_TYPE = c("Person", "Person", "Person", "Person", 
"Person", "Person", "Person", "Person", "Person", "Person"), 
PERSON_ID = c("00002644-b0bd-4e7d-9538-537467845979", "0000a06c-de25-4b2d-b8f9-b21acfa5f163", 
"00030e8a-2de0-471f-ad37-0a4cfb4fee30", "00040afb-47dc-419a-8406-1b02d2573d54", 
"0004c541-f6e9-4466-9af2-b7b0905b5ee5", "0007eafb-8bbb-40c1-adab-57110e28398f", 
"000de731-540f-48e1-84f4-8048a5b3cfdc", "000de731-540f-48e1-84f4-8048a5b3cfdc", 
"000de731-540f-48e1-84f4-8048a5b3cfdc", "000de731-540f-48e1-84f4-8048a5b3cfdc"
), REPORT_NUMBER = c("ADJ1390021", "ADG2930017", "ADH321001K", 
"ADH3930007", "ADJ864000Z", "ADH271000R", "ADJ960000C", "ADJ960000C", 
"ADJ960000C", "ADJ960000C"), CODE = c(0, 0, 0, 0, 0, 0, 11, 
7, 25.88, 28)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))```

最新更新