我有以下数据:
ROWID
CIRCUMSTANCE_CODE
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE
17089 0.00 5324477 Person 00002644-b0bd-4e7d-9538-537467845979 ADJ1390021 0.00
26657 0.00 4863804 Person 0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017 0.00
12863 0.00 5423146 Person 00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K 0.00
21534 0.00 4979253 Person 00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007 0.00
2338 0.00 5333458 Person 0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z 0.00
11035 0.00 5108294 Person 0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R 0.00
21865 11.00 5750342 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 11.00
21866 7.00 5750341 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 7.00
21867 25.00 5750343 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 25.88
21868 28.00 5750344 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 28.00
我想重新塑造它,这样对于每个PERSONID,CIRCURMSTANCE CODE的值都是宽的。例如,ID000de731-540f-48e1-84f4-8048a5b3cfdc
将有一行,有四个新列,每个情况代码一个(11.00,7.00,5.88,28.00(
ROWID
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE_1
CODE_2
CODE_3
CODE_4
21865 5750342 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 11.00 7.00 25.88 28.00
如果一个personID没有足够的代码来填充CODE_x列,那么NA就可以了。
这里有一个可能的解决方案。但是,请注意,我将第一个ROWID
和CIRCUMSTANCE_ID
分配给了每个PERSON_ID
,因为否则这些行就不包含相同的信息:
data <- read.table(text = "17089 0.00 5324477 Person 00002644-b0bd-4e7d-9538-537467845979 ADJ1390021 0.00
26657 0.00 4863804 Person 0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017 0.00
12863 0.00 5423146 Person 00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K 0.00
21534 0.00 4979253 Person 00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007 0.00
2338 0.00 5333458 Person 0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z 0.00
11035 0.00 5108294 Person 0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R 0.00
21865 11.00 5750342 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 11.00
21866 7.00 5750341 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 7.00
21867 25.00 5750343 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 25.88
21868 28.00 5750344 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 28.00")
col_names <- read.table(text = "ROWID
CIRCUMSTANCE_CODE
CIRCUMSTANCE_ID
CIRCUMSTANCE_TYPE
PERSON_ID
REPORT_NUMBER
CODE")
col_names <- col_names$V1
colnames(data) <- col_names
library(tidyr)
library(dplyr)
data %>%
select(-CIRCUMSTANCE_CODE) %>%
group_by(PERSON_ID) %>%
mutate(group_id = seq_len(n()),
ROWID = ROWID[1],
CIRCUMSTANCE_ID = CIRCUMSTANCE_ID[1]) %>%
ungroup() %>%
pivot_wider(
names_from = group_id,
values_from = CODE,
names_prefix = "CODE_"
)
# A tibble: 7 x 9
ROWID CIRCUMSTANCE_ID CIRCUMSTANCE_TYPE PERSON_ID REPORT_NUMBER CODE_1 CODE_2 CODE_3 CODE_4
<int> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 17089 5324477 Person 00002644-b0bd-4e7d-9538-537467845979 ADJ1390021 0 NA NA NA
2 26657 4863804 Person 0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017 0 NA NA NA
3 12863 5423146 Person 00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K 0 NA NA NA
4 21534 4979253 Person 00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007 0 NA NA NA
5 2338 5333458 Person 0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z 0 NA NA NA
6 11035 5108294 Person 0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R 0 NA NA NA
7 21865 5750342 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 11 7 25.9 28
您可以使用tidyverse中的包执行以下操作。
library(dplyr)
library(tidyr)
df %>%
group_by(PERSON_ID) %>%
mutate(pivotName = str_c('CODE_', row_number())) %>%
pivot_wider(names_from = pivotName, values_from = CIRCUMSTANCE_CODE) %>%
mutate(across(starts_with('CODE_'), ~ max(.x, na.rm = TRUE)),
across(starts_with('CODE_'), ~ na_if(.x, -Inf))) %>%
slice(1)
# ROWID CIRCUMSTANCE_ID CIRCUMSTANCE_TYPE PERSON_ID # REPORT_NUMBER CODE CODE_1 CODE_2 CODE_3 CODE_4
# <dbl> <dbl> <chr> <chr> # <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 17089 5324477 Person 00002644-b0bd-4e7d-9538-537467845979 ADJ1390021 0 0 NA NA NA
# 2 26657 4863804 Person 0000a06c-de25-4b2d-b8f9-b21acfa5f163 ADG2930017 0 0 NA NA NA
# 3 12863 5423146 Person 00030e8a-2de0-471f-ad37-0a4cfb4fee30 ADH321001K 0 0 NA NA NA
# 4 21534 4979253 Person 00040afb-47dc-419a-8406-1b02d2573d54 ADH3930007 0 0 NA NA NA
# 5 2338 5333458 Person 0004c541-f6e9-4466-9af2-b7b0905b5ee5 ADJ864000Z 0 0 NA NA NA
# 6 11035 5108294 Person 0007eafb-8bbb-40c1-adab-57110e28398f ADH271000R 0 0 NA NA NA
# 7 21865 5750342 Person 000de731-540f-48e1-84f4-8048a5b3cfdc ADJ960000C 11 11 7 25 28
数据
df <- structure(list(ROWID = c(17089, 26657, 12863, 21534, 2338, 11035,
21865, 21866, 21867, 21868), CIRCUMSTANCE_CODE = c(0, 0, 0, 0,
0, 0, 11, 7, 25, 28), CIRCUMSTANCE_ID = c(5324477, 4863804, 5423146,
4979253, 5333458, 5108294, 5750342, 5750341, 5750343, 5750344
), CIRCUMSTANCE_TYPE = c("Person", "Person", "Person", "Person",
"Person", "Person", "Person", "Person", "Person", "Person"),
PERSON_ID = c("00002644-b0bd-4e7d-9538-537467845979", "0000a06c-de25-4b2d-b8f9-b21acfa5f163",
"00030e8a-2de0-471f-ad37-0a4cfb4fee30", "00040afb-47dc-419a-8406-1b02d2573d54",
"0004c541-f6e9-4466-9af2-b7b0905b5ee5", "0007eafb-8bbb-40c1-adab-57110e28398f",
"000de731-540f-48e1-84f4-8048a5b3cfdc", "000de731-540f-48e1-84f4-8048a5b3cfdc",
"000de731-540f-48e1-84f4-8048a5b3cfdc", "000de731-540f-48e1-84f4-8048a5b3cfdc"
), REPORT_NUMBER = c("ADJ1390021", "ADG2930017", "ADH321001K",
"ADH3930007", "ADJ864000Z", "ADH271000R", "ADJ960000C", "ADJ960000C",
"ADJ960000C", "ADJ960000C"), CODE = c(0, 0, 0, 0, 0, 0, 11,
7, 25.88, 28)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))```