r-如何使用tidyr工具整理大型数据帧



目前,我有这么宽的数据帧。

> dput(head(data))
structure(list(Host.H = c("Human", "Human", "Human", "Human", 
"Human", "Human"), Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA", 
">H3-USA", ">H3-USA", ">H3-USA"), Start = c(1L, 121L, 161L, 401L, 
721L, 1081L), End = c(160L, 240L, 280L, 520L, 1040L, 1280L), 
Strand.H = c("Forward", "Forward", "Forward", "Forward", 
"Forward", "Forward"), P.H = c(0.995962, 0.985782, 0.997249, 
0.983122, 0.998574, 0.993674), Locus_ID.H = c("id "locus1"", 
"id "locus5"", "id "locus7"", "id "locus8"", "id "locus10"", 
"id "locus12""), Host.I = c(NA, "Intermediate", NA, NA, 
NA, NA), Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA), 
Strand.I = c(NA, "Forward", NA, NA, NA, NA), P.I = c(NA, 
0.988441, NA, NA, NA, NA), Locus_ID.I = c(NA, "id "locus5"", 
NA, NA, NA, NA), Host.B = c(NA, "Bat", "Bat", "Bat", "Bat", 
NA), Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", 
">B2-RS-CHA", NA), Strand.B = c(NA, "Forward", "Forward", 
"Forward", "Forward", NA), P.B = c(NA, 0.987457, 0.997273, 
0.975433, 0.998187, NA), Locus_ID.B = c(NA, "id "locus7"", 
"id "locus9"", "id "locus10"", "id "locus11"", NA), 
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus", 
NA), Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", 
">I3-MM-CHA", NA), Strand.C = c(NA, "Forward", "Forward", 
"Forward", "Forward", NA), P.C = c(NA, 0.98647, 0.997287, 
0.981532, 0.998712, NA), Locus_ID.C = c(NA, "id "locus7"", 
"id "locus9"", "id "locus10"", "id "locus12"", NA), 
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared", 
"Unique")), row.names = c(NA, 6L), class = "data.frame")

我一直在寻找整理这些数据的方法。要执行此操作,我需要将后缀为(.H、.I、.B和.C(的所有列按如下方式分组到每列中(Host、Seq_ID、Start、End、Strand、P、Locus_ID和Type(。注意,";类型";必须根据对应于每行的指定值进行分配,我用下面的所需输出示例进行解释

Host         Seq_ID     Start    End     Strand      P     Locus_ID      Type
Human       >H3-USA      1      160      Forward    0.99   id "locus1"   Unique
Human       >H3-USA      121    240      Forward    0.98   id "locus5"   Conserved
Intermediate>I3-MM-CHA   121    240      Forward    0.98   id "locus5"   Conserved
Bat         >B2-RS-CHA   121    240      Forward    0.98   id "locus7"   Conserved
Consensus   >I3-MM-CHA   121    240      Forward    0.98   id "locus7"   Conserved
Human       >H3-USA      161    280      Forward    0.99   id "locus7"   Shared
so on...

然后我想总结一下;启动";以及";结束";位置是相等的;主机";是一样的。

我曾尝试使用";枢轴更长";功能,但未能使其工作。因此,我希望你能与我合作。我将非常感谢

以下是tidyverse中的一个解决方案,它通常利用dplyr,特别是tidyr::pivot_longer()

library(tidyverse)

# ...
# Code to generate 'data'.
# ...

data_2 <- data %>% pivot_longer(# "Unpivot" all columns except these:
cols = !c(Start, End, Type),
# Uses regex lookaround to target the last ".",
# used to separate column prefixes from suffixes.
names_sep = "\.(?!.*\.)",
# "Unpivot" to consolidate columns with the same
# prefix, and then record the suffixes in a new
# 'Suffix' column.
names_to = c(".value", "Suffix")) %>%
# Remove rows with missing 'Host'.
filter(!is.na(Host)) %>%
# Within each group of 'Host' and 'Type', select only the first row with each
# combo of 'Start' and 'End'.
group_by(Host, Type, Start, End) %>% filter(row_number() == 1) %>% ungroup() %>%
# Select the columns in proper order.
select(Host, Suffix, Seq_ID, Start, End, Strand, P, Locus_ID, Type) %>%
# Sort as desired.
arrange(Start, End) %>%
# Convert back into data.frame, if desired.
as.data.frame()

# Print the results.
data_2

给定数据集data,如您提供的示例

data <- structure(list(Host.H = c("Human", "Human", "Human", "Human", "Human", "Human"),
Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA"),
Start = c(1, 121, 161, 401, 721, 1081),
End = c(160, 240, 280, 520, 1040, 1280),
Strand.H = c("Forward", "Forward", "Forward", "Forward", "Forward", "Forward"),
P.H = c(0.995962, 0.985782, 0.997249, 0.983122, 0.998574, 0.993674),
Locus_ID.H = c("id "locus1"", "id "locus5"", "id "locus7"", "id "locus8"", "id "locus10"", "id "locus12""),
Host.I = c(NA, "Intermediate", NA, NA, NA, NA),
Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA),
P.I = c(NA, 0.988441, NA, NA, NA, NA),
Locus_ID.I = c(NA, "id "locus5"", NA, NA, NA, NA),
Host.B = c(NA, "Bat", "Bat", "Bat", "Bat", NA),
Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", NA),
Strand.B = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.B = c(NA, 0.987457, 0.997273, 0.975433, 0.998187, NA),
Locus_ID.B = c(NA, "id "locus7"", "id "locus9"", "id "locus10"", "id "locus11"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus", NA),
Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", NA),
Strand.C = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.C = c(NA, 0.98647, 0.997287, 0.981532, 0.998712, NA),
Locus_ID.C = c(NA, "id "locus7"", "id "locus9"", "id "locus10"", "id "locus12"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared", "Unique")),
row.names = c(NA, -6L),
class = "data.frame")

您应该得到data_2的输出,如下所示:

Start  End      Type Suffix         Host     Seq_ID  Strand        P     Locus_ID
1      1  160    Unique      H        Human    >H3-USA Forward 0.995962  id "locus1"
2    121  240 Conserved      H        Human    >H3-USA Forward 0.985782  id "locus5"
3    121  240 Conserved      I Intermediate >I3-MM-CHA Forward 0.988441  id "locus5"
4    121  240 Conserved      B          Bat >B2-RS-CHA Forward 0.987457  id "locus7"
5    121  240 Conserved      C    Consensus >I3-MM-CHA Forward 0.986470  id "locus7"
6    161  280    Shared      H        Human    >H3-USA Forward 0.997249  id "locus7"
7    161  280    Shared      B          Bat >B2-RS-CHA Forward 0.997273  id "locus9"
8    161  280    Shared      C    Consensus >I3-MM-CHA Forward 0.997287  id "locus9"
9    401  520    Shared      H        Human    >H3-USA Forward 0.983122  id "locus8"
10   401  520    Shared      B          Bat >B2-RS-CHA Forward 0.975433 id "locus10"
11   401  520    Shared      C    Consensus >I3-MM-CHA Forward 0.981532 id "locus10"
12   721 1040    Shared      H        Human    >H3-USA Forward 0.998574 id "locus10"
13   721 1040    Shared      B          Bat >B2-RS-CHA Forward 0.998187 id "locus11"
14   721 1040    Shared      C    Consensus >I3-MM-CHA Forward 0.998712 id "locus12"
15  1081 1280    Unique      H        Human    >H3-USA Forward 0.993674 id "locus12"

最新更新