目前,我有这么宽的数据帧。
> dput(head(data))
structure(list(Host.H = c("Human", "Human", "Human", "Human",
"Human", "Human"), Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA",
">H3-USA", ">H3-USA", ">H3-USA"), Start = c(1L, 121L, 161L, 401L,
721L, 1081L), End = c(160L, 240L, 280L, 520L, 1040L, 1280L),
Strand.H = c("Forward", "Forward", "Forward", "Forward",
"Forward", "Forward"), P.H = c(0.995962, 0.985782, 0.997249,
0.983122, 0.998574, 0.993674), Locus_ID.H = c("id "locus1"",
"id "locus5"", "id "locus7"", "id "locus8"", "id "locus10"",
"id "locus12""), Host.I = c(NA, "Intermediate", NA, NA,
NA, NA), Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA), P.I = c(NA,
0.988441, NA, NA, NA, NA), Locus_ID.I = c(NA, "id "locus5"",
NA, NA, NA, NA), Host.B = c(NA, "Bat", "Bat", "Bat", "Bat",
NA), Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA",
">B2-RS-CHA", NA), Strand.B = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.B = c(NA, 0.987457, 0.997273,
0.975433, 0.998187, NA), Locus_ID.B = c(NA, "id "locus7"",
"id "locus9"", "id "locus10"", "id "locus11"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus",
NA), Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA",
">I3-MM-CHA", NA), Strand.C = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.C = c(NA, 0.98647, 0.997287,
0.981532, 0.998712, NA), Locus_ID.C = c(NA, "id "locus7"",
"id "locus9"", "id "locus10"", "id "locus12"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared",
"Unique")), row.names = c(NA, 6L), class = "data.frame")
我一直在寻找整理这些数据的方法。要执行此操作,我需要将后缀为(.H、.I、.B和.C(的所有列按如下方式分组到每列中(Host、Seq_ID、Start、End、Strand、P、Locus_ID和Type(。注意,";类型";必须根据对应于每行的指定值进行分配,我用下面的所需输出示例进行解释
Host Seq_ID Start End Strand P Locus_ID Type
Human >H3-USA 1 160 Forward 0.99 id "locus1" Unique
Human >H3-USA 121 240 Forward 0.98 id "locus5" Conserved
Intermediate>I3-MM-CHA 121 240 Forward 0.98 id "locus5" Conserved
Bat >B2-RS-CHA 121 240 Forward 0.98 id "locus7" Conserved
Consensus >I3-MM-CHA 121 240 Forward 0.98 id "locus7" Conserved
Human >H3-USA 161 280 Forward 0.99 id "locus7" Shared
so on...
然后我想总结一下;启动";以及";结束";位置是相等的;主机";是一样的。
我曾尝试使用";枢轴更长";功能,但未能使其工作。因此,我希望你能与我合作。我将非常感谢
以下是tidyverse
中的一个解决方案,它通常利用dplyr
,特别是tidyr::pivot_longer()
:
library(tidyverse)
# ...
# Code to generate 'data'.
# ...
data_2 <- data %>% pivot_longer(# "Unpivot" all columns except these:
cols = !c(Start, End, Type),
# Uses regex lookaround to target the last ".",
# used to separate column prefixes from suffixes.
names_sep = "\.(?!.*\.)",
# "Unpivot" to consolidate columns with the same
# prefix, and then record the suffixes in a new
# 'Suffix' column.
names_to = c(".value", "Suffix")) %>%
# Remove rows with missing 'Host'.
filter(!is.na(Host)) %>%
# Within each group of 'Host' and 'Type', select only the first row with each
# combo of 'Start' and 'End'.
group_by(Host, Type, Start, End) %>% filter(row_number() == 1) %>% ungroup() %>%
# Select the columns in proper order.
select(Host, Suffix, Seq_ID, Start, End, Strand, P, Locus_ID, Type) %>%
# Sort as desired.
arrange(Start, End) %>%
# Convert back into data.frame, if desired.
as.data.frame()
# Print the results.
data_2
给定数据集data
,如您提供的示例
data <- structure(list(Host.H = c("Human", "Human", "Human", "Human", "Human", "Human"),
Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA"),
Start = c(1, 121, 161, 401, 721, 1081),
End = c(160, 240, 280, 520, 1040, 1280),
Strand.H = c("Forward", "Forward", "Forward", "Forward", "Forward", "Forward"),
P.H = c(0.995962, 0.985782, 0.997249, 0.983122, 0.998574, 0.993674),
Locus_ID.H = c("id "locus1"", "id "locus5"", "id "locus7"", "id "locus8"", "id "locus10"", "id "locus12""),
Host.I = c(NA, "Intermediate", NA, NA, NA, NA),
Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA),
P.I = c(NA, 0.988441, NA, NA, NA, NA),
Locus_ID.I = c(NA, "id "locus5"", NA, NA, NA, NA),
Host.B = c(NA, "Bat", "Bat", "Bat", "Bat", NA),
Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", NA),
Strand.B = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.B = c(NA, 0.987457, 0.997273, 0.975433, 0.998187, NA),
Locus_ID.B = c(NA, "id "locus7"", "id "locus9"", "id "locus10"", "id "locus11"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus", NA),
Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", NA),
Strand.C = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.C = c(NA, 0.98647, 0.997287, 0.981532, 0.998712, NA),
Locus_ID.C = c(NA, "id "locus7"", "id "locus9"", "id "locus10"", "id "locus12"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared", "Unique")),
row.names = c(NA, -6L),
class = "data.frame")
您应该得到data_2
的输出,如下所示:
Start End Type Suffix Host Seq_ID Strand P Locus_ID
1 1 160 Unique H Human >H3-USA Forward 0.995962 id "locus1"
2 121 240 Conserved H Human >H3-USA Forward 0.985782 id "locus5"
3 121 240 Conserved I Intermediate >I3-MM-CHA Forward 0.988441 id "locus5"
4 121 240 Conserved B Bat >B2-RS-CHA Forward 0.987457 id "locus7"
5 121 240 Conserved C Consensus >I3-MM-CHA Forward 0.986470 id "locus7"
6 161 280 Shared H Human >H3-USA Forward 0.997249 id "locus7"
7 161 280 Shared B Bat >B2-RS-CHA Forward 0.997273 id "locus9"
8 161 280 Shared C Consensus >I3-MM-CHA Forward 0.997287 id "locus9"
9 401 520 Shared H Human >H3-USA Forward 0.983122 id "locus8"
10 401 520 Shared B Bat >B2-RS-CHA Forward 0.975433 id "locus10"
11 401 520 Shared C Consensus >I3-MM-CHA Forward 0.981532 id "locus10"
12 721 1040 Shared H Human >H3-USA Forward 0.998574 id "locus10"
13 721 1040 Shared B Bat >B2-RS-CHA Forward 0.998187 id "locus11"
14 721 1040 Shared C Consensus >I3-MM-CHA Forward 0.998712 id "locus12"
15 1081 1280 Unique H Human >H3-USA Forward 0.993674 id "locus12"