r-如何在一列中分离3种不同的信息

例如，在我的数据列中，有一行的值是 'Ser25Phe'。我想把 HGVS.Consequence 这一列拆分成 'Ser 25 Phe' 这样的形式……

HGVS.Consequence
Met1?
Met1?
Met1?
Ala2Glu
Ala2Ala
Asn3Asp
Asn3Asn
Gly4Trp
Gly4Arg
Ala6Glu
AsAsp
Arg9Arg
Lys10Arg
Lys10Lys
LeullLeu
Phe12Ser
Phe12Cys
lle13Leu
lle13Val
lle13Phe
Thr15Pro

另一个解决方案：

# Example input: residue name, position and residue name fused into one string.
x <- c("Ala2Ala", "Asn3Asp", "Ser25Phe")
# Wrap the first run of digits in ";" so it becomes a split point, then split
# into a character matrix with one column per piece.
# Backslashes must be doubled inside an R string literal ("\\d", "\\1").
stringr::str_split(sub("(\\d+)", ";\\1;", x), ";", simplify = TRUE)

一种不同形式的解决方案：

# Surround the first run of digits with spaces: "Ser25Phe" -> "Ser 25 Phe".
# The regex escape and the back-reference need doubled backslashes in R.
sub("(\\d+)", " \\1 ", x)

使用gsub，假设例如"AsAsp"也应拆分为"As Asp"。

# Capture (name)(optional digits)(trailing capitals or "?") and re-emit the
# three groups separated by single spaces. When the digits are missing the
# middle group is empty, which leaves a double space behind.
trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  gsub(pattern = "  ", replacement = " ")  ## optional, to remove inner double whitespace
# [1] "Met 1 ?"    "Met 1 ?"    "Met 1 ?"    "Ala 2 Glu" 
# [5] "Ala 2 Ala"  "Asn 3 Asp"  "Asn 3 Asn"  "Gly 4 Trp" 
# [9] "Gly 4 Arg"  "Ala 6 Glu"  "As Asp"     "Arg 9 Arg" 
# [13] "Lys 10 Arg" "Lys 10 Lys" "Leull Leu"  "Phe 12 Ser"
# [17] "Phe 12 Cys" "lle 13 Leu" "lle 13 Val" "lle 13 Phe"
# [21] "Thr 15 Pro"

请参阅演示。

编辑

如果您的列位于类似下面这样的数据框中：

# Example data frame: two random numeric columns plus the strings to split.
# The row count follows length(x) so the columns always line up with the input.
df <- data.frame(x1 = rnorm(length(x)), x2 = runif(length(x)), x3 = x)

只需将其包装在 transform 中：

# Rewrite column x3 in place as "Name Pos Name". The regex is applied to the
# data-frame column x3 itself, not to a same-valued global vector, so the
# result always matches the rows of df.
df |>
  transform(
    x3 = trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x3)) |>
      gsub(pattern = "  ", replacement = " ")
  )

或者，也许更好，

# Split every string into three pieces and bind them as columns X1..X3 in
# front of the first two columns of df.
# The double space left by a missing position is deliberately NOT collapsed
# here: splitting on " " then yields an empty middle piece, so every row has
# exactly three fields and rbind() builds a rectangular matrix.
res <- trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  strsplit(" ") |>
  do.call(what = rbind) |>
  # Empty pieces and the "?" placeholder become NA.
  (\(m) replace(m, m %in% c("", "?"), NA))() |>
  data.frame(df[1:2]) |>
  # Convert column types where possible, keeping strings as character.
  type.convert(as.is = TRUE)
res
#       X1 X2   X3          x1         x2
# 1    Met  1 <NA>  1.33312448 0.83441710
# 2    Met  1 <NA> -0.55792615 0.48805921
# 3    Met  1 <NA>  1.38184166 0.73862824
# 4    Ala  2  Glu -0.87990439 0.42793122
# 5    Ala  2  Ala  0.59143575 0.23370509
# 6    Asn  3  Asp -0.15065801 0.92168932
# 7    Asn  3  Asn -1.59350802 0.58727950
# 8    Gly  4  Trp -0.21971055 0.69603185
# 9    Gly  4  Arg -0.14004599 0.36722717
# 10   Ala  6  Glu  0.31747188 0.54845522
# 11    As NA  Asp -0.07593689 0.41273905
# 12   Arg  9  Arg -0.54154181 0.12890089
# 13   Lys 10  Arg  1.09159765 0.19433579
# 14   Lys 10  Lys -0.71238122 0.28212593
# 15 Leull NA  Leu -0.68086189 0.89415476
# 16   Phe 12  Ser -0.05169070 0.48129061
# 17   Phe 12  Cys -0.21871795 0.06282263
# 18   lle 13  Leu -1.42723032 0.62185980
# 19   lle 13  Val  0.93924955 0.39333277
# 20   lle 13  Phe  0.71006152 0.22982191
# 21   Thr 15  Pro -0.66542079 0.66382062

其中：

# Inspect the type of the position column after type.convert().
class(res[["X2"]])
# [1] "integer"

数据：

# Full example input (21 values). A few entries carry no digits between the
# two names (e.g. "AsAsp", "LeullLeu"); they are kept verbatim so that the
# missing-position case is exercised.
x <- c(
  "Met1?", "Met1?", "Met1?",
  "Ala2Glu", "Ala2Ala",
  "Asn3Asp", "Asn3Asn",
  "Gly4Trp", "Gly4Arg",
  "Ala6Glu", "AsAsp", "Arg9Arg",
  "Lys10Arg", "Lys10Lys",
  "LeullLeu",
  "Phe12Ser", "Phe12Cys",
  "lle13Leu", "lle13Val", "lle13Phe",
  "Thr15Pro"
)

我们可以使用 tidyr 中的 extract 将该列拆分为多列：

library(dplyr)
library(tidyr)
# Split x3 into name (v1), position (v2) and trailing part (v3).
# convert = TRUE runs type conversion on the new columns, so an empty
# position comes out as NA in an integer column.
df %>%
  extract(
    x3,
    into = c("v1", "v2", "v3"),
    regex = "^([A-Za-z][a-z]+)(\\d*)(\\D+)",
    convert = TRUE
  ) %>%
  # Replace the "?" placeholder with NA column by column; passing the whole
  # data frame to na_if() fails the type check in dplyr >= 1.1.
  mutate(across(where(is.character), \(col) na_if(col, "?")))

输出：

x1         x2    v1 v2   v3
1  -0.545880758 0.27253736   Met  1 <NA>
2   0.536585304 0.21981567   Met  1 <NA>
3   0.419623149 0.04366575   Met  1 <NA>
4  -0.583627199 0.07509480   Ala  2  Glu
5   0.847460017 0.39408293   Ala  2  Ala
6   0.266021979 0.36396781   Asn  3  Asp
7   0.444585270 0.25830122   Asn  3  Asn
8  -0.466495124 0.33670415   Gly  4  Trp
9  -0.848370044 0.46251084   Gly  4  Arg
10  0.002311942 0.85627913   Ala  6  Glu
11 -1.316908124 0.46591567    As NA  Asp
12  0.598269113 0.70118573   Arg  9  Arg
13 -0.762214370 0.54757268   Lys 10  Arg
14 -1.429090303 0.99911177   Lys 10  Lys
15  0.332244449 0.45370882 Leull NA  Leu
16 -0.469060688 0.29248872   Phe 12  Ser
17 -0.334986794 0.17262897   Phe 12  Cys
18  1.536252156 0.14751666   lle 13  Leu
19  0.609994533 0.48654307   lle 13  Val
20  0.516335698 0.24613129   lle 13  Phe
21 -0.074308561 0.27913013   Thr 15  Pro

数据

# Literal definition of the example data frame used above (looks like dput()
# output): numeric columns x1 and x2, character column x3, 21 rows.
# The expression is not assigned here; assign it to a name to reuse it.
structure(list(x1 = c(-0.545880758366027, 0.536585304107612, 
0.419623148618683, -0.583627199210279, 0.847460017311944, 0.266021979364892, 
0.444585270360416, -0.466495123565759, -0.848370043948898, 0.00231194241576697, 
-1.31690812429962, 0.598269112694685, -0.7622143703459, -1.42909030324076, 
0.332244449013422, -0.469060687608488, -0.334986793584065, 1.53625215550584, 
0.609994533253692, 0.51633569843567, -0.0743085613231125), x2 = c(0.272537359734997, 
0.219815669581294, 0.0436657541431487, 0.0750948027707636, 0.39408292947337, 
0.36396780773066, 0.25830122246407, 0.336704148678109, 0.462510835379362, 
0.856279134983197, 0.465915669221431, 0.701185731682926, 0.547572682844475, 
0.999111766461283, 0.453708823537454, 0.292488717008382, 0.172628972679377, 
0.147516664350405, 0.486543073318899, 0.246131290215999, 0.279130134964362
), x3 = c("Met1?", "Met1?", "Met1?", "Ala2Glu", "Ala2Ala", "Asn3Asp", 
"Asn3Asn", "Gly4Trp", "Gly4Arg", "Ala6Glu", "AsAsp", "Arg9Arg", 
"Lys10Arg", "Lys10Lys", "LeullLeu", "Phe12Ser", "Phe12Cys", "lle13Leu", 
"lle13Val", "lle13Phe", "Thr15Pro")), class = "data.frame", row.names = c(NA, 
-21L))

编辑

数据

相关内容

最新更新

热门标签：