R-仅保留输入表的每一行唯一/不同的列

  • 本文关键字:一行 唯一 保留 r dplyr tidyr
  • 更新时间 :
  • 英文 :


我有一个非常大的数据框(nrow = ~273,000),下面给出一个示例:每一行对应一个蛋白质名称,其后有数量不等的若干列,列出该蛋白质在人类细胞中所处的亚细胞结构。1)我想删除每行中的重复条目,但一直没能解决(代码见下文)。2)然后我想统计每个基因对应的亚细胞结构数量。

背景:这些数据来自UniProt,我已尽我所能用正则表达式(regex)进行清理,但某些行仍然存在重复条目(例如FMR1这一行中Chromosome出现2次、Cytoplasm出现3次、Plasma membrane出现2次),此外这些条目之间还夹着一些空白列。

dput(df1)
structure(list(FMR1 = structure(c(41L, 3L, 17L, 63L, 16L, 24L, 
35L, 33L, 52L, 6L, 49L, 5L, 71L, 72L, 42L, 58L, 22L, 20L, 19L, 
80L, 9L, 51L, 66L, 64L, 23L, 14L, 60L, 45L, 28L, 54L, 7L, 30L, 
29L, 44L, 53L, 8L, 69L, 79L, 10L, 11L, 26L, 37L, 39L, 40L, 82L, 
73L, 18L, 21L, 27L, 47L, 4L, 46L, 1L, 13L, 36L, 70L, 74L, 67L, 
78L, 77L, 61L, 62L, 31L, 56L, 34L, 57L, 25L, 81L, 75L, 59L, 2L, 
65L, 55L, 38L, 50L, 68L, 32L, 12L, 43L, 15L, 48L, 76L), .Label = c("AAMP", 
"ADCY10 SAC", "AIMP1 EMAP2 SCYE1", "ANTXR2 CMG2", "APBB1 FE65 RIR", 
"APC DP2", "APLP1", "ARHGAP26 GRAF KIAA0621 OPHN1L", "ARL4A ARL4", 
"ATP6V0D1 ATP6D VPATPD", "ATP6V1D ATP6M VATD", "AZIN2 ADC KIAA1945 ODCP", 
"CACNB2 CACNLB2 MYSB", "CAMK2D CAMKD", "CDCA8 PESCRG3", "CDK1 CDC2 CDC28A CDKN1 P34CDC2", 
"CEMIP KIAA1199", "CIB1 CIB KIP PRKDCIP", "CLTA", "CLTB", "CMTM8 CKLFSF8", 
"DMD", "DSP", "ECT2", "EHD2 PAST2", "ENTPD2 CD39L1", "ERBB2 HER2 MLN19 NEU NGL", 
"EVPL", "FCHO1 KIAA0290", "FCHO2", "FGR SRC2", "GPER1 CEPR CMKRL2 DRY12 GPER GPR30", 
"HDAC6 KIAA0901 JM21", "ITCH", "ITGB1BP1 ICAP1", "KCTD7", "KIFC3", 
"MFN1", "MISP C19orf21", "MYOT TTID", "NGDN C14orf120", "NISCH IRAS KIAA0975", 
"NR1D1 EAR1 HREV THRAL", "PGM5 PGMRP", "PKP4", "PLA2G6 PLPLA9", 
"PNKD KIAA1184 MR1 TAHCCP2 FKSG19 UNQ2491/PRO5778", "POP7 RPP20", 
"PPL KIAA0568", "PRDX3 AOP1", "PTOV1 ACID2 PP642 UNQ6127/PRO20092", 
"PTPN23 KIAA1471", "PTPRE", "PTPRR ECPTP PTPRQ", "RAB13 GIG4", 
"RAB23 HSPC137", "RAB29 RAB7L1", "RAB30", "RAB38", "RAB40AL RLGP", 
"RAB8A MEL RAB8", "RAB9A RAB9", "RACGAP1 KIAA1478 MGCRACGAP", 
"RAP1B OK/SW-cl", "RGS8", "RPSA LAMBR LAMR1", "SGIP1", "SHMT2", 
"SHROOM3 KIAA1481 SHRML MSTP013", "SLC28A3 CNT3", "SNTA1 SNT1", 
"SNTB1 SNT2B1", "SNX11", "SNX12", "STOM BND7 EPB72", "TEX10 L18 Nbla10363", 
"TNFRSF8 CD30 D1S166E", "TNS4 CTEN PP14434", "TRIM72 MG53", "USP6 HRP1 TRE2", 
"VCL", "YES1 YES"), class = "factor"), Nucleus = structure(c(3L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("Mitochondrion  ", "Nucleus", "Nucleus  ", "Plasma membrane", 
"Plasma membrane  "), class = "factor"), Chromosome = structure(c(1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L), .Label = c("Chromosome", "Cytoplasm", "Cytoplasm  "), class = "factor"), 
    Chromosome.1 = structure(c(4L, 5L, 7L, 5L, 14L, 12L, 20L, 
    18L, 5L, 20L, 20L, 2L, 1L, 1L, 8L, 10L, 19L, 1L, 1L, 8L, 
    16L, 16L, 17L, 19L, 20L, 21L, 15L, 13L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 9L, 10L, 
    16L, 16L, 16L, 22L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 11L, 
    7L, 14L, 9L, 17L, 11L, 9L, 2L, 6L, 6L, 17L, 18L, 10L, 1L, 
    1L, 17L, 19L, 19L, 1L, 3L, 5L, 1L), .Label = c("", " ", "Chromosome", 
    "Cytoplasm  ", "Cytoplasmic vesicle", "Cytoplasmic vesicle  ", 
    "Endoplasmic reticulum", "Endosome", "Endosome  ", "Golgi apparatus", 
    "Golgi apparatus  ", "Midbody", "Midbody  ", "Mitochondrion", 
    "Mitochondrion  ", "Nucleus", "Nucleus  ", "Perikaryon  ", 
    "Plasma membrane", "Plasma membrane  ", "Sarcoplasmic reticulum  ", 
    "Secreted"), class = "factor"), Cytoplasm = structure(c(1L, 
    15L, 13L, 10L, 1L, 13L, 1L, 1L, 5L, 2L, 11L, 1L, 1L, 1L, 
    5L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 14L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 5L, 9L, 2L, 3L, 6L, 7L, 2L, 1L, 2L, 4L, 11L, 12L, 
    5L, 1L, 1L, 1L, 7L, 3L, 1L, 2L, 2L, 2L), .Label = c("", " ", 
    "Cytoplasmic vesicle", "Endoplasmic reticulum", "Endosome", 
    "Endosome  ", "Golgi apparatus", "Golgi apparatus  ", "Golgi appartus", 
    "Midbody", "Mitochondrion  ", "Nucleus  ", "Plasma membrane", 
    "Plasma membrane  ", "Secreted  "), class = "factor"), Cytoplasm.1 = structure(c(1L, 
    4L, 7L, 7L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    6L, 3L, 2L, 1L, 1L, 1L), .Label = c("", " ", "Endoplasmic reticulum", 
    "Endoplasmic reticulum  ", "Endosome", "Mitochondrion", "Plasma membrane"
    ), class = "factor"), Cytoplasmic.vesicle = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Golgi apparatus"
    ), class = "factor"), Perikaryon = structure(c(2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 
    1L, 1L, 1L, 1L), .Label = c("", " ", "Golgi apparatus"), class = "factor"), 
    X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", 
    "Cytoplasmic granule"), class = "factor"), X.1 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Perikaryon"), class = "factor"), 
    X.2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA), X.3 = c(NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), Plasma.membrane = c(NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), Plasma.membrane.1 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
    )), .Names = c("FMR1", "Nucleus", "Chromosome", "Chromosome.1", 
"Cytoplasm", "Cytoplasm.1", "Cytoplasmic.vesicle", "Perikaryon", 
"X", "X.1", "X.2", "X.3", "Plasma.membrane", "Plasma.membrane.1"
), class = "data.frame", row.names = c(NA, -82L))

我尝试只保留每一行中不重复的列值,但没有成功,例如:

unique(df1) # Removes duplicated ROWS only (whole-row comparison); repeats within a row are kept
dplyr::distinct(df1) # Likewise retains only unique/distinct rows of the input table

我认为问题在于上述函数查找的是完全相同的行,而我想要的是每一行内互不相同的列值。我曾考虑使用melt函数,但每行的列数参差不齐,不确定是否可行。

我希望输出看起来像newDF

structure(list(FMR1 = structure(c(7L, 1L, 3L, 9L, 2L, 4L, 6L, 
5L, 8L), .Label = c("AIMP1 EMAP2 SCYE1", "CDK1 CDC2 CDC28A CDKN1 P34CDC2", 
"CEMIP KIAA1199", "ECT2", "HDAC6 KIAA0901 JM21", "ITGB1BP1 ICAP1", 
"NGDN C14orf120", "PTPN23 KIAA1471", "RACGAP1 KIAA1478 MGCRACGAP"
), class = "factor"), Nucleus = structure(c(2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("Nucleus", "Nucleus  "), class = "factor"), 
    Chromosome = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("Chromosome", "Cytoplasm"), class = "factor"), 
    Cytoplasmic.vesicle = structure(c(1L, 8L, 2L, 4L, 5L, 4L, 
    7L, 6L, 3L), .Label = c("Cytoplasm  ", "Endoplasmic reticulum", 
    "Endosome", "Midbody", "Mitochondrion", "Perikaryon  ", "Plasma membrane  ", 
    "Secreted  "), class = "factor"), Perikaryon = structure(c(1L, 
    2L, 3L, 3L, 1L, 3L, 1L, 1L, 1L), .Label = c("", "Endoplasmic reticulum  ", 
    "Plasma membrane"), class = "factor"), Plasma.membrane = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("FMR1", "Nucleus", 
"Chromosome", "Cytoplasmic.vesicle", "Perikaryon", "Plasma.membrane"
), class = "data.frame", row.names = c(NA, -9L))

在此基础上我想用rowSums(df1)得到每行的计数,所以考虑把每个术语强制转换为数字(例如Cytoplasmic vesicle = 1,Nucleus = 1,Endoplasmic reticulum = 1等),但在这个示例数据集上遇到了问题。

df2 <- as.numeric(newDF)
Error: (list) object cannot be coerced to type 'double'
df2 <- as.numeric(newDF[,2:n])
Error in 2:n : NA/NaN argument

谢谢您的帮助。

编辑

我想得到newDF中每一行的计数,例如:

FMR1 5
NGDN C14orf120 3
AIMP1 EMAP2 SCYE1 4
CEMIP KIAA1199 4
RACGAP1 KIAA1478 MGCRACGAP 4
CDK1 CDC2 CDC28A CDKN1 P34CDC2 3
ECT2 4
ITGB1BP1 ICAP1 3
HDAC6 KIAA0901 JM21 3
PTPN23 KIAA1471 3

这可能是一种方法。由于您给出的预期结果是字符向量,我无法确定最终输出应该是什么样子。不过,您提到想统计数据中每种蛋白质出现在多少列中,希望下面的结果正是您想要的。

首先,我把所有列转换为字符型。然后用gather()把数据转换为长格式。按原始列名(即subcellular)分组后,我添加了行索引row.index(例如,1表示原始数据中的第一行),并去除取值首尾的空白。接着删除protein为NA的所有行,以及protein为""或" "的所有行。至此数据整理完成。然后对每一行(即row.index)删除重复的protein取值,取消分组,最后统计每种protein出现的次数(即亚细胞结构)。基本上,就是统计此时数据集中每个protein取值出现了多少次。

使用您的示例数据,我得到了以下结果,但我不确定这是否是您想要的。(我现在要去睡了,接下来几个小时无法回复;如果有人能接手,请帮忙。)

# Count how many rows each distinct cell value appears in. A value that is
# repeated within one row is counted only once for that row.
# NOTE(review): `mydf` is not defined in this snippet; presumably it is the
# question's data frame (df1) -- confirm before running.
# NOTE: gather() is called without a column selection, so every column is
# gathered and the values of the first (name) column are counted as well.
mutate_all(mydf, as.character) %>%
  # Long format: `subcellular` = original column name, `protein` = cell value.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Within each original column the position equals the original row number.
  # seq_len(n()) is used instead of 1:n() because 1:0 yields c(1, 0).
  mutate(row.index = seq_len(n()),
         protein = trimws(protein)) %>%
  # Drop missing and blank cells.
  filter(!is.na(protein)) %>%
  filter(!protein %in% c("", " ")) %>%
  # Keep only the first occurrence of each value within an original row.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  count(protein, sort = TRUE)

#                  protein     n
#                   <chr> <int>
# 1             Cytoplasm    82
# 2       Plasma membrane    70
# 3               Nucleus    25
# 4              Endosome     9
# 5         Mitochondrion     9
# 6   Cytoplasmic vesicle     8
# 7       Golgi apparatus     7
# 8 Endoplasmic reticulum     5
# 9               Midbody     3
#10            Perikaryon     3
# ... with 87 more rows

根据jjl的评论,我做了如下修改:不再统计每种蛋白质出现在多少列中,而是统计每一行中存在多少个蛋白质名称。

# Count, for each original row, how many distinct non-blank values it holds.
# NOTE(review): `mydf` is not defined in this snippet; presumably it is the
# question's data frame (df1) -- confirm before running.
# NOTE: gather() is called without a column selection, so the value of the
# first (name) column is included in each row's count.
mutate_all(mydf, as.character) %>%
  # Long format: `subcellular` = original column name, `protein` = cell value.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Within each original column the position equals the original row number.
  # seq_len(n()) is used instead of 1:n() because 1:0 yields c(1, 0).
  mutate(row.index = seq_len(n()),
         protein = trimws(protein)) %>%
  # Drop missing and blank cells.
  filter(!is.na(protein)) %>%
  filter(!protein %in% c("", " ")) %>%
  # Keep only the first occurrence of each value within an original row.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  count(row.index)
#   row.index     n
#       <int> <int>
# 1         1     4
# 2         2     6
# 3         3     5
# 4         4     6
# 5         5     4
# 6         6     5
# 7         7     4
# 8         8     4
# 9         9     5
#10        10     3
# ... with 72 more rows

编辑

如果要删除第一列(即FMR1),则可以通过过滤该列来完成此操作。在最后使用count()之前,我在代码中添加了filter(subcellular != "FMR1")

# Count, for each original row, how many distinct non-blank values it holds,
# ignoring entries that came from the FMR1 column.
# NOTE(review): `mydf` is not defined in this snippet; the 9-row output shown
# for this block suggests it was run on the smaller example -- confirm.
mutate_all(mydf, as.character) %>%
  # Long format: `subcellular` = original column name, `protein` = cell value.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Within each original column the position equals the original row number.
  # seq_len(n()) is used instead of 1:n() because 1:0 yields c(1, 0).
  mutate(row.index = seq_len(n()),
         protein = trimws(protein)) %>%
  # Drop missing and blank cells.
  filter(!is.na(protein)) %>%
  filter(!protein %in% c("", " ")) %>%
  # Keep only the first occurrence of each value within an original row.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  # Exclude the FMR1 column's own entries from the per-row count.
  # NOTE(review): this runs after de-duplication, so a value that matches the
  # FMR1 entry of the same row would be dropped too -- filter earlier if that
  # can occur in the real data.
  filter(subcellular != "FMR1") %>%
  count(row.index)
# A tibble: 9 x 2
#  row.index     n
#      <int> <int>
#1         1     3
#2         2     4
#3         3     4
#4         4     4
#5         5     3
#6         6     4
#7         7     3
#8         8     3
#9         9     3

最新更新