在Python/Snakemake中更好地解析唯一的文件名



我有一系列样本,它们的文件名都以唯一的样本名开头。例如,一个样本的文件可以命名为 SC892138_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam 或 SC892138_unisample_L001_001.star_rg_added.sorted.dmark.bam;另一个样本则是 SC892155_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam 或 SC892155_unisample_L001_001.star_rg_added.sorted.dmark.bam,SC后面也可能是其他数字。我可以解析出每个样本的SC######值,但我想得到的是类似 SC892155_CTGAAGCT-ACTCTGAG 这样的完整名称。我该如何解析?

import os
import glob
import itertools
import pandas
from collections import defaultdict
# Snakemake directive: use the directory named by the PWD environment variable as the working directory.
workdir: os.environ['PWD']
## ---- The parser may have to be customized for each run ---- ##
def parse_sampleID(filename):
    """Return the sample ID prefix of a file path.

    The ID is the part of the file's base name that precedes the first
    underscore, e.g. ``SC892138`` for
    ``/P/A/T/H/S/SC892138_unisample_L001_001.star_rg_added.sorted.dmark.bam``.
    """
    # Drop the directory components, then keep only the leading field.
    basename = filename.split('/')[-1]
    return basename.split('_')[0]

fastqs = glob.glob('/P/A/T/H/S/*star_rg_added.sorted.dmark.bam')
# Map each sample ID to the list of matching file paths that share it.
d = defaultdict(list)
for path in fastqs:
    # Appending per path groups files correctly even when glob returns
    # them in an order where same-ID files are not adjacent.
    d[parse_sampleID(path)].append(path)
# Need to modify sampleIDs to bams not R1/R2 files
sampleIDs = d.keys()

也许您没有分享足够的例子来充分说明这个问题?

选项#1基于所提供的信息:

# Example inputs: two samples, each with a barcode-named file and a "unisample" file.
paths_n_fns = [
    "/P/A/T/H/S/SC892138_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892138_unisample_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892155_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892155_unisample_L001_001.star_rg_added.sorted.dmark.bam",
]
def parse_sampleID(path_n_filename):
    """Return the sample ID encoded at the start of a file name.

    For "unisample" files the ID is the field before the first underscore
    (``SC892138``); for all other files it is everything before the first
    ``_L`` (``SC892138_CTGAAGCT-ACTCTGAG``).
    """
    # Work on the base name only, so that a directory whose name happens to
    # contain "unisample" cannot change how the file itself is parsed.
    basename = path_n_filename.split('/')[-1]
    if "unisample" in basename:
        return basename.split('_')[0]
    return basename.split('_L')[0]
# Parse the sample ID of every example path, preserving input order.
sample_ids = [parse_sampleID(paths_n_fn) for paths_n_fn in paths_n_fns]
# Bare expression: echoes the list when run in a REPL or notebook cell.
sample_ids

选项#1的输出

['SC892138_CTGAAGCT-ACTCTGAG',
'SC892138',
'SC892155_CTGAAGCT-ACTCTGAG',
'SC892155']

选项#2基于提供的信息:

# Example inputs: two samples, each with a barcode-named file and a "unisample" file.
paths_n_fns = [
    "/P/A/T/H/S/SC892138_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892138_unisample_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892155_CTGAAGCT-ACTCTGAG_L001_001.star_rg_added.sorted.dmark.bam",
    "/P/A/T/H/S/SC892155_unisample_L001_001.star_rg_added.sorted.dmark.bam",
]
def parse_sampleID(path_n_filename):
    """Return the sample ID encoded at the start of a file name.

    For "unisample" files the ID is the first underscore-separated field
    (``SC892138``); for all other files it is the first two fields joined
    by an underscore (``SC892138_CTGAAGCT-ACTCTGAG``).
    """
    # Work on the base name only, so that a directory whose name happens to
    # contain "unisample" cannot change how the file itself is parsed.
    basename = path_n_filename.split('/')[-1]
    if "unisample" in basename:
        return basename.split('_')[0]
    # maxsplit=2 leaves the remainder of the name in one trailing piece;
    # slicing (rather than indexing) keeps names with no underscore from
    # raising IndexError and returns them unchanged instead.
    parts = basename.split('_', 2)
    return "_".join(parts[:2])
# Parse the sample ID of every example path, preserving input order.
sample_ids = [parse_sampleID(paths_n_fn) for paths_n_fn in paths_n_fns]
# Bare expression: echoes the list when run in a REPL or notebook cell.
sample_ids

选项#2的输出

['SC892138_CTGAAGCT-ACTCTGAG',
'SC892138',
'SC892155_CTGAAGCT-ACTCTGAG',
'SC892155']

最新更新