我有以下主数据帧:
first.seqnames first.start first.end first.width first.strand second.seqnames second.start second.end second.width second.strand
0 chr1 346212 346975 7 * chr1 10882 10888 7 *
1 chr1 3135476 3136100 2 * chr1 10890 10891 2 *
2 chr1 11473 11484 12 * chr1 10893 10904 12 *
3 chr1 5388140 5388730 2 * chr1 11096 11097 2 *
4 chr1 346213 346984 68 * chr1 11202 11269 68 *
我想返回上述数据帧中,其范围未落在以下数据帧任何范围之内的行:
first.seqnames first.start first.end 3 4 5
3503 chr1 346213 346984 . 0 .
3504 chr1 3135466 3136202 . 0 .
3505 chr1 3190760 3191377 . 0 .
3506 chr1 3354604 3355258 . 0 .
3507 chr1 5388136 5388749 . 0 .
这里,第一个数据帧的"first.start"和"first.end"不应落在范围(346213, 346984)之内,其余各行依此类推。
我尝试了以下代码,但它在内存和时间上的开销都很大,而且结果也不准确。这里,有些df1范围完全落在df2范围之内,有些则只是部分重叠。对于仅部分重叠的情况,可以忽略这些范围。
def range_subset(range1, range2):
    """Return whether every element of ``range1`` is also in ``range2``.

    Args:
        range1: The candidate subset, a ``range`` object.
        range2: The candidate superset, a ``range`` object.

    Returns:
        ``True`` if ``range1`` is empty or all of its values occur in
        ``range2``; ``False`` otherwise.
    """
    if not range1:
        return True  # an empty range is a subset of anything
    if not range2:
        return False  # a non-empty range cannot be a subset of an empty one
    # With more than one value, range1 can only stay on range2's grid when
    # its step is an integer multiple of range2's step.
    if len(range1) > 1 and range1.step % range2.step:
        return False
    # Given compatible steps, checking both endpoints is sufficient.
    return range1.start in range2 and range1[-1] in range2
# Print every (start, end) pair from df1 whose half-open interval
# range(start, end) lies entirely inside some df2 interval.
# NOTE(review): this is a pairwise scan over len(df1) * len(df2) combinations,
# and a df1 row contained in several df2 intervals is printed once per match.
for a, b in zip(df1['first.start'], df1['first.end']):
    for i, j in zip(df2['first.start'], df2['first.end']):
        # range_subset already returns a bool; no comparison with True needed.
        if range_subset(range(a, b), range(i, j)):
            print(a, b)
输出:
first.seqnames first.start first.end first.width first.strand second.seqnames second.start second.end second.width second.strand
0 chr1 346212 346975 7 * chr1 10882 10888 7 *
2 chr1 11473 11484 12 * chr1 10893 10904 12 *
我首先由 (df1['first.start'], df1['first.end']) 和 (df2['first.start'], df2['first.end']) 创建元组对,以便对它们应用 range() 函数。
然后,我加了一个条件,判断 df1_ranges 是否落在 df2_ranges 的范围之内。
这里的边界情况是 df1['first.start'] 等于 df1['first.end']。
我在迭代中收集符合条件的索引,然后把这些索引传给 df1 以取出对应的行。
# Flatten the start/end columns of each frame into one list laid out as
# [start0, end0, start1, end1, ...].
# NOTE(review): temp_df2 is not defined in the visible code — presumably it is
# the filter frame (df2); confirm before running.
df2_lst = []
for start, end in zip(temp_df2['first.start'], temp_df2['first.end']):
    df2_lst.extend((start, end))

df1_lst = []
for start, end in zip(df1['first.start'], df1['first.end']):
    df1_lst.extend((start, end))
def range_subset(range1, range2):
    """Return whether every element of ``range1`` is also in ``range2``.

    Args:
        range1: The candidate subset, a ``range`` object.
        range2: The candidate superset, a ``range`` object.

    Returns:
        ``True`` if ``range1`` is empty or all of its values occur in
        ``range2``; ``False`` otherwise.
    """
    if not range1:
        return True  # an empty range is a subset of anything
    if not range2:
        return False  # a non-empty range cannot be a subset of an empty one
    # With more than one value, range1 can only stay on range2's grid when
    # its step is an integer multiple of range2's step.
    if len(range1) > 1 and range1.step % range2.step:
        return False
    # Given compatible steps, checking both endpoints is sufficient.
    return range1.start in range2 and range1[-1] in range2
##### FUNCTION FOR CREATING CHUNKS OF LISTS ####
def chunks(lst, n):
    """Yield successive n-sized chunks from lst as tuples.

    Args:
        lst: Sequence to split; must support ``len()`` and slicing.
        n: Chunk length, used both as the stride and as the chunk size.

    Yields:
        Tuples of ``n`` consecutive items; the final tuple is shorter when
        ``len(lst)`` is not a multiple of ``n``.
    """
    for i in range(0, len(lst), n):
        # Slice by n instead of hard-coding a pair, so the chunk size
        # follows the argument and a short tail does not raise IndexError.
        yield tuple(lst[i:i + n])
# Regroup the flat coordinate lists into (start, end) pairs.
df1_lst2 = list(chunks(df1_lst, 2))
df2_lst2 = list(chunks(df2_lst, 2))

# Collect the position of every df1 interval that is fully contained in at
# least one df2 interval.
indices = []
for idx, (x, y) in enumerate(df1_lst2):  # main list
    if x == y:
        # range(x, x) is empty and would count as a subset of anything,
        # so zero-length intervals are skipped explicitly.
        continue
    main_range = range(x, y)
    # any() stops at the first containing filter interval, so each position
    # is recorded at most once and the selection below has no duplicate rows.
    if any(range_subset(main_range, range(m, n)) for m, n in df2_lst2):
        indices.append(idx)

# Select the matching rows of df1 by position.
df1.iloc[indices]