CS50 问题集 6 (DNA) "Python",我无法计算间歇性 DNA 序列,我的代码在小型数据库中成功,在大型数据库中失败



我是编程初学者,所以决定参加 CS50 课程。在 Problem Set 6(Python)中,我编写的代码对小型数据库有效,但对大型数据库失败了,所以我只想在思路上寻求帮助。这是课程页面,你可以在这里下载(来自 Google Drive)。

我的代码

import csv
from sys import argv

class DnaTest(object):
    """Identify whose DNA a sequence belongs to by looking it up in a CSV database.

    Give the program a DNA sequence and it searches the database for the
    person who owns the sample. Run it from the command line as:

        python dna.py databases/small.csv sequences/1.txt
    """

    def __init__(self):
        """Load the database and sequence files named on the command line."""
        # Keep only the file names: the slice drops the first 10 characters,
        # i.e. the "databases/" and "sequences/" prefixes the arguments are
        # expected to start with.
        self.sequence_argv = str(argv[2][10:])
        self.database_argv = str(argv[1][10:])
        # Read every database line into memory; the file is closed on exit
        # from the with-block.
        with open(f"databases/{self.database_argv}", 'r') as database_file:
            self.database_file = database_file.readlines()
        # Only the first line of the sequence file is read.
        with open(f"sequences/{self.sequence_argv}", 'r') as sequence_file:
            self.sequence_file = sequence_file.readline()
        # Rows as dictionaries; consumed by compare_database_with_sequence().
        self.csv_database_dictionary = csv.DictReader(self.database_file)
        # Separate reader over the same lines; get_str_list() takes its first row.
        self.reader = csv.reader(self.database_file)
        # STR -> count computed from the sequence file.
        self.dict_from_sequence = {}

    def get_str_list(self):
        """Return the STR names from the first row of the database.

        Each call consumes one row of ``self.reader``, so only the first call
        yields the header row.
        """
        self.keys = next(self.reader)
        # Drop the 'name' column so that only STR names remain.
        self.keys.remove("name")
        return self.keys

    def get_str_count_from_sequence(self):
        """Store an occurrence count for every STR in ``dict_from_sequence``.

        NOTE: this is the step the question is about. ``str.count`` returns
        the total number of non-overlapping occurrences anywhere in the
        sequence, not the length of the longest consecutive run, so the two
        differ whenever an STR shows up in more than one place.
        """
        for dna_seq in self.get_str_list():
            self.dict_from_sequence.update({dna_seq: self.sequence_file.count(dna_seq)})

    def compare_database_with_sequence(self):
        """Print the name of the first database row matching the computed counts.

        Nothing is printed when no row matches.
        """
        for dictionary in self.csv_database_dictionary:
            dict_from_database = dict(dictionary)
            dict_from_database.pop('name')
            # Entries of the computed dictionary that this row agrees with.
            shared_items = {k: self.dict_from_sequence[k] for k in self.dict_from_sequence if
                            k in dict_from_database and self.dict_from_sequence[k] == int(dict_from_database[k])}
            # The row matches when every computed STR count is shared.
            if len(self.dict_from_sequence) == len(shared_items):
                dict_from_database = dict(dictionary)
                print(dict_from_database['name'])
                break

# Program entry point: build the test object, count the STRs in the sequence,
# then look the resulting counts up in the database.
if __name__ == '__main__':
    RunTest = DnaTest()
    RunTest.get_str_count_from_sequence()
    RunTest.compare_database_with_sequence()

问题是

在函数 get_str_count_from_sequence(self) 中,我使用了 count。当某个 STR 只在一处连续出现时,这样做是有效的;但在某些序列文件(例如 5.txt)中,同一个 STR 会在多处不连续地出现,count 会把所有出现次数加在一起,而题目需要的是最长连续重复的次数,因此我无法得到每段连续序列各自的数量。我搜索了一下,但没有发现简单的做法:有些人使用 regex 模块,另一些人使用 re 模块,我还没有找到解决方案。

测试代码:

来自 CS50 站点:以 python dna.py databases/large.csv sequences/6.txt 运行您的程序,您的程序应该输出 Luna。

规格

来自CS50站点。

谢谢 "Piyush Singh",我听从了你的建议,用 re 解决了这个问题。首先,我用 re 匹配出每一段连续重复的序列,并把匹配结果存入一个字典;然后对每个 STR 取其中的最大值(最长的连续序列);接着清空字典,以便存放下一个 STR 的结果。此外,我还更新了比较两个字典的函数(一个字典从数据库中读取,另一个由序列文件计算得出)。

import csv
from sys import argv
import re

class DnaTest(object):
    """Identify whose DNA a sequence belongs to by looking it up in a CSV database.

    For every STR named in the database header, the longest run of
    consecutive repeats in the sequence is computed, and the resulting
    profile is compared with each person's row. Run it from the command
    line as:

        python dna.py databases/small.csv sequences/1.txt
    """

    def __init__(self):
        """Load the database and sequence files named on the command line.

        Raises:
            SystemExit: if the two file arguments are not both supplied.
            OSError: if either file cannot be opened.
        """
        if len(argv) != 3:
            raise SystemExit("Usage: python dna.py databases/small.csv sequences/1.txt")
        # The paths are opened exactly as given, so the files may live in any
        # directory rather than only under "databases/" and "sequences/".
        self.database_argv = str(argv[1])
        self.sequence_argv = str(argv[2])
        # Read every database line into memory; the file is closed on exit
        # from the with-block.
        with open(self.database_argv, 'r') as database_file:
            self.database_file = database_file.readlines()
        # Only the first line of the sequence file is read.
        with open(self.sequence_argv, 'r') as sequence_file:
            self.sequence_file = sequence_file.readline()
        # Rows as dictionaries; consumed by compare_database_with_sequence().
        self.csv_database_dictionary = csv.DictReader(self.database_file)
        # Separate reader over the same lines; get_str_list() takes its first row.
        self.reader = csv.reader(self.database_file)
        # STR -> longest consecutive repeat count computed from the sequence.
        self.dict_from_sequence = {}

    def get_str_list(self):
        """Return the STR names from the first row of the database.

        Each call consumes one row of ``self.reader``, so only the first call
        yields the header row.
        """
        keys = next(self.reader)
        # Drop the 'name' column so that only STR names remain.
        keys.remove("name")
        return keys

    def get_str_count_from_sequence(self):
        """Compute the longest consecutive repeat count of every STR.

        The counts are stored in ``self.dict_from_sequence``; an STR that
        never occurs in the sequence is recorded with a count of 0.

        Returns:
            The ``self.dict_from_sequence`` dictionary (STR -> count).
        """
        for str_key in self.get_str_list():
            # One match per maximal run of back-to-back copies of the STR.
            pattern = re.compile(rf"(?:{re.escape(str_key)})+")
            # Start offset of each run -> its length in characters. A fresh
            # dictionary is built for every STR.
            run_lengths = {
                match.start(): len(match.group())
                for match in pattern.finditer(self.sequence_file)
            }
            longest = max(run_lengths.values(), default=0)
            # A run is a whole number of copies, so the division is exact.
            self.dict_from_sequence[str_key] = longest // len(str_key)
        return self.dict_from_sequence

    def compare_database_with_sequence(self):
        """Print the first person whose STR counts match, or "No match"."""

        def dicts_equal(from_sequence, from_database):
            """Return True if both dicts have the same keys and equal integer values."""
            return (
                from_sequence.keys() == from_database.keys()
                and all(int(from_sequence[k]) == int(from_database[k]) for k in from_sequence)
            )

        for row in self.csv_database_dictionary:
            # Compare the STR columns only; 'name' is not part of the profile.
            counts = {k: v for k, v in row.items() if k != 'name'}
            if dicts_equal(self.dict_from_sequence, counts):
                print(row['name'])
                return
        print("No match")

# Program entry point: build the test object, compute the longest run of each
# STR in the sequence, then look the resulting counts up in the database.
if __name__ == '__main__':
    RunTest = DnaTest()
    RunTest.get_str_count_from_sequence()
    RunTest.compare_database_with_sequence()

检查解决方案

Run your program as python dna.py databases/small.csv sequences/1.txt. Your program should output Bob.
Run your program as python dna.py databases/small.csv sequences/2.txt. Your program should output No match.

有关更多检查,请访问CS50 DNA问题集

为了获得每个 STR 的最大连续重复次数,我只写了几行代码。思路是:先搜索一个 STR,如果找到了,就搜索 STR×2(把 STR 重复两次拼接而成的字符串);如果又找到了,就搜索 STR×3,以此类推,直到找不到 STR×n 为止,此时最大次数就是 n-1。由于 STR×n 这个字符串本身就是连续重复的,所以不必担心匹配到不连续的片段。除了 sys 和 csv 之外,不需要其他 Python 库。我的整个代码不到 30 行。

enter code here
import csv
import sys
def _longest_run(dna, unit):
    """Return the number of copies of ``unit`` in its longest consecutive run in ``dna``."""
    # An empty unit is contained in every string and would never stop the loop.
    if not unit:
        return 0
    count = 0
    # unit * k is itself one consecutive run, so a plain substring test is
    # enough: keep extending the run by one copy until it is no longer found.
    while unit * (count + 1) in dna:
        count += 1
    return count


def main(argv=None):
    """Print the person whose STR counts match the sequence, or "No match".

    Args:
        argv: argument list in ``sys.argv`` form (script name, database path,
            sequence path); defaults to ``sys.argv``.

    Returns:
        The process exit status: 1 for a usage error, 0 otherwise.
    """
    if argv is None:
        argv = sys.argv
    # Expect exactly three arguments, including the script name.
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        return 1
    with open(argv[1], 'r') as database:
        data = list(csv.reader(database))  # list of rows, header first
    with open(argv[2], 'r') as sequences:
        dna = sequences.read()
    # An empty database has no header and nobody to match.
    if not data:
        print('No match')
        return 0
    header, people = data[0], data[1:]
    # Longest run of each STR, kept as strings so that they compare directly
    # with the cells read from the CSV file.
    counts = [str(_longest_run(dna, unit)) for unit in header[1:]]
    for person in people:
        # Compare only the STR columns; column 0 holds the name.
        if person[1:len(header)] == counts:
            print(person[0])
            return 0
    print('No match')
    return 0


if __name__ == '__main__':
    sys.exit(main())

最新更新