我可以使用executemany与sqlite3的大型批处理过程吗?



我有一个相当长的python过程,它意味着从一个表中重新处理大量数据,提取必要的部分,分配适当的值并将其重新输入到另一个表中。这个过程非常有效!除了它非常耗费时间。我想知道是否有一种方法来修改这个使用执行方法?这些是整个过程的片段,但描述了我希望修改为执行命令的地方

下面的代码运行一系列if/then语句,为检索到的信息分配适当的值。

# Current crawl begin date
cur.execute('SELECT Crawl_Begin_Date FROM Crawl WHERE Crawl_ID=?', current_crawl_ID)
current_crawl_begin_date = cur.fetchone()
current_crawl_begin_date = current_crawl_begin_date[0]

# Current crawl end date
cur.execute('SELECT Crawl_End_Date FROM Crawl WHERE Crawl_ID=?', current_crawl_ID)
current_crawl_end_date = cur.fetchone()
current_crawl_end_date = current_crawl_end_date[0]

# URL_Crawl table where Crawl_ID == current crawl
sql = 'SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID FROM URL_Crawl WHERE Crawl_ID=%s'
current_crawl = pd.read_sql_query(sql %(current_crawl_ID), con=db)
# num keeps track of number of rows read through from current_crawl (number of nodes in current crawl)
num = 1
# For every unique url in the current crawl
for row in current_crawl.itertuples():

# Calculate max date .......................................................
if ((row.Last_Updated == None) | (row.Last_Updated == '')):
last_updated = '0'
else:
last_updated = row.Last_Updated
if ((row.Last_Published == None) | (row.Last_Published == '')):
last_published = '0'
else:
last_published = row.Last_Published
if ((row.Date_of_HTML == None) | (row.Date_of_HTML == '')):
date_of_html = '0'
else:
date_of_html = row.Date_of_HTML

if ((last_updated >= last_published) & (last_updated >= date_of_html)):
max_date = last_updated
elif ((last_published >= last_updated) & (last_published >= date_of_html)):
max_date = last_published
elif ((date_of_html >= last_updated) & (date_of_html >= last_published)):
max_date = date_of_html
# ..........................................................................

# Set remaining variables from current_crawl dateframe
url_crawl_id = row.URL_Crawl_ID
unique_url_id = row.Unique_URL_ID
# Initialize starting and end dates/statuses with None
starting_date = None
starting_date_status = None
ending_date = None
ending_date_status = None

# URL_Crawl table up until (but not including) current crawl
sql2 = 'SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID FROM URL_Crawl WHERE Crawl_ID<%s'
previous_crawls = pd.read_sql_query(sql2 %(current_crawl_ID), con=db)
# If row's unique_url_id exists in previous crawls (not a new node)
if (unique_url_id in (previous_crawls['Unique_URL_ID']).tolist()):

# Situation B ...................................................

# Finding most recent lifetime of existing node
existing = previous_crawls[previous_crawls['Unique_URL_ID'] == unique_url_id]
existing_url_crawl_ids = (existing.URL_Crawl_ID).tolist()

existing_in_lifetime = pd.DataFrame()

for i in existing_url_crawl_ids:
sql3 = 'SELECT * FROM Lifetime WHERE URL_Crawl_ID=%d'
exist_in_lt = pd.read_sql_query(sql3 %(i), con=db)
existing_in_lifetime = existing_in_lifetime.append(exist_in_lt, ignore_index=True)

most_recent_lifetime = existing_in_lifetime[existing_in_lifetime.Lifetime_ID == existing_in_lifetime.Lifetime_ID.max()]

# Dates/statuses from most recent lifetime - convert to Strings
most_recent_starting_date = ((most_recent_lifetime.Starting_Date).tolist())[0]
most_recent_starting_date_status = ((most_recent_lifetime.Starting_Date_Status).tolist())[0]
most_recent_ending_date = ((most_recent_lifetime.Ending_Date).tolist())[0]
most_recent_ending_date_status = ((most_recent_lifetime.Ending_Date_Status).tolist())[0]
most_recent_lifetimeID = ((most_recent_lifetime.Lifetime_ID).tolist())[0]

if (max_date != '0'):
if ((max_date >= current_crawl_begin_date) & (max_date <= current_crawl_end_date)):
# Situation B.2
ending_date = max_date
ending_date_status = "Exact"
cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? 
WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
starting_date = max_date
starting_date_status = "Exact"
ending_date = None
ending_date_status = None
cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
""", (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
elif ((max_date < current_crawl_begin_date) & (max_date > most_recent_starting_date)):
# Situation B.3
ending_date = max_date
ending_date_status = "Exact"
cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? 
WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
starting_date = max_date
starting_date_status = "Exact"
ending_date = current_crawl_begin_date
ending_date_status = "Estimated"
cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
""", (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
elif (max_date == most_recent_starting_date):
# Situation B.4
ending_date = current_crawl_begin_date
ending_date_status = "Estimated"
cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? 
WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
elif ((max_date > current_crawl_end_date) | (max_date < most_recent_starting_date)):
# Situation B.1
max_date = '0'
if (max_date == '0'):
# Situation B.5
ending_date = current_crawl_begin_date
ending_date_status = "Estimated"
cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? 
WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))

# If row's unique_url_id is a new node (not seen in previous crawls)
else:

# Situation A ...................................................

if (max_date != '0'):
if ((max_date >= current_crawl_begin_date) & (max_date <= current_crawl_end_date)):
# Situation A.2
starting_date = max_date
starting_date_status = "Exact"
elif (max_date < current_crawl_begin_date):
# Situation A.3
starting_date = max_date
starting_date_status = "Exact"
ending_date = current_crawl_begin_date
ending_date_status = "Estimated"
elif (max_date > current_crawl_end_date):
# Situation A.1
max_date = '0'
if (max_date == '0'):
# Situation A.4
starting_date = current_crawl_end_date
starting_date_status = "Estimated"

cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
""", (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))

executemany可以在此容量中使用吗?如果是这样的话,我不知道executemany的合适语法——我已经尝试了一些还不起作用的东西。数据库为SQLite,程序基于python。

如果不完全理解你的代码,很难给出一个确切的答案。我不太明白你在哪里迭代url/id等。您将需要在循环之外创建更新列表和插入列表,然后将参数序列累加到相应的列表中。最后,在循环之后,您将把每个列表传递给executemany,并使用您想要执行的固定SQL。

这应该会让你了解循环/迭代是如何工作的。


#...
# These are each a list of tuples/lists
# ie. [(param0, ..., paramN), ..., (param0, ..., paramN)]
params_to_update = []
params_to_insert = []
# For every unique url in the current crawl
for row in current_crawl.itertuples():
#...
if (max_date != '0'):
if ((max_date >= current_crawl_begin_date) & (max_date <= current_crawl_end_date)):
# Situation B.2
ending_date = max_date
ending_date_status = "Exact"
params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
starting_date = max_date
starting_date_status = "Exact"
ending_date = None
ending_date_status = None
params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
elif ((max_date < current_crawl_begin_date) & (max_date > most_recent_starting_date)):
# Situation B.3
ending_date = max_date
ending_date_status = "Exact"
params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
starting_date = max_date
starting_date_status = "Exact"
ending_date = current_crawl_begin_date
ending_date_status = "Estimated"
params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
# After for loop is done.
# Call UPDATE for each sequence of params in this list.
UPDATE_SQL = """UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? WHERE Lifetime_ID=?"""
cur.executemany(UPDATE_SQL, params_to_update)
# Call INSERT for each sequence of params in this list.
INSERT_SQL = """INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)"""
cur.executemany(INSERT_SQL, params_to_insert)

stackoverflow.com using-executemany-to-update-entries-in-an-existing-sqlite3-database-using-pyt

docs.python.org: python doc executemany example

最新更新