为什么这个try/except循环在错误时退出?



我有一个收集Reddit评论的脚本。它从一个带有链接列表的csv文件中提取。一些链接是死的,我得到404/403等错误。下面的代码将正确地识别它们并跳过,但它随后退出循环并完成生成csv文件的过程,而不继续到下一个链接。

import praw
import pprint
import csv
import os
import pandas as pd
from collections import namedtuple
from datetime import datetime
from pathlib import Path
def scrape_comments(reddit_api, csv_file, dest):
df = pd.read_csv(csv_file)
data = []
try:
for pid in df.id:
# post_comment = []
submission = reddit_api.submission(id=pid)
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
# post_comment.append(comment.body)
data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id,comment.author, comment.score, comment.created_utc, comment.subreddit))
# data.append((pid, ";".join(post_comment)))
except:
print ("Error! Skip the Current subreddit")
df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id","comment_author", "comment_score","comment_created","comment_subreddit"]) # append tuple
df.to_csv(dest, index=False, encoding='utf-8')
if __name__ == "__main__":
reddit_api = praw.Reddit(
client_id="####",
client_secret="####",
user_agent="####",
username="####",
password="####"
)
# reddit_api = init_praw(client_id, client_secret, user_agent, username, password)
csv_file = "####"
dest_dir = "####"
dest_name = "reddits_comments.csv"
Path(dest_dir).mkdir(parents=True, exist_ok=True)
dest = os.path.join(dest_dir, dest_name)
scrape_comments(reddit_api, csv_file, dest)

您应该将try/except放在代码的较小部分周围,正如评论中所说的那样。下面是一个例子:

def scrape_comments(reddit_api, csv_file, dest):
df = pd.read_csv(csv_file)
data = []
for pid in df.id:
try:
# post_comment = []
submission = reddit_api.submission(id=pid)
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
# post_comment.append(comment.body)
data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id,comment.author, comment.score, comment.created_utc, comment.subreddit))
# data.append((pid, ";".join(post_comment)))
except Exception:
print ("Error! Skip the Current subreddit")
df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id","comment_author", "comment_score","comment_created","comment_subreddit"]) # append tuple
df.to_csv(dest, index=False, encoding='utf-8')

最新更新