Using Chrome 90 and Python 3.9. All imports are fully up to date, since I just installed them.
Because I have a terrible ISP, I made this script to copy novels off the internet into text files for offline reading whenever my connection goes down. The script works almost all the way through, until a RecursionError pops up and I have to go in manually and change the starting chapter in the setup. The result I expect from the code is for it to run until the novel is completely copied into text files (from chapter 1 to #######), however many chapters that is.
The RecursionError always shows up after I have copied 499 or 500 chapters. I have no idea why it is that low, or how it is hitting this error at all. I have read that a RecursionError normally happens after 999 iterations.
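For scale, here is a stripped-down sketch of two functions that call each other the way CopyChapter() and NextChapter() do (stand-in names, not the real scraper). It dies at about half of that 999 figure, because each round trip leaves two frames on the stack:

import sys

print(sys.getrecursionlimit())  # 1000 by default in CPython

def copy_chapter(n):     # stand-in for CopyChapter()
    next_chapter(n)      # first frame for this chapter...

def next_chapter(n):     # stand-in for NextChapter()
    copy_chapter(n + 1)  # ...second frame; the stack never unwinds

copy_chapter(1)  # RecursionError after roughly 500 "chapters"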
Error:: (the first 2 lines repeat for a long while)
File "C:UsersjamesDocumentsNovelsPEERLESS MARTIAL GODnovel.py", line 42, in CopyChapter
NextChapter()
File "C:UsersjamesDocumentsNovelsPEERLESS MARTIAL GODnovel.py", line 49, in NextChapter
link = driver.find_element_by_link_text(cLink)
File "C:Program FilesPython39libsite-packagesseleniumwebdriverremotewebdriver.py", line 428, in find_element_by_link_text
return self.find_element(by=By.LINK_TEXT, value=link_text)
File "C:Program FilesPython39libsite-packagesseleniumwebdriverremotewebdriver.py", line 976, in find_element
return self.execute(Command.FIND_ELEMENT, {
File "C:Program FilesPython39libsite-packagesseleniumwebdriverremotewebdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "C:Program FilesPython39libsite-packagesseleniumwebdriverremoteremote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "C:Program FilesPython39libsite-packagesseleniumwebdriverremoteremote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "C:Program FilesPython39libsite-packagesurllib3request.py", line 78, in request
return self.request_encode_body(
File "C:Program FilesPython39libsite-packagesurllib3request.py", line 170, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "C:Program FilesPython39libsite-packagesurllib3poolmanager.py", line 375, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "C:Program FilesPython39libsite-packagesurllib3connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:Program FilesPython39libsite-packagesurllib3connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:Program FilesPython39libsite-packagesurllib3connectionpool.py", line 440, in _make_request
httplib_response = conn.getresponse()
File "C:Program FilesPython39libhttpclient.py", line 1347, in getresponse
response.begin()
File "C:Program FilesPython39libhttpclient.py", line 331, in begin
self.headers = self.msg = parse_headers(self.fp)
File "C:Program FilesPython39libhttpclient.py", line 225, in parse_headers
return email.parser.Parser(_class=_class).parsestr(hstring)
File "C:Program FilesPython39libemailparser.py", line 67, in parsestr
return self.parse(StringIO(text), headersonly=headersonly)
File "C:Program FilesPython39libemailparser.py", line 56, in parse
feedparser.feed(data)
File "C:Program FilesPython39libemailfeedparser.py", line 176, in feed
self._call_parse()
File "C:Program FilesPython39libemailfeedparser.py", line 180, in _call_parse
self._parse()
File "C:Program FilesPython39libemailfeedparser.py", line 295, in _parsegen
if self._cur.get_content_maintype() == 'message':
File "C:Program FilesPython39libemailmessage.py", line 594, in get_content_maintype
ctype = self.get_content_type()
File "C:Program FilesPython39libemailmessage.py", line 578, in get_content_type
value = self.get('content-type', missing)
File "C:Program FilesPython39libemailmessage.py", line 471, in get
return self.policy.header_fetch_parse(k, v)
File "C:Program FilesPython39libemail_policybase.py", line 316, in header_fetch_parse
return self._sanitize_header(name, value)
File "C:Program FilesPython39libemail_policybase.py", line 287, in _sanitize_header
if _has_surrogates(value):
File "C:Program FilesPython39libemailutils.py", line 57, in _has_surrogates
s.encode()
RecursionError: maximum recursion depth exceeded while calling a Python object
Code::
#! python3

import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

CHROMEDRIVER_PATH = 'C:\\Program Files\\Python39\\chromedriver.exe'

NovelChapter = 'peerless-martial-god/chapter-1-spirit-awakening.html'
BaseURL = 'https://novelfull.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NovelChapter}

options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)

def Close():
    driver.stop_client()
    driver.close()
    driver.quit()

# start copy of chapter and add to a file
def CopyChapter():
    global soup
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='chapter-content')
    name = driver.title
    filename = (name.replace('<', ' ').replace('"', ' ').replace('>', ' ')
                .replace('/', ' ').replace("|", ' ').replace("?", ' ')
                .replace("*", ' ').replace(":", ' -').replace('Read ', "")
                .replace(' online free from your Mobile, Table, PC... Novel Updates Daily ', "")
                .replace(' online free - Novel Full', ""))
    file_name = (filename + '.txt')
    print(file_name)
    data = ''
    for data in soup.find_all("p"):
        myfile = open(file_name, 'a+')
        myfile.write(unidecode(data.get_text()) + '\n' + '\n')
        myfile.close()
    global lastURL
    lastURL = driver.current_url
    print('**********Chapter Copied!**********')
    NextChapter()
# end copy of chapter and add to a file

# start goto next chapter if exists then return to copy chapter else Close()
def NextChapter():
    bLink = soup.find(id="next_chap")
    cLink = 'Next Chapter'
    link = driver.find_element_by_link_text(cLink)
    link.click()
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:
        CopyChapter()
    else:
        print('Finished!!!')
        Close()
# end goto next chapter if exists then return to copy chapter else Close()

CopyChapter()
#EOF
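A band-aid that often gets mentioned for this error is raising the limit; a sketch of that (not what the fix below does) would be:

import sys
sys.setrecursionlimit(10000)  # delays the RecursionError; deep enough recursion can still crash the interpreter

Raising the limit only postpones the crash, though, since the call chain grows with every chapter, so the self-answer below removes the recursion entirely instead.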
It doesn't look as nice as the version with defs, but it suits my needs perfectly. I added a few things, like creating a folder for the text files and starting from the chapter-list page. There is probably plenty that could be optimized, but it works, and that's what matters most to me.
#! python3

import os
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

CHROMEDRIVER_PATH = 'C:\\Program Files\\Python39\\chromedriver.exe'

def Close():
    driver.stop_client()
    driver.close()
    driver.quit()

global NovelName
NovelName = ['']  # list of novel slugs to copy; one is popped per pass
global DIR
global baseDIR
baseDIR = "C:/Users/james/Documents/Novels"

while NovelName:
    NN = NovelName.pop(-1)
    NNx = NN.replace('.html', '').replace('-', ' ').upper()
    DIR = '%(B)s/%(N)s' % {'B': baseDIR, "N": NNx}
    os.mkdir(DIR)  # folder for this novel's text files

    BaseURL = 'https://novelfull.com'
    url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NN}

    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
    driver.get(url)
    print(url)

    global currentURL
    currentURL = driver.current_url
    global lastURL
    lastURL = ''

    # pull chapter 1's link text off the chapter-list page
    soupx = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    ChapterList = soupx.find(id='list-chapter')
    CL = []
    for i in ChapterList.find_all("li"):
        CL.append(i)
    NovelChapter1Raw = CL[0]
    xx = []
    for i in NovelChapter1Raw.find_all("a"):
        for x in i.find_all("span"):
            xx.append(x)
    ChapterTextX = ' '.join(map(str, xx))
    ChapterText = ChapterTextX.replace('<span class="chapter-text">', '').replace('</span>', '')

    BaseURL = 'https://novelfull.com'
    link = driver.find_element_by_link_text(ChapterText)
    url = '%(U)s/%(N)s' % {'U': BaseURL, "N": link}
    link.click()
    currentURL = driver.current_url

    # copy chapters in a loop (no recursion) until clicking
    # Next Chapter stops changing the URL
    while currentURL != lastURL:
        global soup
        soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
        readables = soup.find(id='chapter-content')
        name = driver.title
        filename = (name.replace('<', ' ').replace('"', ' ').replace('>', ' ')
                    .replace('/', ' ').replace("|", ' ').replace("?", ' ')
                    .replace("*", ' ').replace(":", ' -').replace('Read ', "")
                    .replace(' online free from your Mobile, Table, PC... Novel Updates Daily ', "")
                    .replace(' online free - Novel Full', ""))
        file_name = (filename + '.txt')
        print(file_name)
        data = ''
        for data in soup.find_all("p"):
            myfile = open(DIR + '/' + file_name, 'a+')
            myfile.write(unidecode(data.get_text()) + '\n' + '\n')
            myfile.close()
        lastURL = driver.current_url
        print('**********Chapter Copied!**********')
        bLink = soup.find(id="next_chap")
        cLink = 'Next Chapter'
        link = driver.find_element_by_link_text(cLink)
        link.click()
        currentURL = driver.current_url
    print('Finished!!!')
    Close()

print('Finished!!!')
Close()  # <- throws a bunch of errors but makes sure everything closes.
#EOF
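The errors from that final Close() presumably come from quitting a driver that the loop's last pass already closed. A guarded variant (just a sketch, not what the script above uses) would swallow them:

def Close():
    # each step can raise if the driver is already gone; ignore and continue
    for step in (driver.stop_client, driver.close, driver.quit):
        try:
            step()
        except Exception:
            pass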