Scraping the Web to Get its Contents with PyQt5 and Beautiful Soup



我正在尝试将这里给出的解决方案从 PyQt4 转换为 PyQt5 作为练习。

不知何故,收集到的 HTML 代码在途中丢失了。为了弄清楚发生了什么,我在几个方法里加了一些 print()。Callable 方法中的 print() 能显示出 HTML 代码;但是在 handleLoadFinished 方法中,它是 None,因此函数 funcA 和 funcB 无法正常工作。

我正在使用的代码是:

import sys, signal
from bs4        import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt5           import QtCore, QtGui
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage
class WebPage(QWebPage):
    """Web page that loads a series of URLs and calls a handler for each.

    ``process`` receives an iterable of ``(url, func)`` pairs.  After a page
    finishes loading, ``handleLoadFinished`` calls ``func(url, html)`` and
    then moves on to the next pair.
    """

    def __init__(self):
        super().__init__()
        # Every completed load is routed to handleLoadFinished.
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Store an iterator over *items* and start loading the first URL."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Load the next URL; return False when no items remain, else True."""
        try:
            self._url, self._func = next(self._items)
            self.load(QtCore.QUrl(self._url))
        except StopIteration:
            return False
        else:
            return True

    def handleLoadFinished(self):
        """Print debug output, call the current handler, then fetch the next page."""
        # NOTE: toHtml() is asynchronous.  The HTML is delivered later to the
        # callback (self.Callable); the call itself returns None, so None is
        # what gets printed here and passed to self._func below.
        returned = self.toHtml(self.Callable)
        print('nnnnn')
        print("####################### handleLoadFinished: ", returned)
        html_arg = self.toHtml(self.Callable)
        self._func(self._url, html_arg)
        more_pages = self.fetchNext()
        if not more_pages:
            print('# processing complete')
            # self._exit() is left disabled, as in the original listing.

    def Callable(self, html_str):
        """Callback given to toHtml(): remember the HTML and print it."""
        self.html = html_str
        print('####################  Callable html:', self.html)

    def _exit(self):
        """Announce shutdown and quit the running QApplication."""
        print("exiting...")
        application = QApplication.instance()
        application.quit()
def funcA(url, html):
    """Print the URL and HTML being processed, then parse the HTML.

    Args:
        url: Address of the page that was loaded.
        html: Markup handed over for that page.
    """
    print(f'# processing: {url}\nhtml: {html}')
    soup = BeautifulSoup(html, features="html.parser")
    # do stuff with soup...
def funcB(url, html):
    """Print the URL and HTML being processed, then parse the HTML.

    The markup is first passed through ``UnicodeDammit`` and its
    ``unicode_markup`` is what gets parsed.

    Args:
        url: Address of the page that was loaded.
        html: Markup handed over for that page.
    """
    print('# processing:', url)
    print('html:', html)
    markup = UnicodeDammit(html).unicode_markup
    # Name the parser explicitly (as funcA does); otherwise Beautiful Soup
    # guesses one, warns about it, and may pick a different parser on
    # another machine.
    soup = BeautifulSoup(markup, "html.parser")
    # do stuff with soup...
# Work list: each entry pairs a URL with the function that receives its HTML.
items = [
    ('http://stackoverflow.com', funcA),
    ('http://google.com', funcB),
]

# Use the default SIGINT action so that Ctrl+C ends the process.
signal.signal(signal.SIGINT, signal.SIG_DFL)
# The trailing "\n" escape was lost in the listing, which printed "quitn".
print('Press Ctrl+C to quit\n')
app = QApplication(sys.argv)
webpage = WebPage()
webpage.process(items)
# Run the Qt event loop and use its result as the process exit status.
sys.exit(app.exec_())

任何帮助我理解和纠正的建议将不胜感激!

在 QtWebEngine 中获取 HTML 是异步的:toHtml() 本身返回 None,HTML 会在稍后通过回调函数传回,所以你得到的是 None。因此必须通过回调来接收 HTML,并借助 functools.partial() 把 self._func 和 url 一并绑定进去:

from functools import partial
import signal
import sys
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage

class WebPage(QWebPage):
    """Load a sequence of URLs one at a time and pass each page's HTML to a handler.

    ``process`` takes an iterable of ``(url, func)`` pairs.  Each page is
    loaded in turn; once its HTML has been delivered, ``func(url, html)`` is
    called, and only then is the next URL requested.
    """

    def __init__(self):
        super().__init__()
        # Every completed load is routed to handleLoadFinished.
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through *items*, an iterable of ``(url, func)`` pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next URL.

        Returns:
            True if a load was started, False once the items are exhausted.
        """
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.load(QUrl(self._url))
        return True

    def handleLoadFinished(self):
        """Request the HTML of the page that has just finished loading."""
        # toHtml() is asynchronous: it returns immediately and delivers the
        # HTML to the callback later.  The handler and URL are bound now so
        # the callback works with the values that belong to this page.
        self.toHtml(partial(self._handleHtml, self._func, self._url))

    def _handleHtml(self, func, url, html):
        """Run *func* on the received HTML, then move on to the next item."""
        func(url, html)
        # Advance only after the handler has run.  Starting the next load
        # straight after toHtml() would begin navigating (and could report
        # completion) before this page's HTML had been processed.
        if not self.fetchNext():
            print("# processing complete")

def funcA(url, html):
    """Echo the URL and HTML of a fetched page, then parse the HTML.

    Args:
        url: Address of the page that was loaded.
        html: Markup handed over for that page.
    """
    for label, value in (("# processing:", url), ("html:", html)):
        print(label, value)
    soup = BeautifulSoup(html, features="html.parser")

def funcB(url, html):
    """Echo the URL and HTML of a fetched page, then parse the HTML.

    The markup is first passed through ``UnicodeDammit`` and its
    ``unicode_markup`` is what gets parsed.

    Args:
        url: Address of the page that was loaded.
        html: Markup handed over for that page.
    """
    print("# processing:", url)
    print("html:", html)
    markup = UnicodeDammit(html).unicode_markup
    # Name the parser explicitly (as funcA does); otherwise Beautiful Soup
    # guesses one, warns about it, and may pick a different parser on
    # another machine.
    soup = BeautifulSoup(markup, "html.parser")

# Work list: each entry pairs a URL with the function that receives its HTML.
items = [
    ("http://stackoverflow.com", funcA),
    ("http://google.com", funcB),
]

def main():
    """Create the Qt application, start processing ``items`` and run the event loop.

    Does not return: the event loop's result is passed to ``sys.exit``.
    """
    # Use the default SIGINT action so that Ctrl+C ends the process.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # The trailing "\n" escape was lost in the listing, which printed "quitn".
    print("Press Ctrl+C to quit\n")
    app = QApplication(sys.argv)
    webpage = WebPage()
    webpage.process(items)
    sys.exit(app.exec_())


if __name__ == "__main__":
    main()

最新更新