使用 Python 从 url 列表中提取不包含 target="_blank" 的链接



我有代码可以从 url 列表中提取链接(并将它们与来源 url 配对),但我想修改它,只获取那些不包含 target="_blank" 的链接:

from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
# Pages to crawl; each one is loaded in the browser and scanned for <a> tags.
list_urls = ["https://example.com/1/","https://example.com/2/","https://example.com/3/"]
pagelinks = []  # rows of [source page url, href found on that page]

# `browser` was used without ever being created, which raises NameError.
# NOTE(review): Chrome is an assumption -- substitute whichever Selenium
# WebDriver (Firefox, Edge, ...) is installed on the machine.
browser = webdriver.Chrome()
try:
    for url in list_urls:
        browser.get(url)
        # Parse the HTML the browser currently holds for this page.
        soup = BeautifulSoup(browser.page_source, "html.parser")
        for line in soup.find_all('a'):
            # .get() yields None for anchors that carry no href attribute.
            href = line.get('href')
            pagelinks.append([url, href])
finally:
    # Always shut the browser down, even if loading a page fails.
    browser.quit()

# One row per discovered link: "from" is the crawled page, "to" is the href.
df = pd.DataFrame(pagelinks, columns=["from", "to"])

尝试:

from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
# Pages to crawl; each one is loaded in the browser and scanned for <a> tags.
list_urls = ["https://example.com/1/","https://example.com/2/","https://example.com/3/"]
pagelinks = []  # rows of [source page url, href found on that page]

# `browser` was used without ever being created, which raises NameError.
# NOTE(review): Chrome is an assumption -- substitute whichever Selenium
# WebDriver (Firefox, Edge, ...) is installed on the machine.
browser = webdriver.Chrome()
try:
    for url in list_urls:
        browser.get(url)
        # Parse the HTML the browser currently holds for this page.
        soup = BeautifulSoup(browser.page_source, "html.parser")
        for line in soup.find_all('a'):
            # Keep only links that do not open in a new tab. .get() returns
            # None when the attribute is absent, so anchors without any
            # target attribute are kept as well.
            if line.get('target') != '_blank':
                href = line.get('href')
                pagelinks.append([url, href])
finally:
    # Always shut the browser down, even if loading a page fails.
    browser.quit()

# One row per kept link: "from" is the crawled page, "to" is the href.
df = pd.DataFrame(pagelinks, columns=["from", "to"])

最新更新