试图用漂亮的汤刮其他类别



这是我试图抓取的网站:[https://www.jurongpoint.com.sg/store-directory/]

这是我的代码,正如你所看到的,我不知道如何填充url变量的{}作为我想要抓取的4个类别,特别是url的服务是非常不同的。url变量上面的注释在点击后显示了4个类别的链接。感谢任何帮助,谢谢!

from bs4 import BeautifulSoup
import requests
def parse():

cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
#cate=Food+%26+Beverage
#cate=Electronics+%26+Technology
#cate=Fashion+%26+Accessories
#cate=Services

url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
for cat in cate:


for page in range(1,14):
print(page)
soup = BeautifulSoup(requests.get(url).text ,"html.parser")

for link in soup.find_all('div',class_='entry-content'):
try:
shops=soup.find_all('div',class_="col-9")
names=soup.find_all('tr',class_="clickable")
for n, k in zip(names, shops):
name = n.find_all('td')[1].text.replace(' ','')
desc = k.text.replace(' ','')
print(name + "n")
print(desc)
except AttributeError as e:
print(e)

next_button = soup.select_one('.PagedList-skipToNext a')
if next_button:
url = next_button.get('href')
else:
break

parse() 

使用请求的参数并避免管理转义字符(如%26)

url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'

最新更新