Python 网页爬虫:如何通过分页抓取到最后一页



是否可以抓取并打印网站上的所有页面?我遇到了分页问题:程序本应打印网站上的全部页面,但它只得到 9 页,而网站上实际有 24 页。

我想打印所有页面,直到最后一页;但页面上的分页栏只列出了 9 个链接,而网站上实际共有 24 页。

import requests
from bs4 import BeautifulSoup
def login():
    """Fetch the first search-results page and count its pagination links.

    The raw HTML of the response is also written to ``./re.html`` so the
    markup can be inspected offline.

    NOTE: the returned value is the number of pagination link containers
    present in the downloaded HTML. The pagination bar may list fewer links
    than the site has result pages, so this can undercount the real total.

    Returns:
        int: number of pagination link containers found on the page.
    """
    print('test login')
    urls = "https://www.yelp.com/search?cflt=contractors&find_loc=St%20Francis%20Wood%2C%20San%20Francisco%2C%20CA&start=0"
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    respon = requests.get(urls, headers=headers)

    # The context manager closes the dump file even if the write fails, and
    # an explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
    # default encoding cannot represent every character in the page.
    with open('./re.html', 'w', encoding='utf-8') as f:
        f.write(respon.text)

    soup = BeautifulSoup(respon.text, 'html5lib')
    page_item = soup.find_all('div', attrs={'class': 'pagination-link-container__09f24__13AN7'})
    total_page = len(page_item)
    print(total_page)
    return total_page
def get_url(page, page_size=20):
    """Return the links found in the result titles of one search page.

    Args:
        page: 1-based page number to fetch.
        page_size: number of results per page, used to turn the page number
            into the ``start`` offset (page 1 -> 0, page 2 -> 20, ...).

    Returns:
        list: the ``href`` value of the anchor inside each matched title
        span; spans without an anchor or without an ``href`` are skipped.
    """
    print('test url ...')
    # The site paginates by result offset, not by page number.
    params = {
        'start': (page - 1) * page_size
    }
    # The base URL must not carry its own ``start=...``: requests appends
    # ``params`` to an existing query string, which would send the key twice.
    respon = requests.get('https://www.yelp.com/search?cflt=contractors&find_loc=St%20Francis%20Wood%2C%20San%20Francisco%2C%20CA', params=params)

    suop = BeautifulSoup(respon.text, 'html5lib')
    titles = suop.find_all('span', attrs={'class': 'text__09f24__2tZKC text-color--black-regular__09f24__1QxyO text-align--left__09f24__3Drs0 text-weight--bold__09f24__WGVdT text-size--inherit__09f24__2rwpp'})
    urls = []
    for title in titles:
        anchor = title.find('a')
        if anchor is None:
            # A span with this styling but no link is not a result title.
            continue
        href = anchor.get('href')
        if href:
            urls.append(href)
    return urls

def run():
    """Collect the links from every page and print how many were found.

    Asks ``login`` for the page count, calls ``get_url`` once for each page
    number from 1 up to that count, and prints the total number of
    collected URLs.
    """
    page_count = login()
    collected = []
    for page in range(1, page_count + 1):
        collected.extend(get_url(page))

    print(len(collected))

# Script entry point: start scraping only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run()

浏览分页链接后可以看出,翻页是通过 URL 末尾的 `start` 参数来控制的。逐个点击分页按钮后,我发现按钮与参数的对应关系如下:

分页按钮 URL 参数
1 start=0
2 start=20
3 start=40

最新更新