I'm trying to parse the hh.kz (HeadHunter) site, using Python 3.9 and beautifulsoup4. When I parse a page with vacancies, I only get 20 div blocks with the "serp-item" class, although there are actually 40 of them (I opened the HTML file in a browser and can see all 40 blocks).
import requests
import os
import time
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
df = pd.DataFrame({})
global_url = "https://almaty.hh.kz/"
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}
def get_all_pages():
    # The first results page is assumed to be saved already; read it to find the pager
    with open("data/page_1.html") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")
    # The second-to-last pager link holds the total number of pages
    pages_count = int(soup.find("div", {"class": "pager"}).find_all("a")[-2].text)

    for i in range(1, pages_count + 1):
        url = f"https://almaty.hh.kz/search/vacancy?area=160&clusters=true&enable_snippets=true&ored_clusters=true&professional_role=84&professional_role=116&professional_role=36&professional_role=157&professional_role=125&professional_role=156&professional_role=160&professional_role=10&professional_role=150&professional_role=25&professional_role=165&professional_role=73&professional_role=96&professional_role=164&professional_role=104&professional_role=112&professional_role=113&professional_role=148&professional_role=114&professional_role=121&professional_role=124&professional_role=20&search_period=30&hhtmFrom=vacancy_search_list&page={i}"
        r = requests.get(url=url, headers=headers)

        with open(f"data/page_{i}.html", "w") as file:
            file.write(r.text)

        time.sleep(3)

    # Pages 1..pages_count were saved, so return pages_count (not pages_count + 1)
    return pages_count
def collect_data(pages_count):
    for page in range(1, pages_count + 1):
        with open(f"data/page_{page}.html") as file:
            src = file.read()

        soup = BeautifulSoup(src, "lxml")

        # Only 20 of the 40 "serp-item" blocks end up here
        items_divs = soup.find_all("div", {"class": "serp-item"})
        print(len(items_divs))

        urls = []
        for item in items_divs:
            item_url = item.find("span", {"data-page-analytics-event": "vacancy_search_suitable_item"}).find("a", {"class": "serp-item__title"}).get("href")
            urls.append(item_url)

        with open("items_urls.txt", "w") as file:
            for url in urls:
                file.write(f"{url}\n")

        get_data(file_path="items_urls.txt")
def get_data(file_path):
    result_list = []

    with open(file_path) as file:
        urls_list = file.readlines()

    clear_urls_list = []
    for url in urls_list:
        clear_urls_list.append(url.strip())

    i = 0
    for url in clear_urls_list:
        i += 1
        response = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(response.text, "lxml")
        content = soup.find("div", {"class": "main-content"})

        try:
            item_name = content.find("h1", {"data-qa": "vacancy-title"}).text.strip()
        except AttributeError:
            item_name = 'E1'

        try:
            item_salary = content.find("div", {"data-qa": "vacancy-salary"}).text.strip()
        except AttributeError:
            item_salary = 'E2'

        try:
            item_exp = content.find("span", {"data-qa": "vacancy-experience"}).text.strip()
        except AttributeError:
            item_exp = 'E3'

        try:
            company_name = content.find("span", {"class": "vacancy-company-name"}).find("span").text.strip()
        except AttributeError:
            company_name = 'E4'

        try:
            if content.find("p", {"class": "vacancy-creation-time-redesigned"}):
                date = content.find("p", {"class": "vacancy-creation-time-redesigned"}).text.strip()
            else:
                date = content.find("p", {"class": "vacancy-creation-time"}).text.strip()
        except AttributeError:
            date = 'E5'

        try:
            if content.find("span", {"data-qa": "vacancy-view-raw-address"}):
                address = content.find("span", {"data-qa": "vacancy-view-raw-address"}).text
            elif content.find("div", {"class": "vacancy-company-bottom"}).find("p", {"data-qa": "vacancy-view-location"}):
                address = content.find("div", {"class": "vacancy-company-bottom"}).find("p", {"data-qa": "vacancy-view-location"}).text
            elif content.find("div", {"class": "block-employer--jHuyqacEkkrEkSl3Yg3M"}):
                address = content.find("div", {"class": "block-employer--jHuyqacEkkrEkSl3Yg3M"}).find("p", {"data-qa": "vacancy-view-location"}).text
            else:
                address = 'Алматы'
        except AttributeError:
            address = 'Алматы'

        try:
            zanyatost = content.find("p", {"data-qa": "vacancy-view-employment-mode"}).find("span").text.strip()
        except AttributeError:
            zanyatost = 'E7'

        try:
            zanyatost2 = content.find("p", {"data-qa": "vacancy-view-employment-mode"}).text.lstrip(', ')
        except AttributeError:
            zanyatost2 = 'E8'

        print(i)

        with open('test.csv', 'a', encoding="utf-8", newline='') as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    item_name,
                    item_salary,
                    item_exp,
                    company_name,
                    date,
                    address,
                    zanyatost,
                    zanyatost2
                )
            )
def main():
    # Write the CSV header once, then append one row per vacancy in get_data()
    with open('test.csv', 'w', encoding="utf-8", newline='') as file:
        writer = csv.writer(file)
        writer.writerow(
            (
                'Должность',
                "Зарплата",
                "Опыт",
                "Компания",
                "Дата обьявления",
                "Район",
                "Тип занятости",
                "Тип занятости2"
            )
        )

    pages_count = get_all_pages()
    collect_data(pages_count=pages_count)


if __name__ == '__main__':
    main()
I tried using html5lib, html.parser, and lxml, but I get the same result. I also tried soup.select to find the "serp-item" class, and it gives the same result. I think the information for the remaining blocks is loaded via JS. If I'm right, can someone explain how to parse the remaining blocks?
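For reference, a quick check that shows whether the missing blocks are present in the downloaded file at all or only appear after JS runs in the browser (a minimal diagnostic sketch, assuming the data/page_1.html saved by the code above; the raw substring count is only approximate, since names like "serp-item__title" also match):

from bs4 import BeautifulSoup

with open("data/page_1.html") as file:
    src = file.read()

# Rough count of the class string in the raw HTML (over-counts serp-item__title etc.)
print(src.count("serp-item"))

# Count of the blocks BeautifulSoup actually finds with the exact class
soup = BeautifulSoup(src, "lxml")
print(len(soup.find_all("div", {"class": "serp-item"})))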
I think you should use Selenium and try scrolling to the end of the page before parsing any data:
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
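A minimal end-to-end sketch of that idea, assuming the selenium package and a Chrome browser are available; SCROLL_PAUSE_TIME, the example search URL, and the final selector are placeholders to adapt to the code in the question:

import time

from bs4 import BeautifulSoup
from selenium import webdriver

SCROLL_PAUSE_TIME = 2  # assumed value; increase on a slow connection

# Selenium 4 can locate a matching chromedriver on its own
driver = webdriver.Chrome()
driver.get("https://almaty.hh.kz/search/vacancy?area=160&search_period=30&page=1")  # example URL

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Hand the fully rendered HTML to BeautifulSoup and reuse the same selector
soup = BeautifulSoup(driver.page_source, "lxml")
print(len(soup.find_all("div", {"class": "serp-item"})))
driver.quit()

If you save driver.page_source to data/page_{i}.html instead of r.text, the rest of collect_data() and get_data() can stay as they are.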