使用BeautifulSoup在多个页面上抓取完整职位描述时出错



如果能得到您的帮助/输入,那就太好了!

我正试着从 Indeed(招聘网站)抓取工作信息。代码中的一切都很好,直到我试图使用job href在多个页面中获得完整的职位描述。现在我一直得到以下错误:

job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
AttributeError: 'NoneType' object has no attribute 'get'

请查看下面的代码:

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Extract function
def extract(page):
    """Fetch one Indeed search-results page and return it parsed.

    page -- pagination offset (Indeed steps by 10: 0, 10, 20, ...).
    Returns a BeautifulSoup of the results page.
    """
    # The header key must be "User-Agent" (with a hyphen); "User Agent"
    # is not a recognized HTTP header and is ignored by the server.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        )
    }
    url = f"https://uk.indeed.com/jobs?q=data+analyst+%C2%A330%2C000&l=London%2C+Greater+London&jt=fulltime&start={page}"
    # headers must be passed by keyword: in requests.get(url, x) the second
    # positional argument is `params`, so the headers were never being sent.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup
# Transform function
def transform(soup):
    """Extract job fields from one results page and append them to job_list.

    soup -- BeautifulSoup of an Indeed search-results page (from extract()).
    Appends one dict per job card to the module-level job_list; returns None.
    """
    # Get the list of all job postings on this page.
    job_postings = soup.find_all(name="div", class_="slider_item")
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        # Not every card shows a salary: guard the lookup instead of using
        # a bare except, which would also hide unrelated bugs.
        salary_tag = job_posts.find(name="div", class_="salary-snippet")
        salary = salary_tag.find("span").get_text() if salary_tag else "n/a"
        # Strip newline characters ("\n") — replace("n", "") would delete
        # every letter "n" from the summary text.
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")
        # Full job description. Some cards have no anchor with the
        # jcs-JobTitle class (the cause of the AttributeError); fall back
        # to any anchor carrying an href before giving up on the card.
        link_tag = (job_posts.find(name="a", class_="jcs-JobTitle")
                    or job_posts.find("a", href=True))
        if link_tag is not None and link_tag.get("href"):
            absolute_link = 'https://uk.indeed.com' + link_tag.get("href")
            job_desc_r = requests.get(absolute_link)
            job_desc_soup = BeautifulSoup(job_desc_r.text, "html.parser")
            desc_div = job_desc_soup.find(name="div",
                                          class_="jobsearch-jobDescriptionText")
            # The description container may also be absent on some pages.
            full_description = ([item.text for item in desc_div.find_all("li")]
                                if desc_div else [])
        else:
            full_description = []
        # Append this job to the module-level job list.
        job = {
            'Job Title': job_title,
            'Company': company_name,
            'Salary': salary,
            'Summary': summary_text,
            'Full Descriptions': full_description
        }
        job_list.append(job)
    return

# Collected job dicts; transform() appends one entry per job card.
job_list = []

# Walk the paginated results: Indeed's `start` parameter steps by 10.
for page_num in (0, 10, 20, 30):
    transform(extract(page_num))

我已经尝试使用try和except来修复这个错误(见下文),但是这导致了很多空白的职位描述。

# Get full job descriptions.
# Setting job_link = "" on failure and then fetching the URL anyway is what
# produced the blank descriptions: the request goes to the bare site root.
# Guard the lookup and skip the detail fetch when there is no usable link.
link_tag = job_posts.find(name="a", class_="jcs-JobTitle")
job_link = link_tag.get("href") if link_tag is not None else ""
if job_link:
    absolute_link = 'https://uk.indeed.com' + job_link
    # For each job's webpage you need to connect to the link.
    job_desc_r = requests.get(absolute_link)
    job_desc_data = job_desc_r.text
    job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")
    desc_div = job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText")
    # The description container can also be missing on some pages.
    full_description = ([item.text for item in desc_div.find_all("li")]
                        if desc_div else "")
else:
    full_description = ""

提前感谢!P.S.我在mac上使用PyCharmCE

不看原始网页抓取数据,我几乎可以肯定我理解为什么会发生错误。您正在检索的锚定元素很可能在href属性中不包含'link'。

HTML并没有规定href属性只能用于超链接。例如,href可以是 href="this.classList.add('class')" ——这样的href并不包含链接。

在继续处理之前,先检查href是否包含有效的超链接,而不是假定job_posts中每个锚元素的href都是链接。

if "http" in href:
# only run the scraping code when href actually holds a hyperlink
pass

这意味着在<a>标签中没有href属性,或者实际上根本没有找到<a>标签。

你可以使用try/except,就像你处理salary字段时那样:

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Extract function
def extract(page):
    """Fetch one Indeed search-results page and return it parsed.

    page -- pagination offset (Indeed steps by 10: 0, 10, 20, ...).
    Returns a BeautifulSoup of the results page.
    """
    # Must be "User-Agent" with a hyphen; "User Agent" is not a valid
    # HTTP header name and the server will ignore it.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        )
    }
    url = f"https://uk.indeed.com/jobs?q=data+analyst+%C2%A330%2C000&l=London%2C+Greater+London&jt=fulltime&start={page}"
    # Pass headers by keyword — requests.get(url, x) binds the second
    # positional argument to `params`, not to the request headers.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup
# Transform function
def transform(soup):
    """Extract job fields from one results page and append them to job_list.

    soup -- BeautifulSoup of an Indeed search-results page (from extract()).
    Prints progress per card, appends one dict per job card to the
    module-level job_list; returns None.
    """
    # Get the list of all job postings on this page.
    job_postings = soup.find_all(name="div", class_="slider_item")
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        print(company_name, job_title)
        try:
            salary = job_posts.find(name="div", class_="salary-snippet").find("span").getText()
        except AttributeError:
            # No salary element on this card; a bare except would also
            # have hidden unrelated bugs.
            salary = "n/a"
        # Strip newline characters ("\n") — replace("n", "") would delete
        # every letter "n" from the summary text.
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")
        # Full job description: the title anchor or the description
        # container may be missing, in which case find() returns None and
        # the chained call raises AttributeError.
        try:
            job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
            absolute_link = 'https://uk.indeed.com' + job_link
            job_desc_r = requests.get(absolute_link)
            job_desc_data = job_desc_r.text
            job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")
            full_description = [
                item.text
                for item in job_desc_soup.find(
                    name="div", class_="jobsearch-jobDescriptionText"
                ).find_all("li")
            ]
        except Exception as e:
            # Covers both the missing-element AttributeError and request
            # failures for the detail page; log and record a placeholder.
            print(e)
            full_description = 'N/A'

        # Append this job to the module-level job list.
        job = {
            'Job Title': job_title,
            'Company': company_name,
            'Salary': salary,
            'Summary': summary_text,
            'Full Descriptions': full_description
        }
        job_list.append(job)
    return

# Collected job dicts; transform() appends one entry per job card.
job_list = []

# Walk the paginated results: Indeed's `start` parameter steps by 10.
for page_num in (0, 10, 20, 30):
    transform(extract(page_num))

最新更新