使用BeautifulSoup从Indeed实际抓取职位URL数据



我正在设计一个web scraper,用于从Indeed抓取职位信息。我能够成功地抓取职位名称、公司、地点和工资信息。然而,我很难找到该职位在Indeed上对应的职位URL,这样用户才能查看更多信息并了解如何申请。

这是我的代码:

import requests, json
from bs4 import BeautifulSoup
def extract(position, location, page):
    """Fetch one Indeed search-results page and return it parsed.

    Args:
        position: job query string (Indeed's ``q`` parameter).
        location: location query string (Indeed's ``l`` parameter).
        page: result offset (Indeed's ``start`` parameter).

    Returns:
        BeautifulSoup tree of the fetched results page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q={position}&l={location}&start={page}'
    # BUG FIX: the second positional argument of requests.get() is `params`,
    # not `headers` -- pass headers as a keyword so the User-Agent is actually sent.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
def transform(soup):
    """Extract job cards from a parsed results page into the global ``jobList``.

    Appends one dict per job card with 'Title', 'Company', 'Location' and
    'Salary' keys. Works by side effect on the module-level ``jobList`` list
    and returns None.

    Args:
        soup: BeautifulSoup tree produced by ``extract``.
    """
    cards = soup.find_all('table', class_='jobCard_mainContent')
    for item in cards:
        title = item.find('h2').get_text()
        company = item.find('span', class_='companyName').get_text()
        location = item.find('div', class_='companyLocation').get_text()
        # find() returns None when the salary block is absent; guard explicitly
        # instead of a bare `except:` that would hide unrelated errors too.
        salary_tag = item.find('div', class_='heading6 tapItem-gutter metadataContainer')
        salary = salary_tag.get_text() if salary_tag is not None else 'none'

        job = {
            'Title': title,
            'Company': company,
            'Location': location,
            'Salary': salary,
        }
        jobList.append(job)
    return

# Accumulator filled by transform(); must stay at module level because
# transform() appends to it by name.
jobList = []

results_page = extract('swe', 'nyc', 0)
transform(results_page)

# Serialize the collected jobs and show them.
data = json.dumps(jobList, indent=2)
print(data)

工作搜索:"swe",位置:"nyc"

[
{
"Title": "newWebsite Developer and Social Media Video maker",
"Company": "La Reserve",
"Location": "New York, NY",
"Salary": "none"
},
{
"Title": "Entry Level Computer Programmer",
"Company": "Revature",
"Location": "New York, NY+9 locations",
"Salary": "none"
},
{
"Title": "Entry Level Software Engineer",
"Company": "Revature",
"Location": "New York, NY+17 locations",
"Salary": "none"
},
{
"Title": "Junior CSS/HTML Developer",
"Company": "Revature",
"Location": "New York, NY+8 locations",
"Salary": "none"
},
{
"Title": "Front End Developer",
"Company": "Underdog.io",
"Location": "New York, NY",
"Salary": "none"
},
{
"Title": "2022 Software Engineer",
"Company": "Bloomberg",
"Location": "New York, NY 10261",
"Salary": "none"
},
{
"Title": "Front End Developer (Entry level)",
"Company": "Revature",
"Location": "New York, NY+8 locations",
"Salary": "none"
},
{
"Title": "Junior Software Developer",
"Company": "Revature",
"Location": "New York, NY+5 locations",
"Salary": "none"
},
{
"Title": "Payments Software Engineer, Apple Pay",
"Company": "Apple",
"Location": "New York, NY+3 locations",
"Salary": "none"
},
{
"Title": "Frontend React developer | In-house",
"Company": "ManageGo",
"Location": "Brooklyn, NY 11211",
"Salary": "none"
},
{
"Title": "Web Developer / HTML",
"Company": "Phoenix Technology Partners",
"Location": "New York, NY",
"Salary": "none"
},
{
"Title": "Junior Software Engineer",
"Company": "TransPerfect",
"Location": "New York, NY",
"Salary": "none"
},
{
"Title": "new2022 Software Engineer Program - Full Time Opportunity",
"Company": "JPMorgan Chase Bank, N.A.",
"Location": "New York, NY+76 locations",
"Salary": "none"
},
{
"Title": "newSoftware Engineer I (Junior/Entry-level Backend Engineer)",
"Company": "Boxed",
"Location": "New York, NY+1 location",
"Salary": "none"
},
{
"Title": "Software Developer (Full-Time, Entry-Level)",
"Company": "Accelerated Information Systems",
"Location": "Hicksville, NY 11801",
"Salary": "none"
}
]

在仔细查看网站后,我找到了这个解决方案。正如我在注释中所说,我把选择器向上移动了一层,这样URL就包含在所选的元素中。URL的相对路径由 item['href'] 提供,我只需将其拼接到基础URL上。

import requests, json
from bs4 import BeautifulSoup
# Site root; job links on result cards are relative and get joined onto this.
base_url = 'https://www.indeed.com'

def extract(position, location, page):
    """Fetch one Indeed search-results page and return it parsed.

    Args:
        position: job query string (Indeed's ``q`` parameter).
        location: location query string (Indeed's ``l`` parameter).
        page: result offset (Indeed's ``start`` parameter).

    Returns:
        BeautifulSoup tree of the fetched results page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q={position}&l={location}&start={page}'
    # BUG FIX: the second positional argument of requests.get() is `params`,
    # not `headers` -- pass headers as a keyword so the User-Agent is actually sent.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
def transform(soup):
    """Extract job cards (including their URL) into the global ``jobList``.

    Selects the <a class="tapItem"> wrapper -- one level above the card body --
    because that anchor carries the job's relative URL in its href attribute.
    Appends one dict per card with 'Title', 'Company', 'Location', 'Salary'
    and 'Job Url' keys. Works by side effect on ``jobList``; returns None.

    Args:
        soup: BeautifulSoup tree produced by ``extract``.
    """
    cards = soup.find_all('a', class_='tapItem')
    for item in cards:
        # The anchor's href is relative; join it onto the site root.
        job_url = base_url + item['href']
        # BUG FIX: the original assigned `title` twice; once is enough.
        title = item.find('h2').get_text()
        company = item.find('span', class_='companyName').get_text()
        location = item.find('div', class_='companyLocation').get_text()
        # find() returns None when the salary block is absent; guard explicitly
        # instead of a bare `except:` that would hide unrelated errors too.
        salary_tag = item.find('div', class_='heading6 tapItem-gutter metadataContainer')
        salary = salary_tag.get_text() if salary_tag is not None else 'none'

        job = {
            'Title': title,
            'Company': company,
            'Location': location,
            'Salary': salary,
            "Job Url": job_url
        }
        jobList.append(job)
    return

# Accumulator filled by transform(); must stay at module level because
# transform() appends to it by name.
jobList = []

results_page = extract('swe', 'nyc', 0)
transform(results_page)

# Serialize the collected jobs and show them.
data = json.dumps(jobList, indent=2)
print(data)

最新更新