How do I capture each job's URL so that I can open the full job description when viewing the CSV file?

Can someone help me modify this script so that it also scrapes the URL associated with each job? The goal is that, when I browse the .csv file in a spreadsheet, I can click a link if I want to learn more about a job. Thanks in advance.

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
def transform(soup):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')

        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
        }
        joblist.append(job)
    return

joblist = []
for i in range(0, 90, 10):
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')

You can use any of the following:

url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
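
These variants differ only in how a missing attribute is handled: the bracket forms raise a KeyError when the <a> tag has no href, while the .get() forms return None (which would then make the string concatenation fail with a TypeError). A minimal defensive sketch for the same spot inside the loop; the guard and the empty-string fallback are my additions, not part of the original answer:

a = item.find('a')
href = a.get('href') if a else None                      # None when the tag or attribute is missing
url = 'https://www.indeed.com' + href if href else ''    # hypothetical fallback: empty cell instead of a crash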

By the way:

You always load the same page. To get the next page, you have to use start=... in the URL.
You can use a dictionary and params= in requests.get() instead of formatting the URL by hand.

payload = {
    'q': 'Dispensary',
    'l': 'Denver,+CO',
    'radius': 0,
    'start': page,
}

url = 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
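
A quick way to confirm that requests built the query string you expect is to inspect the response's url attribute right after the call (Response.url is a standard part of the requests API):

print(r.url)    # the fully encoded request URL, with q, l, radius and start in the query string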
Working code:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }

    payload = {
        'q': 'Dispensary',
        'l': 'Denver,+CO',
        'radius': 0,
        'start': start,
    }

    url = 'https://www.indeed.com/jobs'

    r = requests.get(url, params=payload, headers=headers)

    soup = BeautifulSoup(r.content, 'html.parser')

    return soup

def transform(soup, joblist):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')

    for item in divs:
        title = item.find('a').text.strip()

        url = 'https://www.indeed.com' + item.find('a')['href']
        #url = 'https://www.indeed.com' + item.find('a').get('href')
        #url = 'https://www.indeed.com' + item.find('a').attrs['href']
        #url = 'https://www.indeed.com' + item.find('a').attrs.get('href')

        company = item.find('span', class_='company').text.strip()

        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''

        summary = item.find('div', class_='summary').text.strip().replace('\n', '')

        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary
        })

# --- main ---

joblist = []

for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())
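
A final note on the original goal: some spreadsheet programs do not render a plain URL column as clickable links when opening a CSV. A common trick is to wrap each URL in a HYPERLINK formula before saving. A minimal sketch, assuming your spreadsheet evaluates formulas in imported CSV cells (Excel and LibreOffice Calc generally do); the 'link' column name, the "open job" label, and the output filename are arbitrary choices, not part of the original answer:

# Hypothetical extra step: wrap each URL in a spreadsheet HYPERLINK formula
# so the cell becomes a clickable link when the CSV is opened.
df['link'] = df['url'].apply(lambda u: f'=HYPERLINK("{u}", "open job")')
df.to_csv('jobs_with_links.csv', index=False)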
