How to index the table tag so that a pandas DataFrame is returned for a list of links

I am trying to grab the second table element from each link in a list and store the results as a pandas DataFrame. To do this I defined a function getCitySalaryTable():

from bs4 import BeautifulSoup
import requests
import pandas as pd

job_title_urls = ['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
                  'https://www.salario.com.br/profissao/abade-cbo-263105']

def getCitySalaryTable(job_title_urls, city_salary_df):
    for url in job_title_urls:
        original_url = url
        url = requests.get(url)
        soup = BeautifulSoup(url.text, 'lxml')
        tables = soup.find_all('table', attrs={'class': 'listas'})

        # I suspect the problem is here #
        city_salary_table = tables[1]
        #################################

        # extracting column names
        heads = city_salary_table.find('thead').find('tr').find_all('th')
        colnames = [hdr.text for hdr in heads]
        # extracting rows
        data = {k: [] for k in colnames}
        rows = city_salary_table.find('tbody').find_all('tr')
        for rw in rows:
            for col in colnames:
                cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                data[col].append(cell.text)

        # constructing a pandas dataframe using the data just parsed,
        # adding keys: cbo, job_title
        cbo = original_url.split('/')[-1].split('-')[-1]
        job_title = original_url.split('/')[-1].split('-')[0]
        df = pd.DataFrame.from_dict(data)
        df.insert(0, 'cbo', '')
        df['cbo'] = cbo

        df.insert(1, 'job_title', '')
        df['job_title'] = job_title

        city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)

    return city_salary_df

However, when applied:

city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)

it only returns a dataframe for the first link, and I suspect the indexing inside the function (city_salary_table = tables[1]) is wrong for the other links:

#      cbo      job_title  ... Salário/Hora Total
#0  612510  abacaxicultor  ...         6,16    29
#1  612510  abacaxicultor  ...         5,96     6
#2  612510  abacaxicultor  ...         6,03     4
#3  612510  abacaxicultor  ...        16,02     4
#4  612510  abacaxicultor  ...         4,75     3
#5  612510  abacaxicultor  ...         5,13     3
#[6 rows x 9 columns]
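
A minimal diagnostic sketch, assuming the same URL list as above, that prints how many 'listas' tables each page actually exposes (index 1 only exists where the count is at least 2):

import requests
from bs4 import BeautifulSoup

# count the 'listas' tables per page to see whether tables[1] exists everywhere
for url in job_title_urls:
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    tables = soup.find_all('table', attrs={'class': 'listas'})
    print(url, '->', len(tables))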

How can I correctly tell the function to return only the second table for all of the links?

If it really is always the second table, use :nth-of-type:

soup.select_one('table:nth-of-type(2)')

though a class selector is faster than a type selector:

soup.select_one('.listas:nth-of-type(2)')

For example:

import requests
from bs4 import BeautifulSoup as bs

soup = bs(requests.get('https://www.salario.com.br/profissao/abacaxicultor-cbo-612510').text, 'lxml')
soup.select_one('.listas:nth-of-type(2)')

Your last link doesn't have that table, so add a check that city_salary_table is not None:

from bs4 import BeautifulSoup
import requests
import pandas as pd

job_title_urls = ['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
                  'https://www.salario.com.br/profissao/abade-cbo-263105',
                  'https://www.salario.com.br/profissao/abadessa-cbo-263105',
                  'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020']

def getCitySalaryTable(job_title_urls, city_salary_df):
    for url in job_title_urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # select the second 'listas' table instead of indexing find_all()
        city_salary_table = soup.select_one('.listas:nth-of-type(2)')
        if city_salary_table is not None:
            # extracting column names
            heads = city_salary_table.find('thead').find('tr').find_all('th')
            colnames = [hdr.text for hdr in heads]
            # extracting rows
            data = {k: [] for k in colnames}
            rows = city_salary_table.find('tbody').find_all('tr')
            for rw in rows:
                for col in colnames:
                    cell = rw.find('td', attrs={'data-label': col})
                    # guard against a missing cell so column lengths stay aligned
                    data[col].append(cell.text if cell else '')
            # construct a pandas dataframe from the parsed data,
            # adding the keys cbo and job_title taken from the URL slug
            slug = url.split('/')[-1]
            cbo = slug.split('-')[-1]
            job_title = slug.split('-cbo-')[0]  # keeps multi-word titles intact
            df = pd.DataFrame.from_dict(data)
            df.insert(0, 'cbo', cbo)
            df.insert(1, 'job_title', job_title)
            city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)
    return city_salary_df

city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
print(city_salary_df)
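
As a side note on the accumulator pattern: pd.concat inside the loop copies city_salary_df on every iteration. A sketch of a common alternative, collecting the per-URL frames in a list and concatenating once at the end (parse_one here is a hypothetical helper wrapping the table-parsing logic above):

import pandas as pd

def collect_frames(urls, parse_one):
    # parse_one(url) -> DataFrame or None (hypothetical helper)
    frames = [df for df in (parse_one(u) for u in urls) if df is not None]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()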

Google Colab:

I think Google Colab is using an ancient version of soupsieve, and we are simply not seeing the not-implemented error for :nth-of-type reported. Instead, you can use city_salary_table = soup.select_one('table + table')
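
A minimal sketch of that workaround, assuming the city salary table is always immediately preceded by another table on the page:

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://www.salario.com.br/profissao/abacaxicultor-cbo-612510').text, 'lxml')
# 'table + table' is an adjacent-sibling selector: it matches a table
# that directly follows another table, i.e. the second table here
city_salary_table = soup.select_one('table + table')
print(city_salary_table is not None)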
