Parsing a link with Python, BeautifulSoup and Requests



无法从此元素获取链接:

<h3 class="proposition_name">
<a href="/newauto/auto-jeep-grand-cherokee-1834871.html">
<strong>Jeep Grand Cherokee 2019</strong>
</a>
</h3>

这是我的代码:

import requests
from bs4 import BeautifulSoup
URL = 'https://auto.ria.com/newauto/marka-jeep/'  # listing page for the Jeep make
# Browser-like request headers so the site serves the regular HTML page.
HEADERS={'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36','accept':'*/*'}
def get_html(url, params=None):
    """Fetch *url* with the shared HEADERS and return the raw Response."""
    response = requests.get(url, headers=HEADERS, params=params)
    return response
def get_content(html):
    """Parse a listing page: collect car titles and print each car's link.

    ``html`` is the page source as a string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Each offer card sits in a <div class="proposition">.
    itemsdiv = soup.findAll('div', class_='proposition')  # class that contains the upper element
    cars = []
    for itemdiv in itemsdiv:
        name = itemdiv.find('h3', class_='proposition_name')
        cars.append({
            'title': name.get_text(strip=True),
        })
        # BUG FIX: find('a', href_='proposition_area') filtered on a
        # nonexistent "href_" attribute and always returned None, so
        # .get_text() raised AttributeError.  The link is the href attribute
        # of the <a> nested inside the h3.
        print(name.a.get('href'))
    print(cars)

def parse():
    """Download the listing page and hand the HTML to the scraper."""
    response = get_html(URL)
    if response.status_code != 200:
        print('Error')
    else:
        get_content(response.text)

parse()

我尝试过的:

1) print(itemdiv.find('a',href_='proposition_area').get_text())  # getting None

2) 创建了另一个带有参数"a"的项目

# NOTE(review): in the sample HTML the "proposition" class is on the <div>,
# not on any <a>, so this match comes back empty.
itemsa = soup.find_all('a', class_='proposition')

然后是另一个 for 循环

# Print the href attribute of every matched anchor.
for anchor in itemsa:
    print(anchor.get('href'))

3) 将每个项目打印为文本

# Attempt 3: dump each card as plain text.  Links are tag attributes, not
# text nodes, so get_text() will never show them.
for card in itemsdiv:
    cars.append({'title': card.find('h3', class_='proposition_name').get_text(strip=True)})
    print(card.get_text())

但是里面没有链接

import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Shared table the scraper fills in.
p = PrettyTable()
p.field_names = ["Name", "Url"]

def main(url):
    """Scrape (name, absolute url) rows from the listing page and print them."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    headings = soup.select_one("div#searchResult").select("h3.proposition_name")
    for heading in headings:
        anchor = heading.a
        # url[:20] is the scheme+host prefix; the href is site-relative.
        p.add_row([anchor.get_text(strip=True), f"{url[:20]}{anchor['href']}"])
    print(p)

main("https://auto.ria.com/newauto/marka-jeep/")

输出:

+--------------------------+--------------------------------------------------------------------+
|           Name           |                                Url                                 |
+--------------------------+--------------------------------------------------------------------+
| Jeep Grand Cherokee 2019 | https://auto.ria.com/newauto/auto-jeep-grand-cherokee-1834871.html |
| Jeep Grand Cherokee 2018 | https://auto.ria.com/newauto/auto-jeep-grand-cherokee-1838297.html |
| Jeep Grand Cherokee 2019 | https://auto.ria.com/newauto/auto-jeep-grand-cherokee-1836192.html |
|    Jeep Compass 2019     |    https://auto.ria.com/newauto/auto-jeep-compass-1838186.html     |
|    Jeep Renegade 2019    |    https://auto.ria.com/newauto/auto-jeep-renegade-1838198.html    |
|    Jeep Wrangler 2018    |    https://auto.ria.com/newauto/auto-jeep-wrangler-1838190.html    |
|    Jeep Compass 2019     |    https://auto.ria.com/newauto/auto-jeep-compass-1838277.html     |
|    Jeep Wrangler 2017    |    https://auto.ria.com/newauto/auto-jeep-wrangler-1838228.html    |
| Jeep Grand Cherokee 2020 | https://auto.ria.com/newauto/auto-jeep-grand-cherokee-1834293.html |
| Jeep Grand Cherokee 2018 | https://auto.ria.com/newauto/auto-jeep-grand-cherokee-1810691.html |
+--------------------------+--------------------------------------------------------------------+
# `html` is the page source fetched earlier.
soup = BeautifulSoup(html, 'html.parser')
for heading in soup.find_all("h3", attrs={"class": "proposition_name"}):
    # The href lives on the <a> nested inside the h3.
    print("{} , {}".format(heading.find("a")['href'], heading.text))

输出

/newauto/auto-jeep-grand-cherokee-1834871.html ,  Jeep Grand Cherokee 2019
/newauto/auto-jeep-grand-cherokee-1838297.html ,  Jeep Grand Cherokee 2018

全局复合

# Build one record per offer card: title, price, city and the relative link.
comps = [
    {
        'title': item.find('h3', class_='proposition_name').get_text(strip=True),
        'price': item.find('div', class_='proposition_price').get_text(strip=True),
        'city': item.find('div', class_='proposition_region grey size13').get_text(strip=True),
        'link': item.find("h3", attrs={"class": "proposition_name"}).a.get('href'),
    }
    for item in items
]
print(comps)

最新更新