我是 Python 初学者,刚开始学习网页抓取。我想从一个旅游网站提取数据:需要每家酒店的名称、该酒店可选的安排(膳食方案)以及价格。但我卡在了安排列表上——每家酒店可能有多个安排,而我的代码没有按预期工作,我不知道原因。下面附上我的代码和输出,如果您能帮忙,提前表示感谢。
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Scrape hotel names and their arrangements from tunisiebooking.com.
#
# Path to the ChromeDriver binary. A raw string keeps the Windows backslashes
# literal; a plain "C:\Users..." literal is a SyntaxError in Python 3 because
# "\U" starts a unicode escape.
PATH = r"C:\Users\marketing2\Documents\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# Fill in the search form by assigning the field values through JavaScript.
script = "document.getElementById('ville_des').value ='Sousse';document.getElementById('depart').value ='05/08/2021';document.getElementById('checkin').value ='05/08/2021';document.getElementById('select_ch').value = '1';"
driver.execute_script(script)

# Click the search button, then pause so the results page can load.
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# Click the 'plus_res' button (the "details" button), then pause again.
btn_plus = driver.find_element_by_id('plus_res')
btn_plus.click()
sleep(10)

# XPath of the v-th hotel card; '{}' is replaced by the 1-based card index.
hotel_card_xpath = (
    '/html/body/div[6]/div[2]/div[1]/div/div[2]/div/div[4]'
    '/div[{}]/div/div[3]'
)

# Collect (hotel name, [arrangements]) for the first four hotel cards.
hotels = []
for v in range(1, 5):
    card_xpath = hotel_card_xpath.format(v)
    hotel = driver.find_element_by_xpath(
        card_xpath + '/div[1]/div[1]/span/a/h3'
    ).get_attribute('innerHTML')

    # Start a fresh list for every hotel; a single shared list would make
    # each hotel carry the arrangements of all hotels processed so far.
    pensions = []
    for j in range(1, 3):
        # Look inside the current card (index v), not always the first one.
        pension = driver.find_element_by_xpath(
            card_xpath
            + '/div[3]/div[1]/div[1]/form/div[1]/div[' + str(j) + ']/u'
        ).get_attribute('innerHTML')
        pensions.append(pension)

    hotels.append((hotel, pensions))

print(hotels)
你可以试试下面的代码:
#!/usr/bin/env python
# coding: utf-8
import json
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
# Search tunisiebooking.com for hotels and print each hotel's name,
# arrangements and price as JSON.

# Path to the ChromeDriver binary, used to start the browser session.
PATH = "/mnt/sdc/Work/scripts/Test/chromedriver"
driver = webdriver.Chrome(PATH)

# Open the site's landing page, which hosts the search form.
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# Search parameters applied to the form below.
params = {
    'destination': 'Sousse',
    'date_from': '05/08/2021',
    'date_to': '05/08/2021',
    'bedroom': '1'
}

# Choose the destination from its <select> element.
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])

# Choose the number of bedrooms from its <select> element.
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])

# The date fields are filled by assigning their values through JavaScript.
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)

# Submit the search form and pause so the results page can load.
form = driver.find_element_by_id('hotel_recherch_moteur')
form.submit()
sleep(5)

# ----------------------------------------------------------------------------
# Build one dict per hotel card found on the results page.
hotels_list = []
hotels_objects = driver.find_elements_by_xpath(
    '//div[contains(@class, "enveloppe_produit")]'
)
for hotel_obj in hotels_objects:
    # Price: text of the first div inside the card's "monaieprix" block.
    price_object = hotel_obj.find_element_by_xpath(
        './/div[@class="monaieprix"]'
    )
    # Drop embedded line breaks from the price text. The earlier version
    # replaced the letter 'n', which looks like a newline escape that lost
    # its backslash.
    price_value = price_object.find_element_by_xpath(
        './/div[1]'
    ).text.replace('\n', '')

    # Element wrapping the hotel title; the name is read from its a/h3 child.
    title_data = hotel_obj.find_element_by_xpath(
        './/span[contains(@class, "tittre_hotel")]'
    )

    # Arrangements: every <u> under a div whose class contains "angle",
    # searched relative to this hotel card only.
    arrangements_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@class, "angle")]//u'
    )
    arrangements = [ao.text for ao in arrangements_obj]

    hotels_list.append({
        'name': title_data.find_element_by_xpath('.//a//h3').text,
        'arrangements': arrangements,
        # price_value is already a string, so no extra formatting is needed.
        'price': price_value
    })

# ----------------------------------------------------------------------------
# Print every hotel as an indented JSON object.
for hotel in hotels_list:
    print(json.dumps(hotel, indent=4))
{
"name": "El Mouradi Palace",
"arrangements": [
"Petit dejeuner",
"Demi pension plus",
"All inclusive soft"
],
"price": "67"
}
{
"name": "KANTA",
"arrangements": [
"Petit dejeuner",
"Demi pension",
"All inclusive soft"
],
"price": "43"
}
...
如果我的回答对你有帮助,请将其标记为正确答案。
您应该在 hotels 变量的各个元素中,通过 tag_name 或 xpath 查找所需的元素。
试试下面的代码。
# Collect (name, price) for every hotel card on the results page.
# Each card is a div whose id starts with 'produit_affair_'.
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair_')]")
hotel_list = []
for hotel in hotels:
    name = hotel.find_element_by_tag_name("h3").text
    # The leading '.' makes the XPath relative to this hotel element.
    # A bare '//' searches the whole document, so every hotel would get the
    # first price on the page.
    price = hotel.find_element_by_xpath(".//div[starts-with(@id,'prixtotal_')]").text
    hotel_list.append((name, price))
print(hotel_list)
输出:
[('El Mouradi Palace', '56'), ('KANTA', '56'), ('Occidental Sousse Marhaba', '56'), ('Sindbad Center GAS', '56'), ('Sousse palace', '56'), ('Tui Blue Scheherazade', '56'), ('Golf Residence GAS', '56'), ('Iberostar Kantaoui Bay', '56'), ('Iberostar Diar el Andalous', '56'), ('Riadh Palms', '56'), ('Seabel AlHambra Beach Golf & Spa', '56'), ('Sousse City & Beach Hotel', '56'), ('Thalassa Sousse', '56'), ('Marhaba Palace', '56'), ('Palmyra Aqua Park ex soviva', '56'), ('Houria Palace', '56'), ('Cosmos Tergui Club', '56'), ('Marhaba Beach', '56'), ('Marhaba Club', '56'), ('Sousse Pearl Marriott Resort & Spa', '56')]