如何使用 Python + Selenium 保存网站上的数据



我编写了一个脚本,它会逐个打开多个选项卡并从中抓取数据。现在我已经能够从页面获取数据,但在写入 CSV 文件时,得到的结果如下所示。

Bedrooms    Bathrooms   Super area  Floor   Status
3 See Dimensions    3 See Dimensions    2100    7 (Out of 23 Floors)    3 See Dimensions
Bedrooms    Bathrooms   Super area  Floor   Status
3 See Dimensions    3 See Dimensions    2100    7 (Out of 23 Floors)    3 See Dimensions
Bedrooms    Bathrooms   Super area  Floor   Status
1   1   520 4 (Out of 40 Floors)    1
Bedrooms    Bathrooms   Super area  Floor   Status
3 See Dimensions    3 See Dimensions    2100    7 (Out of 23 Floors)    3 See Dimensions
Bedrooms    Bathrooms   Super area  Floor   Status
1   1   520 4 (Out of 40 Floors)    1

Status列中,我得到了错误的值。

我试过:

# Go through them and click on each.
for unique_link in my_needed_links:
    unique_link.click()
    time.sleep(2)
    # Work on the second window handle (presumably the tab the click opened).
    driver.switch_to_window(driver.window_handles[1])

    def get_elements_by_xpath(driver, xpath):
        """Return the text of every element that matches ``xpath``."""
        return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

    # NOTE: "Bathrooms" and "Lift" use the same XPath, so both columns are
    # filled from the same list of matched elements.
    search_entries = [
        ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
        ("Bathrooms", "//div[@class='p_value']"),
        ("Super area", "//span[@id='coveredAreaDisplay']"),
        ("Floor", "//div[@class='p_value truncated']"),
        ("Lift", "//div[@class='p_value']")]
    with open('textfile.csv', 'a+') as f_output:
        csv_output = csv.writer(f_output)
        # Write header (this runs once per link, so the header row repeats).
        csv_output.writerow([name for name, xpath in search_entries])
        entries = []
        for name, xpath in search_entries:
            entries.append(get_elements_by_xpath(driver, xpath))
        # zip() stops at the shortest list, so only as many rows are written
        # as the XPath with the fewest matches returned.
        csv_output.writerows(zip(*entries))
    # The result of this call is discarded.
    get_elements_by_xpath(driver, xpath)

编辑

entries 的内容(列表形式):

[['3 See Dimensions'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit MonthlynSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', ''], ['2100'], ['7 (Out of 23 Floors)'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit MonthlynSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', '']]
[['3 See Dimensions'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit MonthlynSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', ''], ['2100'], ['7 (Out of 23 Floors)'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit MonthlynSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', '']]

网站链接:https://www.magicbricks.com/propertyDetails/1-BHK-520-Sq-ft-Multistorey-Apartment-FOR-Sale-Kandivali-West-in-Mumbai&id=4d423333373433343431

编辑 1

my_needed_links = []
list_links = driver.find_elements_by_tag_name("a")
for i in range(0, 2):
    # Get unique links.
    for link in list_links:
        if "https://www.magicbricks.com/propertyDetails/" in link.get_attribute("href"):
            if link not in my_needed_links:
                my_needed_links.append(link)
    # Go through them and click on each.
    for unique_link in my_needed_links:
        unique_link.click()
        time.sleep(2)
        # Work on the second window handle (presumably the tab the click opened).
        driver.switch_to_window(driver.window_handles[1])

        def get_elements_by_xpath(driver, xpath):
            """Return the text of every element that matches ``xpath``."""
            return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

        # NOTE: "Bathrooms" and "Lift" use the same XPath, so both entries
        # collect the same list of matched elements.
        search_entries = [
            ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
            ("Bathrooms", "//div[@class='p_value']"),
            ("Super area", "//span[@id='coveredAreaDisplay']"),
            ("Floor", "//div[@class='p_value truncated']"),
            ("Lift", "//div[@class='p_value']")]
        #with open('textfile.csv', 'a+') as f_output:
        entries = []
        for name, xpath in search_entries:
            entries.append(get_elements_by_xpath(driver, xpath))
        # Keep only the result lists that contain exactly 28 values.
        data = [entry for entry in entries if len(entry)==28]
        df = pd.DataFrame(data)
        print (df)
        # mode='a' appends to the file on every pass through the loop.
        df.to_csv('nameoffile.csv', mode='a',index=False,encoding='utf-8')
        #df.to_csv('nameoffile.csv',mode='a', index=False,encoding='utf-8')
        # The result of this call is discarded.
        get_elements_by_xpath(driver, xpath)
        time.sleep(2)
        driver.close()
        # Switch back to the main tab/window.
        driver.switch_to_window(driver.window_handles[0])

提前谢谢。请给我一些建议。

浴室(Bathrooms)和电梯(Lift)的 XPath 是相同的,因此您会在这两列中得到相同的结果。请尝试用其他方式来识别并区分它们。您或许可以使用索引,但如果存在其他区分方式,通常应优先采用。

由于我所在的地区,我无法加载该页面。不过根据您给出的 entries 内容,您可以这样做:

#Your selenium imports
import pandas as pd
def get_elements_by_xpath(driver, xpath):
    """Return the text of every element that matches ``xpath``.

    Args:
        driver: Object exposing ``find_elements_by_xpath`` (a Selenium
            WebDriver in the surrounding script).
        xpath: XPath expression passed to ``find_elements_by_xpath``.

    Returns:
        A list with the ``.text`` of each matched element, in the order the
        driver returned them; empty when nothing matches.
    """
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

# Visit each collected link and append its scraped values to the CSV file.
for unique_link in my_needed_links:
    unique_link.click()
    time.sleep(2)
    # Work on the second window handle (presumably the tab the click opened).
    driver.switch_to_window(driver.window_handles[1])
    # NOTE: "Bathrooms" and "Lift" use the same XPath, so both produce the
    # same list; drop_duplicates() below removes the repeated row.
    search_entries = [
        ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
        ("Bathrooms", "//div[@class='p_value']"),
        ("Super area", "//span[@id='coveredAreaDisplay']"),
        ("Floor", "//div[@class='p_value truncated']"),
        ("Lift", "//div[@class='p_value']")]
    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))
    # Keep only the result lists that contain more than five values.
    data = [entry for entry in entries if len(entry)>5]
    df = pd.DataFrame(data)
    df.drop_duplicates(inplace=True)
    # mode='a' appends to the file on every pass through the loop.
    df.to_csv('nameoffile.csv', sep=';',index=False,encoding='utf-8',mode='a')
    # The result of this call is discarded.
    get_elements_by_xpath(driver, xpath)