从站点提取数据时,数据重复



我想提取每家公司的名称、网站、电话和电子邮件,并把这些数据保存到 Excel 文件中。公司名称能够正确提取,但问题在于并非每家公司都有网站、电话或电子邮件,结果这段代码把它找到的第一个电话复制给了所有其他公司。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from openpyxl import Workbook
# Scrape company name / website / phone / email from the DMCC business
# directory, paging through the results, and save them to an Excel file.
# NOTE: this is the version from the question; it contains the duplication
# defect described there (see the NOTE inside the inner loop).
url = "https://www.dmcc.ae/business-search?directory=1&submissionGuid=2c8df029-a92e-4b5d-a014-7ef9948e664b"
driver = webdriver.Firefox()
driver.get(url)
# Explicit wait shared by every lookup below (timeout: 50 seconds).
wait = WebDriverWait(driver, 50)
# Click the "#hs-eu-confirmation-button" element as soon as it is clickable
# (presumably the cookie-consent banner -- confirm against the page).
wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "#hs-eu-confirmation-button"))
).click()
# The search results are rendered inside an iframe; switch the driver into it.
wait.until(
    EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#pym-0 > iframe"))
)
# All visible result elements on the current page.
# NOTE: the name `list` shadows Python's built-in `list` type.
list = wait.until(
    EC.visibility_of_all_elements_located((By.CLASS_NAME, "searched-list "))
)
# The pager link whose ng-click handler moves to the next page.
button = wait.until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "a[ng-click='setPage(pager.currentPage + 1)']")
    )
)

# One dict per company, accumulated across all visited pages.
comp_list=[]
# Disabled alternative loop condition: keep going until the pager's <li>
# carries the "disabled" class instead of visiting a fixed number of pages.
# while (
#     # Last Page has disabled the li element
#     not "disabled"
#     in driver.find_element_by_css_selector(
#         "li[ng-class='{disabled:pager.currentPage === pager.totalPages}']"
#     )
#     .get_attribute("class")
#     .split()
# ):
# Visit a fixed number of pages (10); each iteration scrapes the current page
# and then clicks the "next page" link once.
for i in range(10):
    driver.execute_script("arguments[0].scrollIntoView();", button)
    for e in list:
        name = e.find_element_by_tag_name("h4").text
        # NOTE: an XPath starting with "//" is evaluated from the document
        # root, not from `e`, so each of the three lookups below returns the
        # first matching <a> on the whole page for every company. This is the
        # duplication the question is about; a leading "." (".//a[...]")
        # would scope the search to `e`.
        website = e.find_element_by_xpath("//a[contains(@class, 'website')]").text
        phone = e.find_element_by_xpath("//a[contains(@class, 'telephone')]").text
        email = e.find_element_by_xpath("//a[contains(@class, 'emailid')]").text
        comp = {
            'name': name,
            'website': website,
            'phone': phone,
            'email': email
        }
        comp_list.append(comp)
    # Wait until the element with class "sfloadingBackground" is no longer
    # visible before clicking, then re-collect the results of the new page.
    wait.until(EC.invisibility_of_element((By.CLASS_NAME,"sfloadingBackground")))
    button.click()
    list = wait.until(
        EC.visibility_of_all_elements_located((By.CLASS_NAME, "searched-list "))
    )

# Show the raw records, build a DataFrame from them, write it to comp.xlsx
# (without the index column) and close the browser.
print(comp_list)
df = pd.DataFrame(comp_list)
print(df)
df.to_excel('comp.xlsx',index=False)
driver.quit()

下面是提取数据(公司名称、网站、电话、电子邮件)的那部分代码:

# Excerpt of the extraction loop from the question: read the four fields of
# every result element `e` on the current page.
for e in list:
    name = e.find_element_by_tag_name("h4").text
    # NOTE: these XPaths start with "//", so they are evaluated from the
    # document root rather than from `e`; each call therefore returns the
    # first matching <a> on the whole page, whatever `e` is.
    website = e.find_element_by_xpath("//a[contains(@class, 'website')]").text
    phone = e.find_element_by_xpath("//a[contains(@class, 'telephone')]").text
    email = e.find_element_by_xpath("//a[contains(@class, 'emailid')]").text

注意:外循环只是改变页面,每页有10家公司。

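你可以把 try/except 提取成一个方法等等,不过按下面这样写就能正确地取到每个字段。关键的改动是在 XPath 前面加上 `.`(即 `.//a[...]`),使查找限定在当前元素 `e` 内部;以 `//` 开头的 XPath 会从整个文档开始查找,所以总是返回页面上的第一个匹配项。我还建议把变量 `list` 改成别的名字,因为它会遮蔽 Python 的内置类型 `list`,不过这取决于你。有时网站字段是空白的,所以你也可以把默认值设为空字符串,而不是 "None"。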

# Corrected scraping loop. Relies on `driver`, `wait`, `button`, `list`,
# `comp_list`, `By`, `EC` and `pd` set up by the script above.
from selenium.common.exceptions import NoSuchElementException

# Visit 10 result pages; each iteration scrapes the current page and then
# clicks the "next page" link once.
for _ in range(10):
    driver.execute_script("arguments[0].scrollIntoView();", button)
    for e in list:
        name = e.find_element(By.TAG_NAME, "h4").text
        # The leading "." makes each XPath relative to `e`, so only this
        # company's own links are searched. Not every company has every
        # field: a missing link raises NoSuchElementException, which is
        # mapped to the placeholder string "None". Only that exception is
        # caught so that unrelated failures are not silently hidden.
        try:
            website = e.find_element(
                By.XPATH, ".//a[contains(@class, 'website')]"
            ).text
        except NoSuchElementException:
            website = "None"
        try:
            phone = e.find_element(
                By.XPATH, ".//a[contains(@class, 'telephone')]"
            ).text
        except NoSuchElementException:
            phone = "None"
        try:
            email = e.find_element(
                By.XPATH, ".//a[contains(@class, 'emailid')]"
            ).text
        except NoSuchElementException:
            email = "None"
        comp = {
            'name': name,
            'website': website,
            'phone': phone,
            'email': email,
        }
        comp_list.append(comp)
    # Wait until the element with class "sfloadingBackground" is no longer
    # visible before clicking, then re-collect the results of the new page.
    wait.until(EC.invisibility_of_element((By.CLASS_NAME, "sfloadingBackground")))
    button.click()
    list = wait.until(
        EC.visibility_of_all_elements_located((By.CLASS_NAME, "searched-list "))
    )

# Show the raw records and the resulting table.
print(comp_list)
df = pd.DataFrame(comp_list)
print(df)

输出
[{'name': '1 BOXOFFICE DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '1 INTERNATIONAL FINCENTRE ASSOCIATES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '1000HEADS CONSULTING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '108 CONSULTING DMCC', 'website': '', 'phone': '+97145845300', 'email': 'avinash@108dmcc.com'}, {'name': '1291 GROUP DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '12D MODEL DMCC', 'website': '', 'phone': '+97144295866', 'email': 'admin@gcsolutionsltd.co.uk'}, {'name': '168 ASSET MANAGEMENT SFO DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '19 FAMILY & BUSINESS DMCC', 'website': '', 'phone': '+971562160241', 'email': 'l.olivari@gmail.com'}, {'name': '1DIGI INVESTMENT MANAGEMENT DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '1ENERGIN DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '1F SOFT DMCC', 'website': '', 'phone': '+971555563077', 'email': 'mohamed@globallink.ae'}, {'name': '1GP HOLDINGS PTY. LTD. 
(DMCC BRANCH)', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '1ST CLASS D SFO DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '21ST CENTURY GROUP HOLDINGS LIMITED (BRANCH)', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '22 MARITIME DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '237 COMMUNICATIONS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '24TTL DIGITAL MARKETING SOLUTIONS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '25TH PARALLEL SERVICES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '271 CONTAINERS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2A IT SERVICES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2BEROSE DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2DFINE SERVICES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2E INTERNATIONAL DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2G GLOBAL MARKETS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '2ID GROUP DMCC', 'website': 'www.2id-events.com', 'phone': '+971502377872', 'email': 'alia@2id-events.com'}, {'name': '2M KABLO DMCC', 'website': 'www.2mkablo.com', 'phone': '+971501643353', 'email': 'iccetin@2mkablo.com'}, {'name': '3 X DIAMONDS DMCC', 'website': '3xdiamondsdmcc.com', 'phone': '+97144341915', 'email': '3xdiamondsdmcc@gmail.com'}, {'name': '321 EVENTS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '333 TRADING DMCC', 'website': 'DEC TOWERS # 2 APARTMEN # 2306, dUBAI MARINA', 'phone': '+97143637752', 'email': 'afattah@333tradingjlt.ae'}, {'name': '33VOICES IQ DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '360 COMPRESSION SERVICES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '360 DECORO DMCC', 'website': 'www.360decoro.com', 'phone': 
'+971563531005', 'email': 'dubai@360decoro.com'}, {'name': '360 EXPANSION CONSULTING DMCC', 'website': '', 'phone': '+97143698916', 'email': 'office@360exco.com'}, {'name': '3A COMPOSITES MIDDLE EAST (BRANCH)', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '3B TRADING DMCC', 'website': '', 'phone': '+97144328300', 'email': 'khaled.bassatne@bbenergy.com'}, {'name': '3BFAB 3D PRINTING SOLUTIONS DMCC', 'website': '', 'phone': '+97142443135', 'email': 'info@3bfab-dmcc.ae'}, {'name': '3CORE DMCC', 'website': '', 'phone': '+971505583613', 'email': 'info@3core-os.com'}, {'name': '3CUBE TECHMED DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '3CX SOFTWARE DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '3D METAL TRADING DMCC', 'website': '', 'phone': '+97142955727', 'email': 'cs@m-hq.com'}, {'name': '3EX DIAM ME DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '3F MUSIC DMCC', 'website': '', 'phone': '+971555555553', 'email': 'fardinfard@gmail.com'}, {'name': '3FIVE8 TECHNOLOGIES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '3Y AGRICULTURE RESEARCH AND CONSULTANCY DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '4 SPACE MANAGEMENT DMCC', 'website': 'www.4space.ae', 'phone': '+97144385537', 'email': 'firas@4space.ae'}, {'name': '4 WALLS ART GALLERY DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '42 INTERNATIONAL DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '4E CONSULTANCY AND CONSTRUCTION SERVICES DMCC', 'website': '', 'phone': '+971503034051', 'email': 'operations@tamimiconsultancy.com'}, {'name': '4PM CONSULTING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '4S DENTAL LABORATORY DMCC', 'website': '', 'phone': '+97144562950', 'email': 'dentallab4s@outlook.com'}, {'name': '4SIGHT BUSINESS CONSULTING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '4SIGHT 
DRIVE DMCC', 'website': 'https://4sightglobal.com/', 'phone': '+97144548601', 'email': 'dinesh@4sight-global.com'}, {'name': '4SIGHT PLUS DMCC', 'website': 'https://4sightglobal.com/', 'phone': '+971505785140', 'email': 'anjan@4sight-global.com'}, {'name': '4SIGHT RESEARCH & ANALYTICS DMCC', 'website': 'https://4sightglobal.com/', 'phone': '+97144548601', 'email': 'mahesh@4sight-global.com'}, {'name': '4T COMMODITIES & EMERGING MARKETS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '50 NORTH AGRO DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '51 PEG TECHNOLOGIES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '5TH CONSULTING DMCC', 'website': 'www.5th.ae', 'phone': '+97142778085', 'email': 'v.alarde@5th.ae'}, {'name': '6 SEGMENTS SERVICES DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '609 EXPERIENCE DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '7 BROS DMCC', 'website': '', 'phone': '+97144458378', 'email': '7brosdmcc@gmail.com'}, {'name': '73 CO DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '748 AIR SERVICES DMCC', 'website': '', 'phone': '+971528109736', 'email': 'arjexports@arj.ca'}, {'name': '777 DIAMOND DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '777 RENT A CAR DMCC', 'website': '20:00', 'phone': '+971509729407', 'email': 'afkar_hassan2010@hotmail.com'}, {'name': '79TH GRP DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '7CLICK NETWORK DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '7HEAVEN TECHNOLOGY DMCC', 'website': 'www.7heaventech.com', 'phone': '+971 55 9478601', 'email': 'imran.s@7heaventech.com'}, {'name': '7PQRS DMCC', 'website': '', 'phone': '+971588631656', 'email': 'tigas.catherine@ymail.com'}, {'name': '7TH FLOOR CONSULTING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '7TICKETS DMCC', 'website': '', 'phone': 
'+971501002473', 'email': 'feras@7tickets.com'}, {'name': '8 POINT MEDIA DMCC', 'website': '', 'phone': '+97143998158', 'email': 'sam@8pointmedia.com'}, {'name': '800 MEAT DMCC', 'website': '', 'phone': '+971506400204', 'email': 'fb-800meat@hotmail.com'}, {'name': '800DOCTOR DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '800TICKETS DMCC', 'website': '', 'phone': '+9718008425387', 'email': 'legal@itp.com'}, {'name': '818 VAULT DMCC', 'website': 'www.818vault.com', 'phone': '+971504581809', 'email': 'vik@818vault.com'}, {'name': '89 TECHNO CONSULTING COMPANY DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '8LABS GROUP COMMERCIAL BROKERS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': '8LANG DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A & A AVIATION TRADING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A & A EXCELLENCE TRADING DMCC', 'website': '', 'phone': '+971559386418', 'email': 'said@asaworldwide.net'}, {'name': 'A & A SYNERGY DMCC', 'website': '', 'phone': '+971567560353', 'email': 'ajayhalwasiya@gmail.com'}, {'name': 'A & A TECHNOLOGIES DMCC', 'website': 'www.aatechnologies.net', 'phone': '+97144412268', 'email': 'faisal@aatechnologies.net'}, {'name': 'A & D INTERNATIONAL TRADING DMCC', 'website': '', 'phone': '+971507285246', 'email': 'andrea@and-dmcc.com'}, {'name': 'A 1 CONSULTANCY SERVICES DMCC', 'website': 'www.aone-consultancy.com', 'phone': '+971528869360', 'email': 'a1.jlt.dubai@gmail.com'}, {'name': 'A AND J DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A CEUTICALS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A CONSULT DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A DIGITAL COM DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A G STAR DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A GEM STAR DMCC', 'website': '.', 
'phone': '+97144219251', 'email': 'agemstardmcc@gmail.com'}, {'name': 'A K A EXPORTS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A K DIGITAL CONSULTING MEA DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A K T MARITIME DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A M K BROKERS DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A ONE DIAM DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A R HOURIE ENTERPRISES GULF DMCC', 'website': 'www.hourie.com', 'phone': '+971585898079', 'email': 'gmouawad@hourie.com'}, {'name': 'A R IMPEX DMCC', 'website': '', 'phone': '+971558794024', 'email': 'arimpexdmcc@hotmail.com'}, {'name': 'A R J TRADING DMCC', 'website': 'None', 'phone': 'None', 'email': 'None'}, {'name': 'A SWEET SPOT DMCC', 'website': 'www.asweetspotevents.com', 'phone': '+971552388726', 'email': 'linaelfassi@asweetspotevents.com'}]

最新更新