尝试从https://www.adapt.io/directory/industry/telecommunications/A-1抓取公司名称,并以格式抓取数据,
({"公司名称":" A &L人事服务","company_location":"Gregory, Michigan"company_website"; http://www.cac.net ", " company_webdomain ": " cac.net ",company_industry: Telecommunications, company_employee_size:None," company_revenue ":无," contact_details ": [{" contact_name ": " Doug Waite ",contact_jobtitle: " owner ", contact_email_domain: " cac.net ", contact_department:"财务和行政"},{" contact_name ": " Jim Mason ", " contact_jobtitle ": " Club ." contact_email_domain ": " cac.net ", " contact_department ": " Other "}]},…)
但是每个字段都得到空值。
import scrapy
class CompanyProfileSpider(scrapy.Spider):
name = 'companyDetails'
start_urls = ["https://www.adapt.io/directory/industry/telecommunications/A-1"]
def parse(self, response):
for company in response.xpath("//div[contains(@class, 'DirectoryList_linkItemWrapper__3F2UE ')]"):
name = company.css('a::text').get()
company_portal = company.css('a::attr(href)').get()
if company_portal is not None:
next_page = response.urljoin(company_portal)
yield scrapy.Request(next_page, callback=self.company_parse)
def company_parse(self, response):
block = response.xpath("//span[contains(@class, 'CompanyTopInfo_infoValue__27_Yo')]")
data = block.xpath("//div[contains(@class, 'CompanyTopInfo_contentWrapper__2Jkic')]")
output = {}
# for i in data:
# output[i.css('span::text').get()] = i.xpath("//span[contains(@class, 'CompanyTopInfo_infoValue__27_Yo')]").css('span::text').get()
#yield{
# 'company_data': output,
#}
company_employee = response.xpath("//div[contains(@class, 'TopContacts_roundedBorder__1a3yB undefined')]")
employee_url = company_employee.xpath("//div[contains(@class, 'TopContacts_contactName__3N-_e')]").css('a::attr(href)').getall()
for url in employee_url:
if url is not None:
next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.employee)
def employee(self, response):
company_name=response.xpath( "//div[@class='info-wrapper']//h1//text()").get(),
company_location = response.xpath("//div[@class='info-wrapper']//li//span[text()='Location']//following-sibling::node()[2]/text()").get(),
company_website = response.xpath("//div[@class='info-wrapper']//span[@class='website-url']/text()").get(),
company_industry= response.xpath("//div[@class='info-wrapper']//li//span[text()='Industry']//following-sibling::node()[2]/text()").get(),
company_employee_size = response.xpath("//div[@class='info-wrapper']//li//span[text()='Head Count']//following-sibling::node()[2]/text()").get(),
company_revenue = response.xpath("//div[@class='info-wrapper']//li//span[text()='Revenue']//following-sibling::node()[2]/text()").get(),
contact_items = response.xpath("//div[@class='top-contact-item']")
contact_name = contact_items.xpath('.//a//text()').get(),
contact_jobtitle = contact_items.xpath(".//p[@class='contact-role']//text()").get(),
email = response.xpath("//span[contains(@class,'ContactTopInfo_infoValue__DNIWM')]").css('span::text').get(),
contact_department = response.xpath("//span[contains(@class,'ContactTopInfo_infoValue__DNIWM')]")[2].css('span::text').get()
yield{
'company_name':company_name,
'company_location': company_location,
'company_website': company_website,
'company_industry': company_industry,
'company_employee_size': company_employee_size,
'company_revenue': company_revenue,
'contact_details':[
{
'contact_name':contact_name,
'contact_jobtitle': contact_jobtitle,
'email':email,
'contact_department':contact_department}
]
}
你的代码有两个问题:
.get()
后面有逗号你的XPath表达式不能工作了。例如:
company_location = response.xpath("normalize-space(//div[contains(@class, 'ContactTopInfo_companyDetailItem')]//span[text()='Location']/following-sibling::span[1])").get()
company_industry= response.xpath("normalize-space(//div[contains(@class, 'ContactTopInfo_companyDetailItem')]//span[text()='Industry']/following-sibling::span[1])").get()