Object of type Selector is not JSON serializable



I want to scrape a dynamic website, and for that I need Selenium.

The links I want to scrape only open when I click a specific element. They are opened by jQuery, so my only option is to click them, since there is no href attribute or anything else that would give me the URL.

My approach looks like this:

# -*- coding: utf-8 -*-
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest


class AnofmSpider(scrapy.Spider):
    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "tableRepeat2"))
            )
        finally:
            html = driver.page_source
            response_obj = Selector(text=html)

            links = response_obj.xpath("//tbody[@id='tableRepeat2']")
            for link in links:
                driver.execute_script("arguments[0].click();", link)

                yield {
                    'Ocupatia': response_obj.xpath("//div[@id='print']/p/text()[1]")
                }

But it doesn't work.

On the line where I try to click the element, I get this error:

TypeError: Object of type Selector is not JSON serializable

I partly understand the error, but I don't know how to fix it. I need to convert that object from a Selector into a clickable button.

I searched the web for solutions and documentation, but I couldn't find anything useful.

Can anyone help me understand this error better and explain how I should fix it?

Thanks.

Actually, the data is also generated from an API call that returns a JSON response, so you can easily scrape the data from the API directly. Below is a working solution that uses pagination: 8 items per page, 32 items in total.

Code:

import scrapy
import json


class AnofmSpider(scrapy.Spider):
    name = 'anofm'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit=8&localitate=',
            method='GET',
            callback=self.parse,
            meta={'limit': 8}
        )

    def parse(self, response):
        resp = json.loads(response.body)
        hits = resp.get('lmv').get('data')
        for h in hits:
            yield {
                'Ocupatia': h.get('OCCUPATION')
            }

        total_limit = resp.get('lmv').get('total')
        next_limit = response.meta['limit'] + 8
        if next_limit <= total_limit:
            yield scrapy.Request(
                url=f'https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit={next_limit}&localitate=',
                method='GET',
                callback=self.parse,
                meta={'limit': next_limit}
            )
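If you want to verify the endpoint and the shape of its JSON before wiring it into a spider, a quick check with the requests library works too (a sketch; the lmv/data/total keys match the response structure used in the spider above):

import requests

url = ('https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php'
       '?offset=8&cauta=&select=Covasna&limit=8&localitate=')
resp = requests.get(url).json()

print(resp['lmv']['total'])       # total number of items (32 at the time of writing)
for row in resp['lmv']['data']:   # one page of results
    print(row['OCCUPATION'])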

You are mixing Scrapy objects with Selenium functions, and that creates the problem. I don't know how to convert the object, but I would simply use Selenium alone for this part:

finally:
    links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
    print('len(links):', len(links))

    for link in links:
        # doesn't work for me, even after scrolling into view first:
        #driver.execute_script("arguments[0].scrollIntoView();", link)
        #link.click()

        # open information
        driver.execute_script("arguments[0].click();", link)

        # javascript may need some time to display it
        time.sleep(1)

        # get data
        ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
        ocupatia = ocupatia.split('\n', 1)[0]         # first line
        ocupatia = ocupatia.split(':', 1)[1].strip()  # text after first `:`
        print('Ocupatia -->', ocupatia)

        # close information
        driver.find_element_by_xpath('//button[text()="Inchide"]').click()

        yield {
            'Ocupatia': ocupatia
        }
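For context, this is exactly where the original error comes from: execute_script() has to JSON-serialize its arguments for Selenium's wire protocol, and it only knows how to encode basic types and WebElements, not Scrapy Selector objects. A minimal contrast (a sketch; assumes driver is a Selenium WebDriver and Selector is imported from scrapy.selector as in the question):

# A Scrapy Selector only wraps parsed HTML text; Selenium cannot click it.
row_sel = Selector(text=driver.page_source).xpath("//tbody[@id='tableRepeat2']/tr")[0]
#driver.execute_script("arguments[0].click();", row_sel)
# -> TypeError: Object of type Selector is not JSON serializable

# A WebElement is a live reference owned by the driver, so it can be clicked.
row_elem = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")[0]
driver.execute_script("arguments[0].click();", row_elem)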

Full working code.

You can put it all in a single file and run it with python script.py without creating a project in Scrapy.

You have to change SELENIUM_DRIVER_EXECUTABLE_PATH to the correct path on your system.
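If you are not sure where the driver binary lives, you can look it up from Python first (a sketch; assumes geckodriver is already on your PATH):

import shutil

# Prints the absolute path of the geckodriver executable, or None if not found.
print(shutil.which('geckodriver'))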

import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
import time


class AnofmSpider(scrapy.Spider):
    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            #callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']
        try:
            print("try")
            element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//tbody[@id='tableRepeat2']/tr/td"))
            )
        finally:
            print("finally")
            links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
            print('len(links):', len(links))

            for link in links:
                #driver.execute_script("arguments[0].scrollIntoView();", link)
                #link.click()

                # open information
                driver.execute_script("arguments[0].click();", link)

                # javascript may need some time to display it
                time.sleep(1)

                # get data
                ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
                ocupatia = ocupatia.split('\n', 1)[0]         # first line
                ocupatia = ocupatia.split(':', 1)[1].strip()  # text after first `:`
                print('Ocupatia -->', ocupatia)

                # close information
                driver.find_element_by_xpath('//button[text()="Inchide"]').click()

                yield {
                    'Ocupatia': ocupatia
                }


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},
    'SELENIUM_DRIVER_NAME': 'firefox',
    'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',
    'SELENIUM_DRIVER_ARGUMENTS': [],  # ['-headless']
})

c.crawl(AnofmSpider)
c.start()
