我使用 Scrapy 来抓取网站上的项目,其中包含很多信息,包括图像网址。你能帮我弄清楚如何根据这些网址(data["image_urls"])下载图像吗?
我知道我必须扩展媒体管道,因为 Scrapy 不管理嵌套的 url,但我在这个过程中迷失了方向。
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from auctions_results.items import AuctionItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from datetime import datetime
class Spider(scrapy.Spider):
    """Scrape auction results for every entry of the packaged input JSON.

    Each entry of the input file is expected to carry a ``gm_url`` key; the
    page behind that URL is fetched and every ``div.car-item-border`` element
    on it is turned into an ``AuctionItem`` that is appended to the entry's
    ``results`` list.
    """

    name = 'results'

    def __init__(self, *args, **kwargs):
        """Initialise the spider and load the input entries into ``self.data``.

        All positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__``.
        """
        # The base initialiser must run, otherwise Scrapy's own spider setup
        # (including spider arguments passed on the command line) is skipped.
        super().__init__(*args, **kwargs)
        data_file = pkgutil.get_data(
            "auctions_results", "json/input/scrape_demo_db.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        """Yield one request per input entry, carrying the entry in ``meta``."""
        for item in self.data:
            request = scrapy.Request(item['gm_url'], callback=self.parse)
            # The entry travels with the request so parse() can extend it.
            request.meta['item'] = item
            yield request

    def parse(self, response):
        """Collect every car listed on the page into ``item['results']``.

        Yields the input entry once, after all cars on the page were parsed.
        """
        item = response.meta['item']
        item['results'] = []
        for caritem in response.css("div.car-item-border"):
            data = AuctionItem()
            # Second whitespace-separated token of the "make" text.
            data["marque"] = caritem.css(
                "div.make::text").extract_first().strip().split(" ", 2)[1]

            # House and country are both cut from the same text node, so the
            # selector is evaluated only once.
            house_text = caritem.css("div.auctionHouse::text").extract_first()
            data["auction_house"] = house_text.split("-", 1)[0].strip()
            data["auction_country"] = house_text.rsplit(",", 1)[1].strip()

            data["auction_date"] = caritem.css(
                "div.date::text").extract_first().replace(",", "").strip()
            # A single URL string (or None when no image is present), not a
            # list: the nested structure is handled by a custom pipeline.
            data["image_urls"] = caritem.css(
                "div.view-auction a img::attr(src)").extract_first()
            item['results'].append(data)
        yield item
当我用 JSON 编码结果时,它看起来像这样:
[{
"gm_url": "url",
"results": [{
"marque": "ferrari",
"auction_house": "auction",
"auction_country": "japan",
"auction_date": "2019",
"image_urls": "imgurl"
},
{
"marque": "porsche",
"auction_house": "auction2",
"auction_country": "gb",
"auction_date": "2018",
"image_urls": "imgurl2"
}]
}, ....]
在 items.py 中添加图像字段,并在 settings.py 中启用 ITEM_PIPELINES、设置 IMAGES_STORE。
你可以为此覆盖 ImagesPipeline 的 get_media_requests 方法:
class DownloadImagesPipeline(ImagesPipeline):
    """Images pipeline that downloads the image of every nested result.

    The image URLs are read from ``item['results'][i]['image_urls']`` (one
    URL string per result) instead of a top-level URL list on the item.
    """

    # Optional headers sent with every image request. ``None`` sends no
    # extra headers; override in a subclass or assign a dict to customise.
    # NOTE(review): the original referenced a bare ``headers`` name that is
    # not defined anywhere in the visible source — confirm nothing relied
    # on a module-level ``headers`` dict.
    request_headers = None

    def get_media_requests(self, item, info):
        """Yield one download request per result that has an image URL.

        Args:
            item: scraped entry whose ``results`` list holds the nested
                results, each with an ``image_urls`` value.
            info: pipeline bookkeeping object passed in by Scrapy (unused).
        """
        # Imported here because the visible file-level imports only bring in
        # ``scrapy`` and ``ImagesPipeline``, not the bare ``Request`` name.
        from scrapy import Request

        for result in item['results']:
            image_url = result['image_urls']
            if not image_url:
                # The spider stores None when a result has no image; a
                # request cannot be built without a URL, so skip it.
                continue
            yield Request(url=image_url, headers=self.request_headers)
在 settings.py 的 ITEM_PIPELINES 中,应停用默认的 ImagesPipeline,并将其替换为上面的 DownloadImagesPipeline。