网页抓取规则创建



我在这个页面上: http://www.metacritic.com/browse/games/title/ps4/a?view=condensed

我想进入每个游戏条目的详情页并获取其开发商(developer)和类型(genre),但我的代码似乎不起作用。

例如,我想进入此页面:http://www.metacritic.com/game/playstation-4/angry-birds-star-wars

抓取完该页面后返回,继续处理其余条目,并把结果写入数据库。我应该修改代码中的哪些地方才能让它正常工作?目前数据库中的 dev(开发商)和 genre(类型)两列为空,但其余数据都能正常获取,看起来就像从未进入过 parseGame 一样。

此外,我在 parseGame 中添加了打印语句,但没有任何一条被打印出来。

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from metacritic.items import MetacriticItem
import MySQLdb
import re
from string import lowercase
# Spider from the question, kept exactly as posted. The paste lost its
# indentation: the three functions below are meant to be methods of this class.
# NOTE(review): start_requests reads self.max_id, but no `max_id` (or `name`)
# attribute is defined anywhere in this snippet.
class MetacriticSpider(BaseSpider):
# Queue one listing-page request for every letter a-z and every page index
# below self.max_id; each response is handled by parseps4.
def start_requests(self):
    # iterate through the PS4 listing pages
    for c in lowercase:
        for i in range(self.max_id):
            yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback = self.parseps4)
    # (describes parseGame below) gets the developer and genre of a game
# Callback intended for a single game's detail page: reads the item passed in
# through the request meta, fills in 'dev' and 'genre', and inserts them.
def parseGame(self, response):
    # Debug marker added to check whether this callback is ever invoked.
    print("Here")
    # The partially filled item handed over by the request that led here.
    item = response.meta['item']
    db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
    cursor = db1.cursor()
    hxs = HtmlXPathSelector(response)   
    sites = hxs.select('//div[@class="product_wrap"]')
    items = []
    # NOTE(review): `site` is never assigned in this function (only `sites`
    # is), so the next two lines would raise NameError if this ever ran.
    item['dev'] = site.xpath('.//span[contains(@class, "summary_detail developer")]/span[1]/text()').extract()
    item['genre'] = site.xpath('.//span[contains(@class, "summary_detail product_genre")]/span[1]/text()').extract()    
    # This is a second INSERT with only dev/genre; the title/score INSERT
    # happens separately in parseps4.
    cursor.execute("INSERT INTO ps4 (dev, genre) VALUES (%s,%s)",[item['dev'][0],item['genre'][0]])
    # `items` is filled but never returned or yielded from this function.
    items.append(item)
    print item['dev']
    print item['genre']
# Callback for a listing page: builds one item per product row, inserts the
# title/scores/release date, and prepares a request for the game's own page.
def parseps4(self, response):
    # database handle, selector over the response, and the product rows
    db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
    cursor = db1.cursor()
    hxs = HtmlXPathSelector(response)   
    sites = hxs.select('//div[@class="product_wrap"]')
    items = []
    # iterates through each product row
    for site in sites:
        # presumably the connection's context manager scopes a transaction
        # per row -- confirm against the MySQLdb documentation
        with db1:
            item = MetacriticItem()
            # populate the item; each field holds the raw extract() list
            item['title'] = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
            item['cscore'] = site.xpath('.//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract() 
            item['uscore'] = site.xpath('.//div/ul/li/span[contains(@class, "data textscore")]/text()').extract()
            item['release'] = site.xpath('.//li[contains(@class, "stat release_date full_release_date")]/span[2]/text()').extract()
            # Insert only when at least one score is present: the condition is
            # false only if both cscore and uscore contain "tbd".
            if ("tbd" in item['cscore'][0] and "tbd" not in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" not in item['uscore'][0]):
                # Title has its whitespace collapsed and the first "(PS4)" removed.
                cursor.execute("INSERT INTO ps4 (title, criticalscore, userscore, releasedate) VALUES (%s,%s,%s, %s)",[(' '.join(item['title'][0].split())).replace("(PS4)","",1),item['cscore'][0],item['uscore'][0],item['release'][0]])
                items.append(item)
            itemLink = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/@href' ).extract()
            # NOTE(review): the request is built and the item attached, but
            # `req` is never yielded or returned, so nothing in this function
            # hands it to Scrapy -- consistent with parseGame never printing.
            req = Request('http://www.metacritic.com' +  itemLink[0], callback = self.parseGame)
            req.meta['item'] = item

代码中的几个问题:

  • 应通过 Request 的 meta 参数传入字典 {'item': item}
  • HtmlXPathSelector已弃用 - 改用Selector
  • 我认为您不应该在蜘蛛中执行mysql插入 - 改用数据库管道:
    • 在 Scrapy 中将项目写入 MySQL 数据库
  • 您需要取 extract() 返回列表的第一项并对其调用 strip()(这样字段中保存的是字符串而不是列表,并且不含首尾的空格和换行符)

下面是没有mysql相关调用的代码:

from string import lowercase
from scrapy.item import Field, Item
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector, Selector
from metacritic.items import MetacriticItem

class MetacriticSpider(BaseSpider):
    """Crawl Metacritic's PS4 title listings and follow each game's page.

    Listing pages are parsed by ``parseps4``, which builds one item per
    product row and requests the game's detail page with the item attached
    to the request ``meta``. ``parseGame`` receives that item back and is
    where the detail-page fields are meant to be filled in before the item
    is yielded.
    """

    name = 'metacritic'
    allowed_domains = ['metacritic.com']
    max_id = 1 # your max_id value goes here!!!

    @staticmethod
    def _first_text(node, query):
        """Return the stripped first result of *query* under *node*, or ''.

        Avoids the IndexError that ``extract()[0]`` raises when a row lacks
        the requested element, which would abort the rest of the page.
        """
        matches = node.xpath(query).extract()
        return matches[0].strip() if matches else ''

    def start_requests(self):
        """Yield one listing request per letter a-z and page index < max_id."""
        for c in lowercase:
            for i in range(self.max_id):
                yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback=self.parseps4)

    def parseGame(self, response):
        """Complete the item carried in ``response.meta`` and yield it."""
        item = response.meta['item']
        # Selector/.xpath() replace the deprecated HtmlXPathSelector/.select(),
        # matching what parseps4 uses.
        sel = Selector(response)
        site = sel.xpath('//div[@class="product_wrap"]')
        # get additional data!!!
        yield item

    def parseps4(self, response):
        """Build an item per product row and request its detail page."""
        sel = Selector(response)
        for site in sel.xpath('//div[@class="product_wrap"]'):
            link = self._first_text(site, './/div[contains(@class, "basic_stat product_title")]/a/@href')
            if not link:
                # No detail link means there is nothing to follow for this row.
                continue
            item = MetacriticItem()
            item['title'] = self._first_text(site, './/div[contains(@class, "basic_stat product_title")]/a/text()')
            item['cscore'] = self._first_text(site, './/div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()')
            item['uscore'] = self._first_text(site, './/div/ul/li/span[contains(@class, "data textscore")]/text()')
            item['release'] = self._first_text(site, './/li[contains(@class, "stat release_date full_release_date")]/span[2]/text()')
            # Strip any leading slash so a root-relative href does not produce
            # "http://www.metacritic.com//game/...".
            yield Request('http://www.metacritic.com/' + link.lstrip('/'), meta={'item': item}, callback=self.parseGame)

它对我有用 - 我在控制台上看到了parseGame()产生的项目。

请先确认它能产生(yield)项目,然后查看带有 !!! 的注释,并相应地补全这些行。

之后,如果您在控制台上看到项目,请尝试创建数据库管道以将项目写入 mysql。

最新更新