Python web scraper for Target



I'm a novice programmer trying to get a Python web scraper for Target.com working. I've pasted my code below.

The problem I'm trying to solve is that when I run the script, no CSV file is created at the end. The browser window opens and I can see the product data scrolling by in the console, so I'm confused about why output.csv never appears.

我将非常感谢您的帮助。谢谢!

import requests
import csv
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import html

cats = [
('Natural Laundry Detergent','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Z55t1q?Nao=0'),
('Natural All-Purpose Cleaner','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zzag5n?Nao=0'),
('Natural Dish Soaps','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zx6dg5?Nao=0'),
('Natural Hair Shampoo','https://www.target.com/c/natural-hair-care/-/N-4smdrZ56ecv?Nao=0'),
('Natural Hair Conditioner','https://www.target.com/c/natural-hair-care/-/N-4smdrZv1cqo?Nao=0'),
('Natural Body Wash','https://www.target.com/c/natural-personal-care/-/N-4smdpZ5td3p?Nao=0'),
('Baby Shampoo and Body Wash','https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ54wt4?Nao=0'),
('Baby Bath Wash' ,'https://www.target.com/c/baby-toiletries-bath-potty/baby-bath-wash/-/N-5xtjdZ5ri3m'),
('Baby Bubble Bath' ,'https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ5t3hx?Nao=0'),
('Stain Removers', 'https://www.target.com/s?searchTerm=stain+remover&facetedValue=56cpg&Nao=0'),
('Baby Lotions', 'https://www.target.com/c/baby-toiletries-bath-potty/baby-lotions/-/N-5xtjdZ5vg2t'),
('Tampons','https://www.target.com/c/tampons-feminine-products-personal-care/-/N-4y634'),
('Maxi Pads','https://www.target.com/c/maxi-pads-feminine-products-personal-care/-/N-4y633'),
('Feminine Hygiene','https://www.target.com/c/feminine-hygiene-products-personal-care/-/N-4y631'),
]

class TargetClient(object):
    def __init__(self):
        self.wd = webdriver.Chrome(executable_path=r'C:\Users\wquar\AppData\Local\Programs\Python\Python37\chromedriver.exe')
        self.base_url = 'https://www.target.com'

    def gather_product_links(self):
        # Collect the product links visible on the current listing page.
        soup = BeautifulSoup(self.wd.page_source, 'html.parser')
        divs = soup.select('div[class*="ProductCardImageWrapper"]')
        links = [self.base_url + d.a['href'] for d in divs]
        return links

    def goto_next_page(self):
        ele = self.wd.find_element_by_xpath("//a[@aria-label='next page']")
        ele.click()
        time.sleep(1.5)

    def _format_product_name(self, input):
        # Strip registered/trademark marks from the raw title.
        out = input.replace('®', '').replace('\x99', '')
        return out

    def _format_brand_name(self, input):
        # Strip trademark marks, unescape HTML entities, and map each
        # brand to a single canonical spelling.
        out = input.replace('®', '').replace('\x99', '')
        out = html.unescape(out)
        if out == "Murphy's":
            out = 'Murphy'
        elif out == 'ECOS by Earth Friendly Products':
            out = 'Ecos'
        elif out == 'Eden Body Works':
            out = 'EDEN BodyWorks'
        elif out == 'BRÖÖ':
            out = 'BRöö'
        elif out == 'Love Beauty & Planet':
            out = 'Love Beauty And Planet'
        elif out == 'Hask':
            out = 'HASK'
        elif out == 'Palmers':
            out = "Palmer's"
        elif out == 'MacVoil':
            out = "Macvoil"
        elif out == 'Dear Clark,':
            out = "Dear Clark"
        elif out == 'Earth Science Naturals':
            out = "Earth Science"
        elif out == 'PAW Patrol':
            out = "Paw Patrol"
        elif out == 'up & up™':
            out = "Up&Up"
        elif out == 'Johnson & Johnson':
            out = "Johnson's"
        elif out == 'Earth Mama Angel Baby':
            out = "Earth Mama"
        elif out == 'Mielle Organics':
            out = "Mielle"
        elif out == 'EveryDay Coconut':
            out = "Alaffia"
        elif out == 'Olivina':
            out = "OLIVINA MEN"
        elif out == 'AVALON':
            out = "Avalon"
        elif out == 'Oxi Clean':
            out = "OxiClean"
        elif out == 'Village Naturals':
            out = "Nourishing Naturals"
        elif out == 'everyone':
            out = "everyone"
        elif out == 'Savannah Bee Company':
            out = 'Savannah Bee'
        elif out == 'Camille Rose Natural':
            out = 'Camille Rose'
        return out
    def _get_product_name(self, complete_product_name, brand_name):
        # Cut the product name out of the full title, handling brands
        # whose spelling in the title differs from the brand field.
        if brand_name == 'Alaffia':
            return complete_product_name.split(' -')[0].strip()
        elif brand_name == 'SoCozy' and 'So Cozy' in complete_product_name:
            return complete_product_name.split('So Cozy')[1].split(' -')[0].strip()
        elif brand_name == 'Ecos' and 'ECOS' in complete_product_name:
            return complete_product_name.split('ECOS')[1].split(' -')[0].strip()
        elif brand_name == 'Clorox 2' and 'Clorox2' in complete_product_name:
            return complete_product_name.split('Clorox2')[1].split(' -')[0].strip()
        product_name = complete_product_name.split(brand_name)[1].split(' -')[0].strip()
        return product_name
    def scrape_product_page(self, url, category):
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        d = {}
        try:
            complete_product_name = soup('span', attrs={'data-test': 'product-title'})[0].text
        except:
            print('ERROR')
            return None
        complete_product_name = self._format_product_name(complete_product_name)
        print(complete_product_name)
        brand_name = soup.select('div[class*="ProductDetailsTitle"]')[0].text.split('Shop all')[-1].strip()
        brand_name = self._format_brand_name(brand_name)
        d['Brand'] = brand_name
        #return (complete_product_name, brand_name)
        try:
            product_name = self._get_product_name(complete_product_name, brand_name)
        except:
            print('PRODUCT ERROR')
            return None
        d['Product'] = product_name
        try:
            d['Capacity'] = soup('b', text='Capacity (Volume):')[0].next.next.strip()
        except:
            d['Capacity'] = self._parse_capacity_from_title(complete_product_name)
        try:
            d['Scent'] = soup('b', text='Scent:')[0].next.next.strip()
        except:
            d['Scent'] = ''
        try:
            d['Price'] = soup('div', attrs={'data-test': 'product-price'})[0].span.text
        except:
            d['Price'] = ''
        try:
            d['Product Form'] = soup('b', text='Product Form:')[0].next.next.strip()
        except:
            d['Product Form'] = ''
        try:
            star_rating = soup('div', attrs={'data-ref': 'rating-mask'})[0].attrs['style'].split('width:')[1]
            d['Star Rating'] = round(float(star_rating.split('%')[0]) / 20, 2)
        except:
            d['Star Rating'] = ''
        try:
            d['Number of Ratings'] = soup('span', attrs={'data-test': 'ratingCount'})[0].text.strip()
            if d['Number of Ratings'] == 'be the first!':
                d['Number of Ratings'] = 0
        except:
            d['Number of Ratings'] = ''
        try:
            d['Health Facts'] = soup('b', text='Health Facts:')[0].next.next.strip()
        except:
            d['Health Facts'] = ''
        try:
            d['Features'] = soup('b', text='Features:')[0].next.next.strip()
        except:
            d['Features'] = ''
        try:
            d['Wellness Standard'] = soup('b', text='Wellness Standard:')[0].next.next.strip()
        except:
            d['Wellness Standard'] = ''
        try:
            d['Sustainability Claims'] = soup('b', text='Sustainability Claims:')[0].next.next.strip()
        except:
            d['Sustainability Claims'] = ''
        try:
            d['Number of Uses'] = soup('b', text='Number of uses:')[0].next.next.strip()
        except:
            d['Number of Uses'] = self._parse_num_uses_from_title(complete_product_name)
        try:
            d['UPC Code'] = soup('b', text='UPC')[0].next.next.next.next.strip()
        except:
            d['UPC Code'] = ''
        d['URL'] = url
        d['Category'] = category
        d['Package Quantity'] = self._parse_quant_from_title(complete_product_name)
        return d
    def _parse_capacity_from_title(self, input):
        m = re.search(r'\d+(\.\d)? ?(fl)? ?oz', input, re.IGNORECASE)
        if m:
            return m.group()
        return ''

    def _parse_quant_from_title(self, input):
        m = re.search(r'\d+ ?pk', input)
        if m:
            return m.group().split('pk')[0].strip()
        return 1

    def _parse_num_uses_from_title(self, input):
        m = re.search(r'\d+ ?ct', input)
        if m:
            return m.group().split('ct')[0]
        return ''
    def scrape_cat(self, cat_name, url):
        h = []
        self.wd.get(url)
        links = self.gather_all_product_links()
        for l in links:
            print(l)
            res = self.scrape_product_page(l, cat_name)
            h.append(res)
        return h

    def gather_all_product_links(self):
        # Page through the category until clicking "next page" fails,
        # then return the de-duplicated product links.
        links = self.gather_product_links()
        while True:
            try:
                self.goto_next_page()
                links.extend(self.gather_product_links())
            except:
                return [l for l in list(set(links)) if '-category-' not in l]
def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    return h
    write_csv(data)

def write_csv(data):
    data = [x for x in data if x]
    f = open('output.csv','w')
    fields = ['Category', 'Brand', 'Product', 'Scent', 'Price', 'Package Quantity', 'Product Form', 'Capacity', 'Number of Uses', 'Star Rating', 'Number of Ratings', 'Health Facts', 'Features', 'Wellness Standard', 'Sustainability Claims', 'UPC Code', 'URL']
    dw = csv.DictWriter(f, fieldnames=fields)
    dw.writeheader()
    dw.writerows(data)

if __name__ == '__main__':
    main()

You have to move the call to write_csv so that it runs inside the main() function before the return statement; as posted, the line after return h can never execute. You also want to pass the accumulated list h rather than data, which only holds the last category scraped.

Something like this:

def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    write_csv(h)
    return h
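
While you're in write_csv, two more small things are worth fixing: open the file with newline='' (otherwise the csv module writes a blank row between records on Windows, and your chromedriver path suggests you're on Windows), add an explicit encoding (brand names like BRöö will otherwise crash the default codec), and use a with block so the file is closed even if something raises mid-write. A minimal sketch, same field list as yours:

def write_csv(data):
    # Drop the None entries returned when a product page failed to parse.
    rows = [x for x in data if x]
    fields = ['Category', 'Brand', 'Product', 'Scent', 'Price',
              'Package Quantity', 'Product Form', 'Capacity',
              'Number of Uses', 'Star Rating', 'Number of Ratings',
              'Health Facts', 'Features', 'Wellness Standard',
              'Sustainability Claims', 'UPC Code', 'URL']
    # newline='' stops csv from double-spacing rows on Windows; the
    # with block guarantees the file is flushed and closed on any exit.
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        dw = csv.DictWriter(f, fieldnames=fields)
        dw.writeheader()
        dw.writerows(rows)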

Hope it helps.
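
One more thing to watch for: the bare except: in gather_all_product_links ends the pagination loop on any exception, so a real bug (say, a typo inside gather_product_links) would be swallowed just as silently as your missing CSV was. A sketch that only treats Selenium errors as "no more pages" (using selenium's standard WebDriverException, which also covers a missing element):

from selenium.common.exceptions import WebDriverException

    def gather_all_product_links(self):
        links = self.gather_product_links()
        while True:
            try:
                # Raises if the "next page" link is missing or not clickable.
                self.goto_next_page()
            except WebDriverException:
                break
            links.extend(self.gather_product_links())
        # De-duplicate and drop category (non-product) links.
        return [l for l in set(links) if '-category-' not in l]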
