Scraping phone numbers and zip codes from a list of URLs in a CSV, exporting to another CSV

I need to scrape a list of URLs stored in a CSV and export the results to another CSV. I must have gotten something wrong, because I can't get it to run, so I'd appreciate any help.

I'm new to Python and have been piecing code together, so I'm having some trouble pinpointing where the problem is. I mixed one piece of code that imports the CSV with another that does the string searching.

import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl

Here's the code I have so far:


phn_1 = []
zipcode_1 = []
err_msg_zipcode = []
err = []

class Spider:
    name = 'spider'

    # read csv with just one url per line
    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file]

    def start_request(self):
        request = Request(url = self.start_urls, callback=self.parse)
        yield request

    def parse(self, response):

        s = response.body
        soup = BeautifulSoup(s, 'lxml')
        text = soup.get_text()

        df2 = pd.DataFrame()

        phn_1 = []     # store all the extracted phone numbers in a list
        mail_1 = []    # store all the extracted zipcodes in a list
        for row in df2.iterrows():  # parse through each url in the list
            try:
                try:
                    req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
                    gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # bypass SSL certificate verification
                    f = urlopen(req1, context=gcontext)
                    url_name = f.geturl()  # extract the final URL
                    s = f.read()
                    phone = re.findall(r'\d{3}-\d{3}-\d{4}', s, re.MULTILINE)
                    zipcode = re.findall(r'(?<=, [A-Z]{2} )\d{5}', s, re.MULTILINE)

                    if len(phone) == 0:
                        print("No phone number found.")
                        err_msg_phn = "No phone number found."
                        phn_1.append((url_name, err_msg_phn))
                    else:
                        count = 1
                        for item in phone:
                            phn_1.append((url_name, item))
                            count += 1
                        print(phn_1)

                    if len(zipcode) == 0:
                        print("No zipcode found.")
                        err_msg_zipcode = "No zipcode found."
                        zipcode_1.append((url_name, err_msg_zipcode))
                    else:
                        count = 1
                        for item in zipcode:
                            mail_1.append((url_name, item))
                            count += 1
                        print(mail_1)

                except BadStatusLine:  # catch invalid url names
                    print("could not fetch %s" % url_name)

            except HTTPError as err:  # catch HTTP 404 not found errors
                if err.code == 404:
                    print("Received HTTPError on %s" % url_name)

df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()

df_p = pd.DataFrame(phn_1, columns=['URL', 'Phone_No'])  # dataframe for url and phone number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first')  # remove duplicates

df_m = pd.DataFrame(zipcode_1, columns=['URL', 'Zipcode'])  # dataframe for url and zipcode
df_mail = df_m.drop_duplicates(subset=['URL', 'Zipcode'], keep='first')  # remove duplicates

df_final = pd.merge(df_phn, df_mail, on='URL', how='inner')  # merge the two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False, encoding='utf-8')

# convert the csv output to json
with open('result_contact.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)

Thanks!!

One obvious error I can see is:

request = Request(url = self.start_urls, callback=self.parse)

url should be a string, but you're passing it a list. If you want to send multiple requests, you need to use a loop. And since you've already set start_urls and are using the parse callback, you don't need to override start_requests at all; the default implementation will take care of it.
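If you do want to override it yourself, a minimal sketch of the loop version would look like this (note that Scrapy calls the method start_requests, plural):

    def start_requests(self):
        # yield one scrapy.Request per URL instead of passing the whole list
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)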

You could also consider setting start_urls in the __init__ method.
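A minimal sketch of that approach, assuming the same one-url-per-line urls.csv; the class name ContactSpider is just a placeholder:

    import scrapy

    class ContactSpider(scrapy.Spider):  # placeholder name
        name = 'spider'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # read the csv with one url per line
            with open('urls.csv') as f:
                self.start_urls = [line.strip() for line in f if line.strip()]

        def parse(self, response):
            # the default start_requests() requests each start_url
            # and routes the responses here
            pass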
