Invalid URL: No schema supplied. Perhaps you meant http://?



I want to write a Python 3 script that splits URLs and checks the resulting strings with the requests module in Python.

But some of the URLs redirect via a meta refresh tag, and I want Python to follow that link. The other problem is that when I send requests to the URLs in a for loop, I get

an "Invalid URL" error.

You will see what I mean more clearly in the code below:

fopen2 = open("clean url.txt", "r")
splurl = fopen2.read().split('\n')
urlcln = []
urlcln2 = []
print(splurl)
for i in splurl:
    getthis = requests.get(i)
    parserres = BeautifulSoup(getthis.text, 'html.parser')
    print(parserres)
    if "<title>" in str(parserres):
        print('yes')
    else:
        print('no')
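
On the meta-refresh part of the question: requests follows ordinary HTTP 3xx redirects by itself, but it never follows <meta http-equiv="refresh"> tags, because those live in the HTML body rather than in the response headers. A minimal sketch of chasing them manually with BeautifulSoup; the helper name and the hop limit are my own, not part of the script:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_following_meta_refresh(url, max_hops=5):
    # requests handles 3xx redirects on its own; this adds the meta-refresh hops.
    resp = requests.get(url, timeout=10)
    for _ in range(max_hops):
        soup = BeautifulSoup(resp.text, 'html.parser')
        meta = soup.find(
            'meta', attrs={'http-equiv': lambda v: v and v.lower() == 'refresh'})
        if meta is None or ';' not in meta.get('content', ''):
            return resp                      # no (further) meta redirect
        # The attribute looks like: content="0; url=/next/page"
        target = meta['content'].split(';', 1)[1].strip()
        if target.lower().startswith('url='):
            target = target[4:].strip('\'" ')
        resp = requests.get(urljoin(resp.url, target), timeout=10)
    return resp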

Here is the full source code…

import time
import sys
import re
from tqdm.auto import tqdm
import requests
from bs4 import BeautifulSoup
# from colorama import Fore, Back, Style
try:
    def finish():
        # Print the closing banner and the elapsed time (tic/toc are set below).
        print("""
==============================
Duplicated URLs Remover With its Fathers!
Developed Version ~ 1.0.3 By Ehsan Abafat
 _____ _                          _    _            __       _
| ____| |__  ___  __ _ _ __      / \  | |__   __ _ / _| __ _| |_
|  _| | '_ \/ __|/ _` | '_ \    / _ \ | '_ \ / _` | |_ / _` | __|
| |___| | | \__ \ (_| | | | |  / ___ \| |_) | (_| |  _| (_| | |_
|_____|_| |_|___/\__,_|_| |_| /_/   \_\_.__/ \__,_|_|  \__,_|\__|
=============================
clean URLs successfully!
time spent on this process:
""")
        print(toc - tic, "S")

    if sys.argv[1] == '-h':
        print(100 * "*")
        print('''
=> ~ Usage : Put dirty URLs in the "old url.txt" file and call python3 source.py -?
-s : Fully Clean URLs
-d : Clean URLs while keeping "/"
-f : Clean Duplicated URLs Example : 1 2 3 2 => 1 2 3
=> ~ Developed Version ~ 1.0.3 By Ehsan Abafat
''')
        print(100 * "*")
    elif sys.argv[1] == '-s':
        tic = time.time()
        f = open("old url.txt", "r")
        # Normalize: lowercase, force http, drop the scheme and "www."
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '')
        # Strip paths so only hosts remain, then split into lines.
        SmartRemover = re.sub(r"/(\w+)?", "", flisted).split('\n')
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(SmartRemover):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        for i in tqdm(listurl):
            if i not in dupurl:
                fullclean.append(i)
        f.close()
        flast = open("clean url.txt", "w")
        for i in fullclean:
            if i != '\n' and i != ' ' and i != '' and len(i) > 2:
                flast.write('http://' + str(i) + '\n')
        toc = time.time()
        finish()
        flast.close()
    elif sys.argv[1] == '-d':
        tic = time.time()
        f = open("old url.txt", "r")
        # Same normalization, but split on whitespace so paths are kept.
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '').split()
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(flisted):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        for i in tqdm(listurl):
            if i not in dupurl:
                fullclean.append(i)
        f.close()
        flast = open("clean url.txt", "w")
        for i in fullclean:
            if i != '\n' and i != ' ' and i != '' and len(i) > 2:
                flast.write('http://' + str(i) + '\n')
        toc = time.time()
        finish()
        flast.close()
    elif sys.argv[1] == '-f':
        tic = time.time()
        f = open("old url.txt", "r")
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '').split()
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(flisted):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        # -f keeps the first occurrence of every URL, so listurl is written as-is.
        flast = open("clean url.txt", "w")
        for i in tqdm(listurl):
            print(i)
            if i != '\n' and i != ' ' and i != '' and len(i) > 2:
                flast.write('http://' + str(i) + '\n')
        f.close()
        toc = time.time()
        finish()
        flast.close()
    else:
        print('unknown command! use python3 source.py -h')

    if len(sys.argv) == 2:
        pass
    elif len(sys.argv) == 3:
        # Telerik RadAsyncUpload fingerprint strings (declared but not yet used).
        telerikuiVul = '{ "message" : "RadAsyncUpload handler is registered succesfully, however, it may not be accessed directly." }'
        telerikBugCheckADR = "/Telerik.Web.UI.WebResource.axd?type=rau"
        fopen2 = open("clean url.txt", "r")
        splurl = fopen2.read().split('\n')
        urlcln = []
        urlcln2 = []
        print(splurl)
        for i in splurl:
            getthis = requests.get(i)
            parserres = BeautifulSoup(getthis.text, 'html.parser')
            print(parserres)
            if "<title>" in str(parserres):
                print('yes')
            else:
                print('no')
    else:
        if sys.argv[1] != '-h':
            print("use '<python3 source.py -h>' command")
        else:
            print('You are seeing the usage of this script!')
except Exception as e:
    print(e)
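
As a side note, the heart of the -s/-d/-f cleaning (drop every URL that occurs more than once, or keep one copy of each with -f) can be written much more compactly. A sketch under the same normalization rules; clean_urls is my own name, not part of the script:

from collections import Counter

def clean_urls(lines, keep_first=False):
    # Same normalization as source.py: lowercase, force http, drop scheme and "www."
    norm = [l.lower().replace('https', 'http').replace('http://', '')
            .replace('www.', '').strip() for l in lines]
    norm = [u for u in norm if len(u) > 2]
    if keep_first:                               # like -f: keep first occurrences
        return list(dict.fromkeys(norm))
    counts = Counter(norm)
    return [u for u in norm if counts[u] == 1]   # like -s/-d: drop duplicates entirely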

Your URL list has no "http://": www.address.com should be http://www.address.com.

This should fix it:

for i in splurl:
    getthis = requests.get("http://" + i)
    parserres = BeautifulSoup(getthis.text, 'html.parser')
    print(parserres)
    if "<title>" in str(parserres):
        print('yes')
    else:
        print('no')
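
Two refinements to that fix, as a sketch: split('\n') leaves a trailing empty string, and requests.get('') raises the very same "No schema supplied" error, so blank lines are worth skipping; and because the -s branch already writes an http:// prefix, it is safer to prepend the scheme only when it is actually missing:

from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

with open("clean url.txt") as fopen2:
    splurl = [line.strip() for line in fopen2 if line.strip()]  # skip blank lines

for i in splurl:
    if not urlparse(i).scheme:          # prepend a scheme only when it is missing
        i = 'http://' + i
    getthis = requests.get(i)
    parserres = BeautifulSoup(getthis.text, 'html.parser')
    print('yes' if parserres.title else 'no')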
