网页抓取/使用 BeautifulSoup 抓取 Zomato 网页



我参考 https://datascienceplus.com/zomato-web-scraping-with-beautifulsoup-in-python/ 这篇文章尝试了网页抓取。

我只是复制并粘贴了该网站上的代码,但在第二步出现了错误。

import requests
from bs4 import BeautifulSoup
# A User-Agent header is supplied because the request timed out without one
# and the site asked for an agent; this dict makes the request present a
# browser-like agent string.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# Fetch the page, sending the custom headers with the GET request.
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)

下面是错误信息。我用的是 Jupyter Notebook。你知道我为什么会得到这个错误吗?我对此完全陌生,甚至不完全理解这些变量(headers、response)的作用。

---------------------------------------------------------------------------
RemoteDisconnected                        Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599                                                   body=body, headers=headers,
--> 600                                                   chunked=chunked)
601 
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383                     # otherwise it looks like a programming error was the cause.
--> 384                     six.raise_from(e, None)
385         except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379                 try:
--> 380                     httplib_response = conn.getresponse()
381                 except Exception as e:
~/anaconda3/lib/python3.7/http/client.py in getresponse(self)
1320             try:
-> 1321                 response.begin()
1322             except ConnectionError:
~/anaconda3/lib/python3.7/http/client.py in begin(self)
295         while True:
--> 296             version, status, reason = self._read_status()
297             if status != CONTINUE:
~/anaconda3/lib/python3.7/http/client.py in _read_status(self)
264             # sending a valid response.
--> 265             raise RemoteDisconnected("Remote end closed connection without"
266                                      " response")
RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
ProtocolError                             Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448                     retries=self.max_retries,
--> 449                     timeout=timeout
450                 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637             retries = retries.increment(method, url, error=e, _pool=self,
--> 638                                         _stacktrace=sys.exc_info()[2])
639             retries.sleep()
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
367             if read is False or not self._is_method_retryable(method):
--> 368                 raise six.reraise(type(error), error, _stacktrace)
369             elif read is not None:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
684         if value.__traceback__ is not tb:
--> 685             raise value.with_traceback(tb)
686         raise value
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599                                                   body=body, headers=headers,
--> 600                                                   chunked=chunked)
601 
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
383                     # otherwise it looks like a programming error was the cause.
--> 384                     six.raise_from(e, None)
385         except (SocketTimeout, BaseSSLError, SocketError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379                 try:
--> 380                     httplib_response = conn.getresponse()
381                 except Exception as e:
~/anaconda3/lib/python3.7/http/client.py in getresponse(self)
1320             try:
-> 1321                 response.begin()
1322             except ConnectionError:
~/anaconda3/lib/python3.7/http/client.py in begin(self)
295         while True:
--> 296             version, status, reason = self._read_status()
297             if status != CONTINUE:
~/anaconda3/lib/python3.7/http/client.py in _read_status(self)
264             # sending a valid response.
--> 265             raise RemoteDisconnected("Remote end closed connection without"
266                                      " response")
ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
ConnectionError                           Traceback (most recent call last)
<ipython-input-3-5f0caa95c89a> in <module>
2 #Using following code we can fake the agent.
3 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
----> 4 response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73 
74     kwargs.setdefault('allow_redirects', True)
---> 75     return request('get', url, params=params, **kwargs)
76 
77 
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58     # cases, and look like a memory leak in others.
59     with sessions.Session() as session:
---> 60         return session.request(method=method, url=url, **kwargs)
61 
62 
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531         }
532         send_kwargs.update(settings)
--> 533         resp = self.send(prep, **send_kwargs)
534 
535         return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644 
645         # Send the request
--> 646         r = adapter.send(request, **kwargs)
647 
648         # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
496 
497         except (ProtocolError, socket.error) as err:
--> 498             raise ConnectionError(err, request=request)
499 
500         except MaxRetryError as e:
ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

服务器似乎不接受您提供的 User-Agent。您可以将其缩短如下:

import requests
# Send a short, generic User-Agent instead of the long browser string.
headers = {'User-Agent': 'Mozilla/5.0'}
# requests applies no timeout by default, so an unresponsive server could block
# this call indefinitely; timeout=10 bounds the wait (in seconds).
response = requests.get("https://www.zomato.com/bangalore/top-restaurants", headers=headers, timeout=10)

最新更新