Python requests 无法解码文本



我使用 Python 的 requests 和 bs4 来抓取网站,但在解码时遇到了一些问题(我认为是这样……)

# Login page of the site being scraped.
logurl = 'https://login.flash.co.za/apex/f?p=pwfone:login'
with requests.Session() as s:
    # Assign the session headers: a browser-like User-Agent plus an
    # explicit accept-encoding.
    s.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        "accept-encoding": "gzip, deflate",
    }
    response = s.get(logurl)
    response.encoding = 'utf-8'
    # Use the print() call form: the bare `print x` statement is a
    # SyntaxError on Python 3, while print(x) with a single argument
    # behaves the same on Python 2.7 and 3.x.
    print(response.status_code)
    # return 200
    print(response.content)
# returns: b'<html lang="en-us" xmlns:htmldb="http://htmldb.oracle.com">rn<head>rn<meta http-equiv="x-ua-compatible" content="IE=edge" />rn<title>Pay with Flash Login</title>rn<link rel="stylesheet" href="/i/themes/SPhone/payback/theme_3_1.css" type="text/css" />rnrn<!--[if IE]><link rel="stylesheet" href="/i/themes/SPhone/payback/ie.css" type="text/css" /><![endif]-->rn<link rel="stylesheet" href="/i/app_ui/css/Core.min.css?v=19.1.0.00.15" type="text/css" />n<link rel="stylesheet" href="/i/app_ui/css/Theme-Standard.min.css?v=19.1.0.00.15" type="text/css" />n<link rel="stylesheet" href="/i/libraries/jquery-ui/1.12.1/jquery-ui-apex.min.css?v=19.1.0.00.15" type="text/css" />nrn<link rel="stylesheet" href="/i/legacy_ui/css/5.0.min.css?v=19.1.0.00.15" type="text/css" />nrnrnrnrnrnn<script>nvar apex_img_dir = "/i/", htmldb_Img_Dir = apex_img_dir;n</script>n<script src="/i/libraries/apex/minified/desktop_all.min.js?v=19.1.0.00.15"></script>n<script src="wwv_flow.js_messages?p_app_id=1500&p_lang=en-us&p_version=19.1.0.00.15-94050580561"></script>n<script src="/i/libraries/apex/minified/legacy_pre18.min.js?v=19.1.0.00.15"></script>n<script src="/i/libraries/apex/minified/legacy_18.min.js?v=19.1.0.00.15"></script>n<script src="/i/libraries/jquery-migrate/3.0.1/jquery-migrate-3.0.1.min.js?v=19.1.0.00.15"></script>nrnrnrnrnrnn<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />nrn</head>rn<body ><form action="wwv_flow.accept" method="post" name="wwv_flow" id="wwvFlowForm" novalidate  autocomplete="off">n<input type="hidden" name="p_flow_id" value="1500" id="pFlowId" /><input type="hidden" name="p_flow_step_id" value="101" id="pFlowStepId" /><input type="hidden" name="p_instance" value="5488556618334" id="pInstance" /><input type="hidden" name="p_page_submission_id" value="119891457853323246540979814026507745288" id="pPageSubmissionId" /><input type="hidden" name="p_request" value="" id="pRequest" /><input type="hidden" name="p_reload_on_submit" value="A" 
id="pReloadOnSubmit" /><input type="hidden" value="119891457853323246540979814026507745288" id="pSalt" /><table border="0" cellpadding="0" cellspacing="0" summary="" id="t18PageBodyHead" width="80%" height="10%" align="center">rn<tr>rn<td align="center" id="t18Logo" valign="top"><br /></td>rn<td id="t18HeaderMiddle"  valign="top" width="100%"><br /></td>rn<td id="t18NavBar" valign="top"><br /><br /></td>rn</tr>rn</table>rn<table border="0" cellpadding="0" cellspacing="0" summary="" id="t18PageBody"  width="80%" height="70%" align="center">rn<tr id="t18tabscolor">rn<td valign="top" id="t18Tabs"  width="110%"></td></tr>rn<tr>rn<td valign="top" id="t18Tabs"></td></tr>rn<tr>rn<td valign="top"></td>rn<td class="t18PageRight"><br /></td>rn</tr>rn<td valign="top" id="t18ContentBody" height="100%" width="100%">rn<div id="t18Messages"><span id="APEX_SUCCESS_MESSAGE" data-template-id="165783099126927032_S" class="apex-page-success u-hidden"></span><span id="APEX_ERROR_MESSAGE" data-template-id="165783099126927032_E" class="apex-page-error u-hidden"></span></div>rn<div id="t18ContentMiddle"><table id="apex_layout_83448022784534430" border="0" class="formlayout"  role="presentation"><tr><td  align="right"><label for="P101_USERNAME" id="P101_USERNAME_LABEL" tabindex="999"><a class="t18OptionalLabelwithHelp" href="javascript:popupFieldHelp('83448127760534444','5488556618334')" tabindex="999">Username</a></label></td><td colspan="2"  align="left"><input type="text"  id="P101_USERNAME" name="P101_USERNAME" class="text_field&#x20;apex-item-text" value="" size="40" maxlength="100"  /></td></tr><tr><td  align="right"><label for="P101_PASSWORD" id="P101_PASSWORD_LABEL" tabindex="999"><a class="t18OptionalLabelwithHelp" href="javascript:popupFieldHelp('83448212820534447','5488556618334')" tabindex="999">Password</a></label></td><td  align="left"><input type="password" name="P101_PASSWORD" size="40" maxlength="100" value=""  id="P101_PASSWORD" class="password&#x20;apex-item-text"  
onkeypress="return apex.submit({request:'P101_PASSWORD',submitIfEnter:event})"  /></td></tr><tr><td  align="left"><table class="t18ButtonAlternative1" cellspacing="0" cellpadding="0" border="0"  summary="">n<tr>n<td class="t18L"><img src="/i/themes/theme_18/button_alt1_l.gif" alt="" width="11" height="18" /></td>n<td class="t18C"><a href="javascript:apex.submit(%7Brequest:&#x27;LOGIN&#x27;%7D);">Login</a></td>n<td class="t18R"><img src="/i/themes/theme_18/button_alt1_r.gif" alt="" width="11" height="18" /></td>n</tr>n</table>&nbsp;</td></tr></table></div>rn</td>rn<tr>rn<td valign="top" align="left" ><div></div><div><table border="0" cellpadding="0" cellspacing="0" summary="" id="t18PageFooter" width="100%" style="clear:both">rn<tr>rn<td id="t18Left" valign="top"></td>rn<td id="t18Center" valign="top"></td>rn                <div class="coronaBanner">rn  <div class="coronaBanner__content">rn      <strong>HAVE QUESTIONS ABOUT CORONAVIRUS? VISIT SOUTH AFRICAxe2x80x99S OFFICIAL COVID-19 RESOURCE HERE:<br>rn      <a href="https://sacoronavirus.co.za/">https://sacoronavirus.co.za/</a></strong><br><br>rn    <a class="coronaBanner__websiteLink" href="https://sacoronavirus.co.za/" rel="noopener nofollow" title="SAcoronavirus.co.za">rn      <img class="coronaBanner__websiteLinkImg" border="0" />rn    </a><br><br>rn    <a class="coronaBanner__hotlineLink" href="tel:+27800029999">rn      <strong>Emergency Hotline: 0800 029 999</strong>rn    </a><br>rn    <a class="coronaBanner__whatsappLink" href="https://wa.me/27600123456?text=Hi" rel="noopener nofollow">rn      <strong>WhatsApp Support Line: 0600-123456</strong>rn    </a>rn  </div>rn</div>rn<td id="t18Right" valign="top"><span id="t18Customize"></span><br /></td>rn</tr>rn</table></div></td>rn<td class="t18PageRight"><br /></td>rn</tr>rn<tr>rn</tr>rn</table>rn<br class="t18Break"/>rn<input type="hidden" id="pPageFormRegionChecksums" value="&#x5B;&#x5D;">n<input type="hidden" id="pPageItemsRowVersion" value="" /><input 
type="hidden" id="pPageItemsProtected" value="CpJ-L5-9OTxTAYpkj4TifA" /></form> rnrnrnnnnnn<script type="text/javascript">napex.jQuery( function() {napex.page.init( this, function() {napex.jQuery.when.apply( apex.jQuery, apex.page.loadingDeferreds ).done(function() {ntry {n(function(){nvar lTimeoutField = document.getElementById("apex_login_throttle_sec"),n    lTimeout      = lTimeoutField ? +lTimeoutField.innerHTML : 0;nif (lTimeout) {n    var lTimer = window.setInterval (n        function() {n            if (lTimeout > 0) {n                lTimeoutField.innerHTML = lTimeout;n                lTimeout--;n            } else {n                window.clearInterval(lTimer);n                var lDiv = document.getElementById("apex_login_throttle_div");n                if (lDiv) {n                    lDiv.parentNode.removeChild(lDiv);n                    return true;n                }n            }n        },n        1000 );n}})();nnnapex.item( 'P101_USERNAME' ).setFocus();n} finally {napex.event.trigger(apex.gPageContext$,'apexreadyend');n}n});n});n});</script>nrn</body>rn</html>'
# Print the decoded-text form of the same response (the raw bytes were
# printed above).
print (response.text)
#return nothing
# Four parsing attempts, each rebinding `soup`: raw bytes (.content) and
# decoded text (.text), tried with both the html5lib and html.parser
# backends. The trailing comments record what the asker observed.
soup = BeautifulSoup(response.content ,"html5lib") #returns nothing
soup = BeautifulSoup(response.text ,"html5lib") #returns nothing
soup = BeautifulSoup(response.content ,"html.parser") #returns nothing
soup = BeautifulSoup(response.text ,"html.parser") #returns nothing

奇怪的是,这在我的本地环境(Python 3.7,甚至 Python 2.7)上运行良好,但在我使用的服务器上却不行,因为它使用 Python 3.5。

如何正确解码bs4的响应?

无法使用运行 Python 3.5.9 的 Docker 容器重现该错误:

#test.py
import requests
import urllib3
from bs4 import BeautifulSoup
# Silence urllib3 warnings for this test run.
requests.packages.urllib3.disable_warnings()
# Extend the default cipher string. The string is colon-separated, so the
# appended part must start with ':'; without it the first new entry fuses
# with the last existing token instead of being added as its own entry.
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'
try:
    requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += ':HIGH:!DH:!aNULL'
except AttributeError:
    # no pyopenssl support used / needed / available
    pass

logurl = 'https://login.flash.co.za/apex/f?p=pwfone:login'
response = requests.get(logurl)
# Parse the raw bytes and look up the banner <div> by its CSS class.
soup = BeautifulSoup(response.content, "html.parser")
div = soup.find("div", class_="coronaBanner__content")
print(div)

输出:

<div class="coronaBanner__content">
<strong>HAVE QUESTIONS ABOUT CORONAVIRUS? VISIT SOUTH AFRICA’S OFFICIAL COVID-19 RESOURCE HERE:<br/>
...

使用的版本:

beautifulsoup4==4.9.0
bs4==0.0.1
certifi==2020.4.5.1
chardet==3.0.4
idna==2.9
requests==2.23.0
soupsieve==2.0
urllib3==1.25.9

下面是一个 3.7 的 Dockerfile,帮助您上手(使用一个空文件夹,并将 Dockerfile、requirements.txt 和 test.py 放入其中):

# Build on the python:3.7-slim-buster base image.
FROM python:3.7-slim-buster
# Copy only the dependency list first, ahead of the rest of the sources.
COPY /requirements.txt /app/requirements.txt
# Install the listed packages; --no-cache-dir disables pip's cache.
RUN pip install --no-cache-dir -r /app/requirements.txt
# Copy the remaining build context (including test.py) into /app.
COPY . /app
WORKDIR /app
# Default command: run the test script.
CMD ["python", "test.py"]

requirements.txt

beautifulsoup4
requests

使用:

docker build -t test .
docker run test

您不需要在标头上设置accept-encoding。以下对我有效:

import requests
from bs4 import BeautifulSoup
# Login page of the site being scraped.
logurl = 'https://login.flash.co.za/apex/f?p=pwfone:login'
with requests.Session() as s:
    # Only a browser-like User-Agent is set; no accept-encoding header.
    s.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = s.get(logurl)
    soup = BeautifulSoup(response.content, "html5lib")
    # src=True restricts the match to <img> tags that carry a src
    # attribute, so img['src'] cannot raise KeyError on an <img> that
    # has none.
    for img in soup.find_all("img", src=True):
        print(img['src'])

/i/themes/theme_18/button_alt1_l.gif
/i/themes/theme_18/button_alt1_r.gif

演示

最新更新