使用 BeautifulSoup 解析 JavaScript var 对象（非标准 JSON）并转换为 dict



如何将 JavaScript 的 var 对象转换为 JSON 或 Python dict？

html代码

<script type="text/javascript">
var _admin_pv_props = {
from_page: 'post',
is_block_editor: 'true',
source: 'wp-admin',
blog_id: '74229154',
user_type: ''
};
_tkq = window._tkq || [];
_tkq.push( [ 'identifyUser', 70966694, 'dgkug' ] );
_tkq.push( [ 'recordEvent', 'wpcom_admin_page_view', _admin_pv_props ] );
</script>

我想获取 var _admin_pv_props 的值，我的代码如下：

# Extract the `_admin_pv_props` object literal from the page's <script> tag.
# NOTE(review): `re` is used below but not imported in this snippet, and
# `html_content` is assumed to be defined elsewhere — confirm before running.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'lxml')
# Lazy match from the assignment up to the first ';' that ends a line:
# MULTILINE lets `$` match at every line end, DOTALL lets `.` span newlines.
pattern = re.compile(r'var _admin_pv_props = .*?;$', re.MULTILINE | re.DOTALL)
# Find the <script> element whose text matches the pattern.
script = soup.find("script", text=pattern)
blog_str = pattern.search(script.text).group(0)
# Drop the declaration prefix and every ';' in the matched text, leaving only
# the `{ ... }` object literal (keys are still unquoted, so it is not JSON).
blog_str = blog_str.replace('var _admin_pv_props = ', '').replace(';', '')
print(blog_str)
{
from_page: 'post',
is_block_editor: 'true',
source: 'wp-admin',
blog_id: '74229154',
user_type: ''
}

但是 blog_str 不是标准的 JSON。

尝试:

import re
from ast import literal_eval

# Sample page fragment containing the JavaScript object literal to extract.
txt = """
<script type="text/javascript">
var _admin_pv_props = {
from_page: 'post',
is_block_editor: 'true',
source: 'wp-admin',
blog_id: '74229154',
user_type: ''
};
_tkq = window._tkq || [];
_tkq.push( [ 'identifyUser', 70966694, 'dgkug' ] );
_tkq.push( [ 'recordEvent', 'wpcom_admin_page_view', _admin_pv_props ] );
</script>
"""

# Capture the `{ ... }` literal assigned to _admin_pv_props; re.S lets `.`
# span the newlines inside the object, and the lazy `.*?` stops at the
# first `};`.
match = re.search(r"_admin_pv_props = (\{.*?\});", txt, flags=re.S)
if match is None:
    raise ValueError("_admin_pv_props assignment not found in input")
data = match.group(1)

# Quote the bare JavaScript keys so the text becomes a valid Python dict
# literal: an identifier that follows `{` or `,` and precedes `:` is wrapped
# in single quotes. The backreferences (\1, \2) re-insert the captured
# delimiter and key name.
# NOTE: this is a heuristic for simple flat objects; a string value that
# itself contains ", name:" would also be rewritten.
data = re.sub(r"([{,]\s*)([A-Za-z_$][\w$]*)\s*:", r"\1'\2':", data)

# literal_eval only evaluates Python literals, so no code from the page runs.
data = literal_eval(data)
print(data)

打印:

{
"from_page": "post",
"is_block_editor": "true",
"source": "wp-admin",
"blog_id": "74229154",
"user_type": "",
}

最新更新