使用 Python 的 BeautifulSoup,根据 CSV 文件中存储的锚文本和 URL 为 HTML 添加超链接



我想用 Python 的 BeautifulSoup 写一个程序,根据一个包含锚文本和超链接的 CSV 文件,为 HTML 中对应的词语添加超链接。

包含2列的CSV文件:

anchor_text,hyperlink
Google,https://www.google.com
Bing,https://bing.com
Yahoo,https://yahoo.com
Active Campaign,https://activecampaign.com

你应该在外层循环中遍历锚文本/链接对,然后在内层循环中拆分匹配到的字符串:

import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4 import element as bs4_element
import csv
# Sample WordPress-style HTML used to demonstrate the hyperlink insertion.
html_doc = """
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph -->
"""
soup = BeautifulSoup(html_doc, 'html.parser')

# Read the CSV file: first column is the anchor text, second the hyperlink.
# newline='' is what the csv module documents for files passed to csv.reader.
# Rows with fewer than two cells (e.g. blank lines) are skipped instead of
# letting dict() raise ValueError; a later duplicate anchor overrides an
# earlier one, exactly as dict(reader) would.
# NOTE(review): a header row, if the file has one, is treated like any other
# anchor/link pair -- confirm whether it should be skipped.
with open('file.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    hyperlinks = {row[0]: row[1] for row in reader if len(row) >= 2}

# from bs4 import element as bs4_element
be_navStr = bs4_element.NavigableString

# (anchor text, URL) pairs with surrounding whitespace removed; pairs where
# either side is blank are dropped.
hList = [
    (anchor_text.strip(), hyperlink.strip())
    for anchor_text, hyperlink in hyperlinks.items()
    if anchor_text.strip() and hyperlink.strip()  # no blanks
]

print('#'*35, 'OLD', '#'*35, '\n')
print(soup, '\n')
print('#'*75, '\n\n\n')

for txt, link in hList:
    # Case-insensitive, whole-word match of the literal anchor text: it must
    # not be glued to a word character on either side, so "bing" is not found
    # inside "Subbing" but "Active Campaign." (trailing punctuation) matches.
    pattern = re.compile(
        r'(?<!\w)' + re.escape(txt) + r'(?!\w)', re.IGNORECASE
    )

    # Snapshot the candidate text nodes first because the tree is modified
    # below. The exact type check (not isinstance) is deliberate: subclasses
    # of NavigableString such as Comment must be left alone.
    navStrs = [
        d for d in soup.descendants
        if type(d) is be_navStr and pattern.search(str(d))
    ]

    for ns in navStrs:
        text = str(ns)

        # Never create a link nested inside an existing link.
        enclosing = ns.find_parent('a')
        if enclosing is not None:
            # If the existing link's own text is exactly the anchor text,
            # point it at the URL from the CSV.
            # Comment out the assignment if you don't want to update links.
            if enclosing is ns.parent and text.strip().lower() == txt.lower():
                enclosing['href'] = link
            continue

        # Split the text node around every match: plain text and new <a>
        # tags are inserted before the node in document order, then the node
        # itself is replaced by whatever text follows the last match.
        pos = 0
        for match in pattern.finditer(text):
            if match.start() > pos:
                ns.insert_before(text[pos:match.start()])
            hlTag = soup.new_tag('a', href=link)
            hlTag.append(match.group(0))  # keep the document's original casing
            ns.insert_before(hlTag)
            pos = match.end()

        tail = text[pos:]
        if tail:
            ns.replace_with(tail)
        else:
            # The match ended the string; drop the now-empty node.
            ns.extract()

print('#'*35, 'NEW', '#'*35, '\n')
print(soup, '\n')
print('#'*75)

打印输出:

################################### OLD ################################### 
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph --> 
########################################################################### 

################################### NEW ################################### 
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another <a href="https://www.google.com">Google</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase <a href="https://bing.com">bing</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word <a href="https://activecampaign.com">Active Campaign</a>.</p>
<!-- /wp:paragraph --> 
###########################################################################

即使同一个字符串中有多个匹配,这段代码也应该有效,只要这些匹配互不重叠(例如 "Google Chrome" 和 "Chrome Beta" 就会重叠)。

最新更新