如何使用python在word文档中查找和替换指定标签之间的文本?



我想用 Python 打开一个 Word 文档,在保持文档原有格式的前提下,只把开始标记和结束标记之间的文本设为粗体。

我有一段代码,但它在每个段落中只会把最后一处匹配的文本加粗;而我需要把每个段落中所有匹配的文本都加粗。我使用的代码如下:
import docx

# Load the Word document that contains the tagged text
doc = docx.Document('sample1.docx')

# Markers that delimit the text to be rendered in bold
start_tag = '<strong>'
end_tag = '</strong>'

# Visit every paragraph and scan its plain text for tag pairs
for para in doc.paragraphs:
    text = para.text
    start_index = 0
    while True:
        # Position of the next opening tag at or after the current offset
        start_index = text.find(start_tag, start_index)
        print(start_index)
        if start_index == -1:
            break
        # The closing tag is searched for only after the opening tag
        end_index = text.find(end_tag, start_index + len(start_tag))
        if end_index == -1:
            break

        # Text enclosed by this tag pair
        bold_text = text[start_index + len(start_tag):end_index]

        # Rebuild the paragraph as three runs: before / bold / after.
        # NOTE: the paragraph is cleared and rebuilt from the ORIGINAL text
        # on every iteration, so runs created for earlier matches are
        # discarded and only the last match in the paragraph ends up bold.
        para.clear()
        para.add_run(text[:start_index])
        para.add_run(bold_text).bold = True
        para.add_run(text[end_index + len(end_tag):])

        # Continue searching after the closing tag
        start_index = end_index + len(end_tag)

# Write the result to a new file
doc.save('sample_modified.docx')

从word文件中取出的结果如下:

<strong>NOTE:</strong> You <strong>could</strong> obviously do the same <strong>with</strong> ANY style that has a one-to-one mapping from.

此代码只会把最后一处匹配到的文本('has')设为粗体并去掉其标记。但是,我需要对每一处匹配都执行相同的操作。你能帮我解决这个问题吗?

你需要先把文本拆分成若干部分,然后重建段落,并对位于标签之间的部分应用相应的格式;
代码已更新,加入了评论中提到的额外格式处理。

import docx
import re

# Open the Word document
doc = docx.Document('sample.docx')

# Tag names that carry formatting. 'strong em' is a synthetic combined tag
# created below from a nested <strong><em> ... </em></strong> pair.
tags_list = ['strong', 'em', 'strong em']

# One tagged span: an opening tag, text without '<', and the matching
# closing tag. The backreference \1 ties the closing tag to the opening one.
pattern = '<(' + '|'.join(tags_list) + r')>[^<]*</\1>'

for para in doc.paragraphs:
    para_text = para.text
    if para_text == '':
        continue

    # Combine the nested strong/em pair into a single tag
    text = para_text.replace('<strong><em>', '<strong em>')
    text = text.replace('</em></strong>', '</strong em>')

    # Start/end offsets of every tagged span, flattened into one list
    indexes = []
    for match in re.finditer(pattern, text, re.DOTALL):
        indexes += match.span()

    # Nothing to format: leave the paragraph (and its runs) untouched
    if not indexes:
        continue

    # Build the replacement in a new paragraph placed before the old one,
    # then empty the old one so the text is not duplicated
    newPara = para.insert_paragraph_before()
    para.clear()

    # Split the text at every span boundary
    parts = [text[i:j] for i, j in zip(indexes, indexes[1:] + [None])]
    if indexes[0] != 0:
        parts.insert(0, text[:indexes[0]])

    # Re-add each part as its own run, applying formatting to tagged parts
    for part in parts:
        # Adjacent spans or a span at the very end yield empty slices
        if not part:
            continue
        found = re.search('<[^<]+?>', part) if part.startswith('<') else None
        if found:
            tag = found.group(0)
            run = newPara.add_run(re.sub('<[^<]+?>', '', part))
            if "<strong" in tag:
                run.bold = True
            if "em>" in tag:
                run.italic = True
        else:
            # Untagged text is added as-is, minus line breaks
            newPara.add_run(part.replace('\n', ''))

# Remove empty paragraphs, including the ones emptied above.
# NOTE: this drops every paragraph whose text is empty, not only the
# paragraphs cleared by this script.
for para in doc.paragraphs:
    if len(para.text) == 0:
        p = para._element
        p.getparent().remove(p)
        # Detach the proxy from the removed XML element
        para._p = para._element = None

doc.save('sample_modified.docx')

第三个样本数据处理前后的对比

下面是我在 moken 的回答基础上改进得到的最终版本代码。其中使用 Python 的 win32com 库删除 Word 文档中的空行,问题由此得到解决。

import docx
import re
import numpy as np
import inspect, os
import win32com.client as win32
import win32com.client
def _group_tagged(text, open_tag, close_tag):
    """Split *text* into pieces, keeping each tagged region as one piece.

    A tagged region runs from an opening tag to the closing tag that brings
    the nesting depth back to zero, so nested pairs stay inside the same
    piece. Text outside any region is returned as separate pieces.
    """
    # Escape the tags so characters such as '[' are matched literally
    splitter = f'({re.escape(open_tag)}|{re.escape(close_tag)})'
    result = []
    depth = 0
    for piece in re.split(splitter, text):
        if not piece:
            continue
        if piece == open_tag:
            depth += 1
            if depth == 1:
                # Outermost opening tag starts a new piece
                result.append(open_tag)
            else:
                result[-1] += open_tag
        elif piece == close_tag:
            # A stray closing tag must not push the depth below zero,
            # otherwise the next real opening tag would be mis-grouped
            if depth > 0:
                depth -= 1
            if result:
                result[-1] += close_tag
            else:
                result.append(close_tag)
        elif depth > 0:
            # Inside a tagged region: extend the current piece
            result[-1] += piece
        else:
            result.append(piece)
    return result


def split_tagged_sections(text, start_tag, end_tag):
    """Split *text* into tagged and untagged pieces.

    If *text* contains ``<strong>``, the split is done on
    ``<strong>``/``</strong>`` and *start_tag*/*end_tag* are ignored;
    otherwise the split is done on *start_tag*/*end_tag*.

    Args:
        text: The string to split.
        start_tag: Opening tag used when *text* has no ``<strong>``.
        end_tag: Closing tag used when *text* has no ``<strong>``.

    Returns:
        A list of strings. Each tagged region (tags included, nested pairs
        kept together) is one element; text outside regions forms the other
        elements. Empty input yields an empty list.
    """
    if '<strong>' in text:
        return _group_tagged(text, '<strong>', '</strong>')
    return _group_tagged(text, start_tag, end_tag)
def remove_empty_lines():
    """Collapse empty lines in the Word document named by ``doc_name``.

    Opens the document through the Word COM application, replaces every
    pair of consecutive paragraph marks with one and every pair of
    consecutive manual line breaks with one, then saves the document in
    place. The file is looked up in the directory of this script.

    Reads the module-level ``doc_name`` and mirrors it on
    ``remove_empty_lines.doc_name``.
    """
    remove_empty_lines.doc_name = doc_name

    app = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        file_path = os.path.join(script_dir, remove_empty_lines.doc_name)
        doc = app.Documents.Open(file_path)
        try:
            find = app.Selection.Find
            find.ClearFormatting()
            find.Replacement.ClearFormatting()

            # Two consecutive paragraph marks -> one
            find.Text = "^p^p"
            find.Replacement.Text = "^p"
            find.Execute(Replace=win32.constants.wdReplaceAll)

            # Two consecutive manual line breaks -> one
            find.Text = "^l^l"
            find.Replacement.Text = "^l"
            find.Execute(Replace=win32.constants.wdReplaceAll)

            doc.Save()
        finally:
            # Changes were saved above on success; on failure, close
            # without saving rather than leaving the document open
            doc.Close(SaveChanges=win32.constants.wdDoNotSaveChanges)
    finally:
        # Always shut Word down, even if a COM call failed
        app.Quit()

def tag_resolve():
    """Turn ``<strong>``/``<em>`` markup in the document into bold/italic runs.

    Reads the module-level ``doc_name`` (mirrored on
    ``tag_resolve.doc_name``), opens that document with python-docx and
    processes every paragraph that contains ``<strong>``. Such a paragraph
    is rebuilt in a new paragraph inserted before it and the original is
    emptied. The document is then saved back to the same name and
    ``remove_empty_lines()`` is called.

    Paragraphs without ``<strong>`` are left untouched, as are paragraphs
    where no ``<strong>``/``</strong>`` pair can be formed.
    """
    tag_resolve.doc_name = doc_name
    doc = docx.Document(tag_resolve.doc_name)

    # Opening and closing tags for bold and italic
    start_tag_b = '<strong>'
    end_tag_b = '</strong>'
    start_tag_i = '<em>'
    end_tag_i = '</em>'

    for para in doc.paragraphs:
        para_text = para.text
        # Also skips empty paragraphs
        if start_tag_b not in para_text:
            continue

        # Offsets where bold regions begin and end, paired in order
        starts = [m.start() for m in re.finditer(start_tag_b, para_text)]
        ends = [m.end() for m in re.finditer(end_tag_b, para_text)]
        indexes = [pos for pair in zip(starts, ends) for pos in pair]
        # Opening tag without a closing tag: nothing to rebuild
        if not indexes:
            continue
        if indexes[0] != 0:
            indexes.insert(0, 0)

        # Alternating bold / non-bold slices of the paragraph text
        sections = [para_text[i:j] for i, j in zip(indexes, indexes[1:] + [None])]

        # Build the replacement before the old paragraph, then empty the
        # old one so the text is not duplicated
        newPara = para.insert_paragraph_before(text=None, style=None)
        para.clear()

        for section in sections:
            for piece in split_tagged_sections(section, start_tag_i, end_tag_i):
                if start_tag_b in piece:
                    # Bold piece: drop all tags; italic too if it has <em>
                    run = newPara.add_run(re.sub('<[^<]+?>', '', piece))
                    run.bold = True
                    if start_tag_i in piece:
                        run.italic = True
                elif start_tag_i in piece:
                    # Italic-only piece: drop the <em> tags
                    plain = piece.replace(start_tag_i, '').replace(end_tag_i, '')
                    run = newPara.add_run(plain)
                    run.italic = True
                else:
                    # Untagged text is added as-is, minus line breaks
                    newPara.add_run(piece.replace('\n', ''))

    doc.save(tag_resolve.doc_name)

    # Clean up the paragraphs emptied above
    remove_empty_lines()


# Name of the Word document read through the module-level ``doc_name``
doc_name = 'sample1.docx'


def main():
    """Run ``tag_resolve()``."""
    tag_resolve()

应用此代码后,处理后的结果文本如下所示,并附带原始文本。

https://i.stack.imgur.com/c3SeJ.png

最新更新