Python-docx:查找Word文档中的所有占位符数字并将其替换为随机数字



我在查找和替换Word文件段落中出现的多个占位符时遇到问题。这是一本游戏书,所以我试图为起草这本书时使用的占位符提供随机输入数字。

所有占位符都以"#"开头(例如 #1-5、#22-1 等)。预设的条目编号(例如第一个条目,始终为"1")不带"#"前缀。占位符条目通过 zip 与随机生成的对应编号配对成元组,以供引用。

这一切对标题都有效,因为那是按顺序进行的、直接的一对一段落替换。问题出现在我遍历常规段落时(代码的倒数第二部分)。它似乎只替换了前八个数字,然后就停止了。我试过设置一个循环,但似乎没有帮助。不确定我遗漏了什么。代码如下。

编辑:以下是两个列表和引用元组的设置方式。在这个测试中,只设置了第一个条目,没有段落中的引用。所有其他条目都将被随机分组,并在段落中替换。

entryWorking:['#1-1', '#1-2', '#1-3', '#1-4', '#1-5', '#1-6', '#1-7', '#1-8', '#2', '#2-1', '#2-2', '#2-3', '#2-4', '#2-5', '#2-6', '#2-7', '#16', '#17', '#3', '#3-1', '#3-2', '#3-3', '#3-4', '#3-5', '#3-6', '#3-8', '#3-9']

entryNumbers:['2', '20', '12', '27', '23', '4', '11', '16', '26', '7', '25', '5', '3', '15', '17', '6', '18', '22', '10', '21', '19', '13', '28', '8', '14', '9', '24']

reference:(('#1-1', '2'), ('#1-2', '20'), ('#1-3', '12'), ('#1-4', '27'), ('#1-5', '23'), ('#1-6', '4'), ('#1-7', '11'), ('#1-8', '16'), ('#2', '26'), ('#2-1', '7'), ('#2-2', '25'), ('#2-3', '5'), ('#2-4', '3'), ('#2-5', '15'), ('#2-6', '17'), ('#2-7', '6'), ('#16', '18'), ('#17', '22'), ('#3', '10'), ('#3-1', '21'), ('#3-2', '19'), ('#3-3', '13'), ('#3-4', '28'), ('#3-5', '8'), ('#3-6', '14'), ('#3-8', '9'), ('#3-9', '24'))

谢谢你的协助。

import sys, os, random
from docx import *
# Filled further down with the text of every entry heading found in the document.
entryWorking = [] # The placeholder entries created for the draft gamebook

# Identify all paragraphs with a specific heading style (e.g. 'Heading 2')
def iter_headings(paragraphs, heading):
    """Yield each paragraph whose style name starts with `heading`.

    Args:
        paragraphs: Iterable of paragraph objects exposing `style.name`
            (e.g. `document.paragraphs`).
        heading: Style-name prefix to match, such as 'Heading 2'.

    Yields:
        The matching paragraphs, in their original order.
    """
    for paragraph in paragraphs:
        style_name = paragraph.style.name
        # A style without a name cannot match any prefix; skip it instead of
        # failing on `None.startswith`.
        if style_name is not None and style_name.startswith(heading):
            yield paragraph

# Open the .docx file
document = Document('TestFile.docx')

# Search document for unique placeholder entries (must have a unique heading style)
for heading in iter_headings(document.paragraphs, 'Heading 2'):
    entryWorking.append(heading.text)

# Create the list of gamebook entry numbers, one per heading, as strings
# ('1' .. 'N').
entryNumbers = [str(number) for number in range(1, len(entryWorking) + 1)]

# Identify pre-set entries (such as Entry 1), and remove from both lists.
# This avoids pre-set numbers being replaced (i.e. they remain as is in the .docx).
# Pre-set entry numbers must _not_ have the "#" prefix in the .docx.
# New lists are built instead of calling .remove() on the list being iterated,
# because removing during iteration makes the loop skip the following element.
preset_entries = [text for text in entryWorking if not text.startswith('#')]
entryNumbers = [number for number in entryNumbers if number not in preset_entries]
entryWorking = [text for text in entryWorking if text.startswith('#')]

# Shuffle new entry numbers
random.shuffle(entryNumbers)

# Create tuple list of placeholder entries paired with random entry
reference = tuple(zip(entryWorking, entryNumbers))

# Replace placeholder headings with assigned randomized entry
for heading in iter_headings(document.paragraphs, 'Heading 2'):
    for placeholder, number in reference:
        if heading.text == placeholder:
            heading.text = number
            break

# Search through paragraphs for placeholders and replace with randomized entry.
# NOTE: a placeholder is only replaced when it is the *entire* text of a single
# run. Placeholders embedded in a longer run, or split across several runs, are
# left untouched by this loop.
for paragraph in document.paragraphs:
    for run in paragraph.runs:
        for placeholder, number in reference:
            if run.text == placeholder:
                run.text = number
                break

# Save the new document with final entries
document.save('Output.docx')

在 Word 中,run(连续文本块)可能在文本中的任意位置断开:

  • python-docx 中的 run 级别内容是什么?

  • 如何用python 有效替换word文档中的句子

你可能对这个答案中的链接感兴趣,这些链接展示了在一般情况下做这类事情所需的(复杂得令人惊讶的)工作:

如何使用python-docx替换Word文档中的文本并保存

有几个段落级函数可以很好地完成这项工作,可以在python docx的GitHub网站上找到。

这将用替换字符串替换regex匹配。替换字符串的格式将与匹配字符串的第一个字符相同。

这将隔离出一个 run,使得某些格式可以应用于该单词或短语,比如在文本中突出显示"foobar",或者将其加粗或以更大的字体显示。

幸运的是,这些代码通常可以直接复制粘贴使用,效果很好 :)

感谢scanny的协助!

我在使其工作后发现的最后一个问题是:需要给占位符添加一个"#"后缀,以确保它们是唯一的(例如,避免 #2 的随机条目被代入 #2-1)。

下面的工作代码。

import sys, os, random, re
from docx import *

# Identify all paragraphs with a specific heading style (e.g. 'Heading 2')
def iter_headings(paragraphs, heading):
    """Yield each paragraph whose style name starts with `heading`.

    Args:
        paragraphs: Iterable of paragraph objects exposing `style.name`
            (e.g. `document.paragraphs`).
        heading: Style-name prefix to match, such as 'Heading 2'.

    Yields:
        The matching paragraphs, in their original order.
    """
    for paragraph in paragraphs:
        style_name = paragraph.style.name
        # A style without a name cannot match any prefix; skip it instead of
        # failing on `None.startswith`.
        if style_name is not None and style_name.startswith(heading):
            yield paragraph

def paragraph_replace_text(paragraph, regex, replace_str):  # Credit to scanny on GitHub
    """Return `paragraph` after replacing all matches for `regex` with `replace_str`.

    `regex` is a compiled regular expression prepared with `re.compile(pattern)`
    according to the Python library documentation for the `re` module.

    A match may lie inside a single run or span several runs. The replacement
    string is written into the run in which the match starts, and the rest of
    the match is trimmed from the runs that follow.

    Args:
        paragraph: Object exposing `runs`, a sequence of run objects that each
            have a readable and writable `text` attribute.
        regex: Compiled pattern to search for.
        replace_str: Literal replacement text (group references are not
            expanded).

    Returns:
        The same `paragraph` object, modified in place.
    """
    # Each search resumes just past the previous replacement. Restarting from
    # offset 0 would loop forever whenever `replace_str` itself matches `regex`.
    search_from = 0

    # --- a paragraph may contain more than one match, loop until all are replaced ---
    while True:
        runs = list(paragraph.runs)
        if not runs:
            break

        # Search the concatenated run text so that match offsets always index
        # into `runs`, even if the paragraph holds text outside its runs.
        text = "".join(run.text for run in runs)
        match = regex.search(text, search_from)
        if not match:
            break

        match_start, match_end = match.span()
        start, end = match_start, match_end

        # --- Skip over any leading runs that do not contain the match. The last
        # --- run is the fallback so an empty match at the very end of the text
        # --- is appended there.
        last_index = len(runs) - 1
        for index, run in enumerate(runs):
            run_len = len(run.text)
            if start < run_len or index == last_index:
                break
            start, end = start - run_len, end - run_len

        # --- Match starts somewhere in the current run. Replace match-str prefix
        # --- occurring in this run with entire replacement str.
        run_text = run.text
        run.text = run_text[:start] + replace_str + run_text[end:]
        end -= len(run_text)  # --- note this is run-len before replacement ---

        # --- Remove any suffix of match word that occurs in following runs. Note that
        # --- such a suffix will always begin at the first character of the run. Also
        # --- note a suffix can span one or more entire following runs.
        for run in runs[index + 1:]:
            if end <= 0:
                break
            run_text = run.text
            run.text = run_text[end:]
            end -= len(run_text)

        search_from = match_start + len(replace_str)
        if match_end == match_start:
            # An empty match consumes nothing; step one character forward so
            # the same position is not matched again.
            search_from += 1

    # --- optionally get rid of any "spanned" runs that are now empty. This
    # --- could potentially delete things like inline pictures, so use your judgement.
    # for run in paragraph.runs :
    #     if run.text == "" :
    #         r = run._r
    #         r.getparent().remove( r )
    return paragraph

# NOTE: Replace 'Doc.docx' with your filename
# Open the .docx file
document = Document('Doc.docx')

# Search document for unique placeholder entries (must have a unique heading style).
# These are the placeholder entries created for the draft gamebook.
# NOTE: Replace 'Heading 2' with your entry number header
entryWorking = [
    heading.text for heading in iter_headings(document.paragraphs, 'Heading 2')
]

# Create the list of gamebook entry numbers, one per heading, as strings
# ('1' .. 'N').
entryNumbers = [str(number) for number in range(1, len(entryWorking) + 1)]

# Identify pre-set entries (such as Entry 1), and remove from both lists.
# This avoids pre-set numbers being replaced (i.e. they remain as is in the .docx).
# Pre-set entry numbers must _not_ have the "#" prefix in the .docx.
# New lists are built instead of calling .remove() on the list being iterated,
# because removing during iteration makes the loop skip the following element.
preset_entries = [text for text in entryWorking if not text.startswith('#')]
entryNumbers = [number for number in entryNumbers if number not in preset_entries]
entryWorking = [text for text in entryWorking if text.startswith('#')]

# Shuffle new entry numbers
random.shuffle(entryNumbers)

# Create tuple list of placeholder entries paired with random entry
reference = tuple(zip(entryWorking, entryNumbers))

# Replace placeholder headings with assigned randomized entry
for heading in iter_headings(document.paragraphs, 'Heading 2'):
    for placeholder, number in reference:
        if heading.text == placeholder:
            heading.text = number
            break

# Replace placeholders inside body paragraphs.
# Longer placeholders go first so that one which is a prefix of another
# (e.g. '#2' and '#2-1') cannot be substituted into the longer one.
# Placeholders are escaped so they are matched literally, and each pattern is
# compiled once rather than once per paragraph.
replacements = [
    (placeholder, re.compile(re.escape(placeholder)), number)
    for placeholder, number in sorted(
        reference, key=lambda pair: len(pair[0]), reverse=True
    )
]
for paragraph in document.paragraphs:
    for placeholder, regex, number in replacements:
        if placeholder in paragraph.text:
            paragraph_replace_text(paragraph, regex, number)

# Save the new document with final entries
document.save('Output.docx')

最新更新