如何在 pyparsing 的 nestedExpr 中保留空格



>我有这样的维基文本

# Sample wiki text: macros are delimited by {{ and }}, may nest, may span
# several lines, and may contain stray single braces.
data = """
{{hello}}

{{hello world}}
{{hello much { }}
{{a {{b}}}}

{{a

td {

}
{{inner}}
}}
"""

我想提取其中的宏。宏是括在 {{ 和 }} 之间的文本。

所以我尝试使用 nestedExpr

from pyparsing import *
import pprint
def getMacroCandidates(txt):
    """Find ``{{ ... }}`` macros in *txt*, print each match and collect them.

    Args:
        txt: Text to scan for macros.

    Returns:
        A list with one entry per match found by ``scanString``; each entry
        is the nested-list form (``asList()``) of that match's tokens.
    """
    candidates = []

    def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()):
        """Build a recursive expression for text nested between opener/closer.

        The opener and closer are kept as tokens inside each group.

        Raises:
            ValueError: if *opener* and *closer* are the same string.
        """
        if opener == closer:
            raise ValueError("opening and closing strings cannot be the same")
        if content is None:
            if isinstance(opener, str) and isinstance(closer, str):
                if ignoreExpr is not None:
                    # Default content: a run of single non-whitespace
                    # characters that does not start a quoted string, an
                    # opener or a closer, combined into one token.
                    content = (Combine(OneOrMore(~ignoreExpr +
                                                 ~Literal(opener) + ~Literal(closer) +
                                                 CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1))
                                       ).setParseAction(lambda t: t[0]))
        # Forward lets the expression refer to itself for nested macros.
        ret = Forward()
        ret <<= Group(opener + ZeroOrMore(ignoreExpr | ret | content) + closer)

        ret.setName('nested %s%s expression' % (opener, closer))
        return ret

    # Macros are delimited by double braces.
    macro = nestedExpr("{{", "}}")
    # Scan the text that was passed in, not the module-level ``data``.
    for toks, _start, _end in macro.scanString(txt):
        print(toks)
        candidates.append(toks.asList())
    return candidates
# Sample wiki text: single-line, nested and multi-line {{...}} macros,
# some containing stray single braces.
data = """
{{hello}}
{{hello world}}
{{hello much { }}
{{a {{b}}}}
{{a
td {

}
{{inner}}
}}
"""
# Run the scan on the sample; the tokens of each match are printed.
getMacroCandidates(data)

这样得到的是去掉了空格的标记(token):

[['{{', 'hello', '}}']]
[['{{', 'hello', 'world', '}}']]
[['{{', 'hello', 'much', '{', '}}']]
[['{{', 'a', ['{{', 'b', '}}'], '}}']]
[['{{', 'a', 'td', '{', '}', ['{{', 'inner', '}}'], '}}']]

你可以使用字符串的 replace 方法:

# Sample wiki text: macros are delimited by {{ and }}, may nest and may
# span several lines.
data = """
{{hello}}
{{hello world}}
{{hello much { }}
{{a {{b}}}}
{{a
td {
}
{{inner}}
}}
"""
import shlex

# Turn every {{ and }} into a double quote so that each macro becomes a
# quoted string that shlex can tokenize.
data1 = data.replace("{{", '"')
data2 = data1.replace("}}", '"')
# Blank out the remaining single braces.
data3 = data2.replace("}", " ")
data4 = data3.replace("{", " ")
# Collapse every run of whitespace, newlines included, into one space.
data5 = ' '.join(data4.split())
# No newline replacement is needed here: the split/join above already
# removed them. (Replacing the letter "n" would corrupt words such as
# "inner".)
tokens = shlex.split(data5)
print(tokens)

输出

这将返回去掉大括号后的所有标记,并删除多余的空格和换行。

['hello', 'hello world', 'hello much ', 'a b', 'a td inner ']

PS:这些步骤可以合并成一个表达式,这里拆成多个表达式只是为了可读性。

最新更新