Python解析复杂文本



我正在尝试编写一个算法来编辑下面的XML文件片段,有人能帮我出出主意吗?需求是:把该文件作为输入进行解析,从"ciphers"(密码套件)列表中删除使用"RC4"的条目,并输出一个新的XML文件,其中只去掉了"RC4"密码套件。问题在于XML文件中有多个"Connector"部分。我需要读取所有这些部分,但只编辑使用端口443并具有特定IP地址的部分。因此,脚本需要一次解析一个Connector部分,并跳过那些IP地址和端口不符合条件的部分。我的尝试:1. 使用ElementTree XML解析器。问题是它输出的新XML文件格式很差——一团糟,我需要在Python 2.6下把输出美化一下。

<Connector
protocol="org.apache.coyote.http11.Http11NioProtocol"
port="443"
redirectPort="443"
executor="tomcatThreadPool"
disableUploadTimeout="true"
SSLEnabled="true"
scheme="https"
secure="true"
clientAuth="false"
sslEnabledProtocols="TLSv1,TLSv1.1,TLSv1.2"
keystoreType="JKS"
keystoreFile="tomcat.keystore"
keystorePass="XXXXX"
server="XXXX"
ciphers="TLS_DHE_RSA_WITH_AES_128_CBC_SHA,
         TLS_DH_RSA_WITH_AES_128_CBC_SHA,
         TLS_DHE_DSS_WITH_AES_128_CBC_SHA,
         TLS_DH_DSS_WITH_AES_128_CBC_SHA,
         TLS_RSA_WITH_AES_128_CBC_SHA,
         TLS_DHE_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_DH_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_RSA_WITH_RC4_128_SHA"
address="192.168.10.6">

下面是我的代码:
from xml.etree import ElementTree
# Strip every RC4 cipher suite from the HTTPS (port 443) <Connector> elements
# of template.xml whose address contains the EMSNodeMgmtIp placeholder, then
# save the result as new.xml.  Single-argument print(...) calls are used so
# the script parses under both Python 2 and Python 3.
print("[+] Checking for removal of RC4 ciphers")
template_path = "template.xml"
# Binary mode lets the XML parser decode the document according to its own
# encoding declaration; the with-block closes the file, no extra close().
with open(template_path, 'rb') as template:
    tree = ElementTree.parse(template)

# iter() is the current name; getiterator() is the only one available in the
# ElementTree shipped with Python 2.6 and is gone from recent Python 3.
iter_elements = getattr(tree, "iter", None) or tree.getiterator

for node in iter_elements('Connector'):
    # Default to '' so a Connector without an address attribute is skipped
    # instead of raising TypeError on the "in" test.
    address = node.get('address', '')
    port = node.get('port')
    if "EMSNodeMgmtIp" not in address or port != "443":
        continue

    ciphers = node.get('ciphers', '')
    if "RC4" not in ciphers:
        # Nothing to remove: leave the attribute exactly as it was.
        continue

    # RC4 is enabled somewhere in the cipher suite list.
    print("[+] Found RC4 enabled ciphers")
    suites = ciphers.split(',')
    for suite in suites:
        if "RC4" in suite:
            print("[+] Search removal RC4 string: %s" % suite.strip())

    # Rebuild the list without the RC4 entries.  Filtering the comma-split
    # pieces removes every RC4 suite wherever it sits in the list and never
    # leaves a dangling comma; the whitespace of the kept entries is
    # preserved, only the ends of the final value are trimmed.
    print("[+] Removing RC4 cipher")
    remaining = [suite for suite in suites if "RC4" not in suite]
    node.set('ciphers', ','.join(remaining).strip())

tree.write('new.xml')

我添加了注释来解释正在发生的事情。inb4downvote

from lxml import etree
import re
# Sample document: one <Connector> wrapped in a <data> root so the snippet
# is self-contained.
xml = '''<?xml version="1.0"?>
<data>
<Connector
protocol="org.apache.coyote.http11.Http11NioProtocol"
port="443"
redirectPort="443"
executor="tomcatThreadPool"
disableUploadTimeout="true"
SSLEnabled="true"
scheme="https"
secure="true"
clientAuth="false"
sslEnabledProtocols="TLSv1,TLSv1.1,TLSv1.2"
keystoreType="JKS"
keystoreFile="tomcat.keystore"
keystorePass="XXXXX"
server="XXXX"
ciphers="TLS_DHE_RSA_WITH_AES_128_CBC_SHA,
         TLS_DH_RSA_WITH_AES_128_CBC_SHA,
         TLS_DHE_DSS_WITH_AES_128_CBC_SHA,
         TLS_DH_DSS_WITH_AES_128_CBC_SHA,
         TLS_RSA_WITH_AES_128_CBC_SHA,
         TLS_DHE_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_DH_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_RSA_WITH_3DES_EDE_CBC_SHA,
         TLS_RSA_WITH_RC4_128_SHA"
address="192.168.10.6"></Connector></data>'''
# etree.fromstring() already returns the root element, so no
# getroottree().getroot() round-trip is needed.
root = etree.fromstring(xml)
for connector in root.findall('Connector'):
    port = connector.get('port')
    ip = connector.get('address')
    # Set this to the port/ip of the connector you want to edit; connectors
    # that do NOT match are removed from the output document.
    if port != '443' or ip != '192.168.10.6':
        connector.getparent().remove(connector)
        continue
    raw_ciphers = connector.get('ciphers')
    if raw_ciphers is None:
        # No ciphers attribute on this connector: nothing to filter.
        continue
    # Split on a comma plus any following whitespace (the attribute spans
    # several lines) and keep only the suites that do not mention RC4.
    ciphers = ','.join([x for x in re.split(r',\s*', raw_ciphers) if 'RC4' not in x])
    # Store the filtered list back on the element.
    connector.set('ciphers', ciphers)
print(etree.tostring(root, pretty_print=True))

如果XML工具不能保留原始的结构和格式,那就不要用它们。这其实是一个简单的文本处理问题,您可以写一个Python程序来处理。

逐行遍历文件;除"cipher"语句以外的所有内容都原样回显到输出中。当遇到"cipher"语句时:

  1. 把字符串塞进一个变量。
  2. 将字符串拆分为列表
  3. 删除任何包含"RC4"的列表元素。
  4. 将生成的"cipher"语句以所需的格式打印出来。
  5. 返回正常的"读取并回显"处理。

这个算法能让你继续吗?

答案如下。基本上必须将每个连接器部分(共有4个)读取到临时列表中,以检查端口和地址是否正确。如果是,则仅在启用RC4密码的情况下,通过删除密码字符串对Cipher进行更改。因此,代码必须将所有4个连接器一次一个地读入临时列表。

def _drop_rc4_lines(attr_lines):
    """Return a copy of attr_lines without the lines that mention RC4.

    attr_lines is the list of raw text lines collected between a
    '<Connector' line and its '</Connector>' line.  When a removed line ends
    with a double quote (it closed the quoted cipher list), the closing quote
    is moved onto the last kept line if that line ends with a comma.

    NOTE: a matching line is dropped as a whole, so a line that carries the
    attribute's opening 'ciphers="' together with an RC4 suite loses that
    prefix as well.
    """
    kept = []
    for index, text in enumerate(attr_lines):
        if "RC4" not in text:
            kept.append(text)
            continue
        print("[+] Found RC4 cipher suite string at line index %d:  %s" % (index, text))
        print("[+] Removing RC4 cipher string from available cipher suites")
        # Only re-close the quoted value when there is a previous kept line
        # that still ends with the list separator.
        if text.rstrip().endswith('"') and kept:
            previous = kept[-1].rstrip()
            if previous.endswith(','):
                kept[-1] = previous[:-1] + '"\n'
    return kept


# Copy template.xml to new.xml line by line.  Lines between '<Connector' and
# '</Connector>' are buffered; if the buffered lines contain port="443" and
# the EMSNodeMgmtIp address placeholder, the RC4 cipher lines are dropped
# before the buffer is written out.
with open('template.xml', 'r') as template:
    lines = template.readlines()

with open('new.xml', 'w') as new_file:
    tmp_list = []
    connector = False
    for line in lines:
        if '<Connector' in line:
            # Flush lines still buffered from a connector that never reached
            # a '</Connector>' line, so they are not merged into this one.
            new_file.writelines(tmp_list)
            tmp_list = []
            connector = True
            new_file.write(line)
        elif '</Connector>' in line:
            connector = False
            # Independent scans, so port and address may share a line.
            has_port = any('port="443"' in a for a in tmp_list)
            has_address = any('address="%(EMSNodeMgmtIp)s"' in a for a in tmp_list)
            if has_port and has_address:
                tmp_list = _drop_rc4_lines(tmp_list)
            new_file.writelines(tmp_list)
            # Write the original closing line so its indentation and any
            # other content on it are preserved.
            new_file.write(line)
            tmp_list = []
        elif connector:
            tmp_list.append(line)
        else:
            new_file.write(line)
    # Do not lose buffered lines if the file ends inside a connector.
    new_file.writelines(tmp_list)

最新更新