Python XML Sax截断不带特殊字符的字符串



我下载了一些KML格式的美国人口普查区域文件。你可以在这里下载文件。我正在尝试获取区域名称和边界坐标。由于某些原因,某些坐标字段被截断,无法正确读取。例如,"Bloomsburg Berwick Sunbury, PA" 的坐标在KML文件中显示为:

<coordinates>-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.008418,40.659912,0.0 -76.996995,40.635778,0.0 -76.965528,40.647149,0.0 -76.944828,40.650209,0.0 -76.939883,40.638142,0.0 -76.949148,40.628167,0.0 -76.918672,40.603466,0.0 -76.886411,40.617758,0.0 -76.864254,40.627585,0.0 -76.840104,40.625439,0.0 -76.810269,40.634526,0.0 -76.810044,40.640102,0.0 -76.804867,40.646839,0.0 -76.793851,40.640514,0.0 -76.745894,40.654464,0.0 -76.701624,40.658082,0.0 -76.700546,40.663114,0.0 -76.662137,40.674013,0.0 -76.562175,40.709007,0.0 -76.469523,40.743188,0.0 -76.380334,40.775445,0.0 -76.30717,40.801809,0.0 -76.2991,40.831191,0.0 -76.284611,40.883588,0.0 -76.207827,40.94974,0.0 -76.231194,41.050168,0.0 -76.228975,41.138466,0.0 -76.277639,41.131804,0.0 -76.317953,41.205453,0.0 -76.319957,41.211255,0.0 -76.310261,41.310198,0.0 -76.407934,41.308418,0.0 -76.447597,41.275629,0.0 -76.592607,41.157765,0.0 -76.640767,41.155718,0.0 -76.678776,41.154172,0.0 -76.732672,41.17204,0.0 -76.790807,41.175732,0.0 -76.828168,41.16578,0.0 -76.880963,41.158044,0.0 -76.884245,41.157099,0.0 -76.885228,41.155973,0.0 -76.888145,41.153807,0.0 -76.889338,41.151988,0.0 -76.889669,41.150791,0.0 -76.896114,41.13907,0.0 -76.960229,41.148801,0.0 -76.977939,41.087883,0.0 -77.058088,41.085575,0.0 -77.113839,41.069032,0.0 -77.144111,41.06884,0.0 -77.14416,41.044338,0.0 -77.204027,40.99271,0.0 -77.279236,40.90971,0.0 -77.36418,40.846937,0.0</coordinates>

但读取到的内容在全部1664个字符中的第297个字符处被截断。其他一些条目也会出现这种情况,而且似乎是随机发生的。长度似乎不是问题所在。

['-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.00841']

我试过两台不同的ec2机器,所以我不认为这是内存/硬件问题。知道发生了什么事吗?

from xml.sax.handler import ContentHandler
from xml.sax import parse
# SAX content handler from the question. It records the text of
# <SimpleData name="NAME"> elements and of <coordinates> elements, and when a
# Placemark closes it maps the current name to the coordinates collected so far.
# NOTE: the indentation of this listing was lost; the code lines are unchanged.
class KMLHandler(ContentHandler):
# Set up empty result containers and the capture flags used by the callbacks.
def __init__(self):
super().__init__()
# Every captured NAME text, in document order.
self.place_names = []
# Name captured for the Placemark currently being read.
self.current_name = None
# One entry (a list of coordinate strings) per closed Placemark.
self.coordinates = []
# Coordinate strings captured for the Placemark currently being read.
self.temp_coordinates = []
self.start_placemark = False
# Flags telling characters() that the next text belongs to a NAME / <coordinates>.
self.capture_place_name = False
self.capture_cordinates = False
# current_name -> temp_coordinates, filled in endElement.
self.mapping_dict = {}
# Called at each opening tag: resets per-Placemark state and arms the capture flags.
def startElement(self, name, attrs):
if name == 'Placemark':
# first_placemark is only assigned here; it is not created in __init__
# and is not read anywhere in this listing.
self.first_placemark = True
self.start_placemark = True
self.temp_coordinates = []
self.current_name = None
else:
pass
if name == "SimpleData":
# Key lookup on attrs: a SimpleData element without a "name" attribute
# would presumably raise KeyError here.
if attrs['name'] == "NAME":
self.capture_place_name = True
if name == "coordinates":
self.capture_cordinates = True
# Called at each closing tag: on </Placemark>, store what was collected.
def endElement(self, name):
if name == "Placemark":
self.start_placemark = False
self.coordinates.append(self.temp_coordinates)
# A later Placemark with the same name overwrites the earlier entry.
self.mapping_dict[self.current_name] = self.temp_coordinates
# Called with a piece of character data; whitespace-only pieces are ignored.
# NOTE(review): each capture flag is cleared after the first non-blank call.
# A SAX parser may deliver one element's text over several characters()
# calls, in which case only the first piece is stored. This appears to be
# the truncation described in the question.
def characters(self, content):
if content.strip() != "":
if self.capture_place_name == True:
self.place_names.append(content)
self.current_name = content
self.capture_place_name = False
if self.capture_cordinates == True:
# str_vals is computed but never used; the raw content string is what gets stored.
str_vals = [x.split(',')[0:2] for x in content.split(' ')]
self.temp_coordinates.append(content)
self.capture_cordinates = False
# Input KML file to parse.
fname='./cb_2020_us_csa_5m.kml'
# Alternative input, presumably a smaller test file:
# fname='./test_small2.kml'
# Parse the file, sending SAX events to a fresh KMLHandler; results stay on `handler`.
handler = KMLHandler()
parse(fname, handler)

正如评论中指出的,每次characters事件只返回一个数据块(chunk),它可能是、也可能不是标签的全部内容。这类似于从网络读取数据:你不一定能一次拿到所有内容。

我在下面重新编写了您的代码,它似乎为Berwick报告了正确的答案。在我的机器上,第一个块是283个字符,第二个块是1353个字符。283+1353=1636,这与文件中的数据的大小相匹配。

我认为先记录标签名称,然后在处理characters时检查它,比使用一组布尔值更简单。这样只有一个控制值,并且它在同一个位置被设置和重置。

我认为不需要临时坐标。我不清楚你是希望坐标是一个列表还是别的形式,所以我只是直接保存了字符串。

from xml.sax import parse
from xml.sax.handler import ContentHandler
class KMLHandler(ContentHandler):
    """SAX handler mapping each Placemark's NAME to its coordinate strings.

    A SAX parser may deliver the text of a single element through several
    ``characters()`` calls, so the text is buffered while an element of
    interest is open and only assembled when that element closes.

    Attributes:
        place_names: every ``<SimpleData name="NAME">`` text, in document order.
        current_name: name captured for the Placemark currently being read.
        coordinates: complete ``<coordinates>`` strings collected for the
            Placemark currently being read (emptied when it closes).
        mapping_dict: ``current_name`` -> list of complete coordinate strings,
            one string per ``<coordinates>`` element of that Placemark.
        capture: tag name whose text is being buffered, or ``''``.
    """

    def __init__(self):
        super().__init__()
        self.place_names = []
        self.current_name = None
        self.coordinates = []
        self.first_placemark = False
        self.start_placemark = False
        # Unused leftover flag, kept so existing readers of it do not break.
        self.capture_place_name = False
        self.mapping_dict = {}
        self.capture = ''
        # Pieces of text received for the element named by self.capture.
        self._chunks = []

    def startElement(self, name, attrs):
        """Reset the text buffer and start capturing for NAME / coordinates."""
        self.capture = ''
        self._chunks = []
        if name == 'Placemark':
            self.first_placemark = True
            self.start_placemark = True
            self.current_name = None
        elif name == "SimpleData":
            # .get() so a SimpleData without a "name" attribute is skipped
            # instead of raising KeyError.
            if attrs.get('name') == "NAME":
                self.capture = name
        elif name == "coordinates":
            self.capture = name

    def endElement(self, name):
        """Assemble buffered text on element close; store results on </Placemark>."""
        if self.capture and name == self.capture:
            # Join every chunk (including whitespace-only ones, which may be
            # separators between coordinate tuples) and trim only the ends.
            text = ''.join(self._chunks).strip()
            if text:
                if name == "SimpleData":
                    self.place_names.append(text)
                    self.current_name = text
                else:
                    self.coordinates.append(text)
                    # The leading number is the character length of the text.
                    print('%d coordinates for %s: {%s}' % (len(text),
                                                           self.current_name,
                                                           self.coordinates))
            self.capture = ''
            self._chunks = []
        if name == "Placemark":
            self.start_placemark = False
            # A later Placemark with the same name overwrites the earlier one.
            self.mapping_dict[self.current_name] = self.coordinates
            self.coordinates = []

    def characters(self, content):
        """Buffer one chunk of text if an element of interest is open."""
        if self.capture:
            self._chunks.append(content)
# Input KML file to parse.
fname='./cb_2020_us_csa_5m.kml'
# Alternative input, presumably a smaller test file:
# fname='./test_small2.kml'
# Parse the file, sending SAX events to a fresh KMLHandler; results stay on `handler`.
handler = KMLHandler()
parse(fname, handler)

最新更新