从复杂文件中提取数据



>我有几个数据文件的格式如下:

Vectors                                                           #line 1
1.0000000000                                                      #line 2
12.6482149405358335   -0.0349648302867427   -0.0028442567806109  #line 3
-6.3543320038358670   10.3969225056801626    0.0194632704565655  #line 4
-0.0024866905306129    0.0181600055076972   10.8458884055842208  #line 5
Points   11                                                       #line 6
0.4305655209782699  0.8426201879889135  0.9003812234191031      #line 7
0.6803904862706389  0.8422628581334592  0.8996231506556995      #line 8
0.9310883880788197  0.8426914063929090  0.8988948816913620      #line 9
0.1484226061614464  0.0918229591830411  0.5985391478206523      #line 10
0.0871426252853240  0.4056041945139057  0.1025537266432366      #line 11
0.6516284151939402  0.0944378154637135  0.6031197588423964      #line 12
0.8977067858298895  0.0915250471034454  0.5994045156799004      #line 13
0.1742665415904057  0.3420150165219624  0.6031464526385122      #line 14
0.4219486318660017  0.3397275847827785  0.5972818153032335      #line 15
0.6818585097829839  0.3402603219764870  0.5960939583330003      #line 16
0.6605938016347537  0.8211262131757806  0.2511635759660038      #line 17

第 3 行到第 5 行包含向量的坐标,第 7 行到文件末尾包含点的坐标。我需要把数据文件中每个向量和每个点的 x、y、z 坐标分别导入到两个对应的矩阵/列表(向量和点)中,以便后续计算。你能给我一个解决方案吗?这对我来说很复杂!非常感谢!

import subprocess
import os
import sys
import math
import re
def _split_columns(rows):
    """Turn x/y/z text rows into three parallel lists of floats.

    Raises:
        ValueError: If a row has fewer than three fields or a field is not
            a valid float.
    """
    xs, ys, zs = [], [], []
    for row in rows:
        # Only the first three fields are coordinates; anything after them
        # (for example a trailing annotation) is ignored.
        x, y, z = row.split()[:3]
        xs.append(float(x))
        ys.append(float(y))
        zs.append(float(z))
    return xs, ys, zs


def parse_data_lines(dataline):
    """Extract vector and point coordinates from the lines of a data file.

    The expected layout is a line containing ``Vectors``, one line that is
    skipped, three vector rows, a ``Points <count>`` line and then ``count``
    point rows.  Every row holds whitespace-separated x, y and z values.

    Args:
        dataline: Sequence of text lines, e.g. the result of ``readlines()``.

    Returns:
        A pair ``(vectors, points)``.  Each element is a tuple of three
        parallel lists ``(xs, ys, zs)`` of floats; a section that is missing
        from the input yields three empty lists.

    Raises:
        ValueError: If a coordinate or the point count is not numeric, a row
            has fewer than three fields, or fewer point rows are present
            than the ``Points`` header announces.
        IndexError: If the ``Points`` header carries no count.
    """
    vector_rows = []
    point_rows = []
    for index, line in enumerate(dataline):
        if "Vectors" in line:
            # Rows are located relative to the header: skip the header and
            # the single line after it, then take the three vector rows.
            vector_rows = dataline[index + 2:index + 5]
        elif "Points" in line:
            # The point count comes from the header instead of being
            # hard-coded, and the rows start right after the header.
            count = int(line.split()[1])
            point_rows = dataline[index + 1:index + 1 + count]
            if len(point_rows) != count:
                raise ValueError(
                    "expected {} point rows, found {}".format(
                        count, len(point_rows)))
    return _split_columns(vector_rows), _split_columns(point_rows)


if __name__ == "__main__":
    with open('data.txt') as f:
        dataline = f.readlines()
    vectors, points = parse_data_lines(dataline)
    vector_x, vector_y, vector_z = vectors
    point_x, point_y, point_z = points
    for x, y, z in zip(vector_x, vector_y, vector_z):
        print(x, y, z)
    for x, y, z in zip(point_x, point_y, point_z):
        print(x, y, z)

两个列表,一个用于向量,一个用于点。这就是你想要的吗?

def _as_record(row):
    """Convert one text row into an ``{'x', 'y', 'z'}`` dict of strings.

    Rows with fewer than three whitespace-separated fields cannot be mapped
    to x/y/z, so they are returned as the plain list of fields instead.
    Values are kept as strings; no numeric conversion is performed.
    """
    fields = row.split()
    # An explicit length check replaces the former bare ``except:``, which
    # would also have hidden unrelated errors.
    if len(fields) < 3:
        return fields
    return {'x': fields[0], 'y': fields[1], 'z': fields[2]}


with open('data.txt', 'r') as f:
    s = f.readlines()

# File rows 2-5 go into ``vectors`` and everything from row 7 onwards into
# ``points``; rows 1 and 6 (the section headers in the sample layout) are
# skipped by the slices.
vectors = [_as_record(row) for row in s[1:5]]
points = [_as_record(row) for row in s[6:]]
print(vectors)
print(points)

结果

Vectors
['1.0000000000']
{'x': '12.6482149405358335', 'y': '-0.0349648302867427', 'z': '-0.0028442567806109'}
{'x': '-6.3543320038358670', 'y': '10.3969225056801626', 'z': '0.0194632704565655'}
{'x': '-0.0024866905306129', 'y': '0.0181600055076972', 'z': '10.8458884055842208'}
Points
{'x': '0.4305655209782699', 'y': '0.8426201879889135', 'z': '0.9003812234191031'}
{'x': '0.6803904862706389', 'y': '0.8422628581334592', 'z': '0.8996231506556995'}
{'x': '0.9310883880788197', 'y': '0.8426914063929090', 'z': '0.8988948816913620'}
{'x': '0.1484226061614464', 'y': '0.0918229591830411', 'z': '0.5985391478206523'}
{'x': '0.0871426252853240', 'y': '0.4056041945139057', 'z': '0.1025537266432366'}
{'x': '0.6516284151939402', 'y': '0.0944378154637135', 'z': '0.6031197588423964'}
{'x': '0.8977067858298895', 'y': '0.0915250471034454', 'z': '0.5994045156799004'}
{'x': '0.1742665415904057', 'y': '0.3420150165219624', 'z': '0.6031464526385122'}
{'x': '0.4219486318660017', 'y': '0.3397275847827785', 'z': '0.5972818153032335'}
{'x': '0.6818585097829839', 'y': '0.3402603219764870', 'z': '0.5960939583330003'}
{'x': '0.6605938016347537', 'y': '0.8211262131757806', 'z': '0.2511635759660038'}
  1. 不要使用正则表达式进行简单的子字符串检查,请改用 `substring in string`。
  2. 不要使用 `.readlines()`——它会把所有数据一次性读入内存,对于较大的文件开销会很大。
  3. 如果我是你,我会将向量和点存储为元组列表,而不是 3 个单独的列表。

假设向量部分总是排在前面,我们只需要一个标志来判断是否已经进入点的部分。

if __name__ == "__main__":
    p = None  # point count announced by the two-field header line
    vectors = []
    points = []
    points_flag = False
    with open('data.txt') as f:
        for line in f:
            parts = line.split()
            n_fields = len(parts)
            if n_fields == 2:
                # Lines are classified by field count alone: a two-field
                # line is treated as the points header.
                p = int(parts[1])
                points_flag = True
            elif n_fields == 3:
                # Three-field rows go to whichever section is active.
                bucket = points if points_flag else vectors
                bucket.append(tuple(map(float, parts)))
    # Show what was collected.
    print(vectors)
    print(p)
    print(points)
class Base:
    """Common functionality for points and vectors: an x/y/z triple.

    The three values are stored in slots and exposed through getter-only
    properties, so ``obj.x = value`` raises ``AttributeError``.

    Args:
        x: First coordinate.
        y: Second coordinate.
        z: Third coordinate.
    """
    __slots__ = '_x', '_y', '_z'

    def __init__(self, x, y, z):
        self._x = x
        self._y = y
        self._z = z

    def __repr__(self):
        # Use the runtime class name so subclasses are labelled correctly
        # when a list of instances is printed.
        return '{}({!r}, {!r}, {!r})'.format(
            type(self).__name__, self._x, self._y, self._z)

    @property
    def x(self):
        """The first coordinate."""
        return self._x

    @property
    def y(self):
        """The second coordinate."""
        return self._y

    @property
    def z(self):
        """The third coordinate."""
        return self._z

class Point(Base):
    """A point; empty for now, the common functionality is enough."""
    # Without an empty __slots__ every instance would get a __dict__,
    # undoing the memory saving of the slots declared on the base class.
    __slots__ = ()

class Vector(Base):
    """A vector; empty for now, the common functionality is enough."""
    # Without an empty __slots__ every instance would get a __dict__,
    # undoing the memory saving of the slots declared on the base class.
    __slots__ = ()

# Parse 'data.txt' into Vector and Point objects.  A line containing
# "Vectors" or "Points" switches the active section; a one-field line is
# stored as ``parameter``; three-field lines become objects of the active
# section's class.
vectors = []
parameter = 0.0  # value of the most recent one-field line
n_points = 0  # count announced on the "Points" header line
points = []
with open('data.txt', 'r') as f:
    klass = None  # class of the section currently being read
    for line in f:
        if "Vectors" in line:
            klass = Vector
            continue
        if "Points" in line:
            klass = Point
            n_points = int(line.split()[1])
            continue
        parts = line.split()
        if not parts:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # data and are skipped rather than treated as malformed.
            continue
        if len(parts) == 1:
            parameter = float(parts[0])
        elif len(parts) == 3:
            coords = [float(part) for part in parts]
            if klass is Vector:
                vectors.append(Vector(*coords))
            elif klass is Point:
                points.append(Point(*coords))
            else:
                # A coordinate row appeared before any section header.
                raise ValueError(
                    "coordinate row found before a Vectors/Points header")
        else:
            # Any other field count (2, or 4 and more) is not part of the
            # expected layout.
            raise ValueError(
                "unexpected number of fields: {!r}".format(line))
# The header's announced count must match what was actually read.
if n_points != len(points):
    raise ValueError("Unexpected number of points")

最新更新