如何读取无序的文本数据?

我想逐行读取文本数据到内存中，并将其写入不同格式的文件。数据看起来像这样:

15LI     aLI   15   9.34   5.31   5.53
15LI     aLI   15   9.51   4.55   5.54
15LI     aLI   15   9.45   4.30   5.47
15LI     aLI   15  10.29   3.77   5.91
15LI     aLI   15 -97.89 -21.55   5.47
15LI     aLI   15 -97.85 -21.69   5.88
15LI     aLI   15 -96.61 -21.03   5.24
15LI     aLI   15-103.25  -9.02   5.24
15LI     aLI   15-102.55  -9.73   5.07
15LI     aLI   15-102.54  -9.70   5.64
15LI     aLI   15-102.40  -9.68   5.54

可以看到，随着数字位数的增加，第3列和第4列之间的空格消失了。我使用 numpy.genfromtxt 来读取数据，但它无法读取第六行之后的数据，并抛出以下错误:

ValueError: Some errors were detected !
Line #7 (got 5 columns instead of 6)
Line #8 (got 5 columns instead of 6)
Line #9 (got 5 columns instead of 6)
Line #10 (got 5 columns instead of 6)

在 Python 中，有什么方法可以在列之间插入空格，或者在没有空格的情况下读取这些列吗？下面是我的一小段代码:

import h5py
import numpy as np
# Structured dtype for one row of the table (hard-coded for now):
# two 4-byte string labels, one integer column and three float columns.
gro_dt = np.dtype([('col1', 'S4'), ('col2', 'S4'), ('col3', int),
                   ('col4', float), ('col5', float), ('col6', float)])

# Create the .h5 file and a resizable 1-D dataset with that dtype.
with h5py.File('pep.h5', 'w') as hdf:
    ds = hdf.create_dataset('dataset1', dtype=gro_dt, shape=(20,), maxshape=(None,))

    # Store line 1 of the input file as a dataset attribute. Only that line is
    # needed, so read just it; `with` closes the file even if an error occurs.
    with open('testing.dat', 'r') as f:
        ds.attrs["Source"] = f.readline()

    # Read the file in chunks of `incr` rows until genfromtxt returns no rows.
    # NOTE(review): skip starts at 0, so line 1 (stored above as "Source") is
    # also parsed as a data row - confirm whether skip should start at 1.
    skip, incr, row0 = 0, 20, 0
    read_gro = True
    while read_gro:
        # genfromtxt squeezes a chunk holding a single row down to a 0-d array,
        # which has no shape[0]; atleast_1d keeps the row count well defined.
        arr = np.atleast_1d(
            np.genfromtxt('testing.dat', skip_header=skip, max_rows=incr, dtype=gro_dt)
        )
        rows = arr.shape[0]
        if rows == 0:
            read_gro = False
        else:
            # Grow the dataset when this chunk does not fit in the current shape.
            if row0 + rows > ds.shape[0]:
                ds.resize((row0 + rows,))
            ds[row0:row0 + rows] = arr
            skip += rows
            row0 += rows

您可以手动解析数据，然后将其转储为您需要的任何格式。这个答案基于这样一个假设:"前3列保持不变"。

import pandas as pd
def _split_glued(token):
    """Split a token wherever a minus sign directly follows a digit.

    '15-103.25' becomes ['15', '-103.25']. A leading sign ('-9.02') and an
    exponent sign ('1e-05') are left alone because no digit precedes them.
    """
    pieces = []
    start = 0
    for pos, (prev, cur) in enumerate(zip(token, token[1:]), start=1):
        if cur == "-" and prev.isdigit():
            pieces.append(token[start:pos])
            start = pos
    pieces.append(token[start:])
    return pieces


def gen_lines(filename, wanted=6):
    """Yield the whitespace-separated fields of each non-blank line as a list.

    Lines that already contain at least ``wanted`` fields are yielded as-is.
    Shorter lines are assumed to contain negative numbers glued to the
    previous column (e.g. ``15-103.25``); every token after the first two is
    split in front of such a minus sign. The first two tokens are never
    split, matching the assumption that the leading label columns are intact.

    Args:
        filename: Path of the text file to read.
        wanted: Number of fields a well-formed line has.

    Yields:
        A list of field strings for each non-blank line.
    """
    with open(filename, "r") as fin:
        for line in fin:
            parts = line.split()
            if not parts:
                # Blank lines carry no record; skipping them avoids an
                # IndexError when looking at the third token.
                continue
            if len(parts) >= wanted:
                yield parts
                continue
            repaired = parts[:2]
            for token in parts[2:]:
                repaired.extend(_split_glued(token))
            yield repaired
# gen_lines is lazy: the file is only read once pandas consumes the generator.
lines = gen_lines("foo.txt")
# Each yielded list of fields becomes one row of the frame.
df = pd.DataFrame(data=lines)
print(df)
0    1   2        3       4     5
0   15LI  aLI  15     9.34    5.31  5.53
1   15LI  aLI  15     9.51    4.55  5.54
2   15LI  aLI  15     9.45    4.30  5.47
3   15LI  aLI  15    10.29    3.77  5.91
4   15LI  aLI  15   -97.89  -21.55  5.47
5   15LI  aLI  15   -97.85  -21.69  5.88
6   15LI  aLI  15   -96.61  -21.03  5.24
7   15LI  aLI  15  -103.25   -9.02  5.24
8   15LI  aLI  15  -102.55   -9.73  5.07
9   15LI  aLI  15  -102.54   -9.70  5.64
10  15LI  aLI  15  -102.40   -9.68  5.54

尝试使用 pd.read_fwf（用于读取固定宽度格式的文件）。请根据您的文件调整各列的宽度(widths)。

import pandas as pd
# Character width of each of the six columns (41 characters per record in
# total); adjust these to match the layout of your own file.
column_widths = [7, 8, 5, 7, 7, 7]
df = pd.read_fwf("test.txt", widths=column_widths)
>>> df
0    1   2       3      4     5
0   15LI  aLI  15    9.34   5.31  5.53
1   15LI  aLI  15    9.51   4.55  5.54
2   15LI  aLI  15    9.45   4.30  5.47
3   15LI  aLI  15   10.29   3.77  5.91
4   15LI  aLI  15  -97.89 -21.55  5.47
5   15LI  aLI  15  -97.85 -21.69  5.88
6   15LI  aLI  15  -96.61 -21.03  5.24
7   15LI  aLI  15 -103.25  -9.02  5.24
8   15LI  aLI  15 -102.55  -9.73  5.07
9   15LI  aLI  15 -102.54  -9.70  5.64
10  15LI  aLI  15 -102.40  -9.68  5.54
# The same fixed-width read, exposed as a NumPy array
fwf_frame = pd.read_fwf("test.txt", widths=[7, 8, 5, 7, 7, 7])
array = fwf_frame.values
>>> array
array([['15LI', 'aLI', 15, 9.34, 5.31, 5.53],
['15LI', 'aLI', 15, 9.51, 4.55, 5.54],
['15LI', 'aLI', 15, 9.45, 4.3, 5.47],
['15LI', 'aLI', 15, 10.29, 3.77, 5.91],
['15LI', 'aLI', 15, -97.89, -21.55, 5.47],
['15LI', 'aLI', 15, -97.85, -21.69, 5.88],
['15LI', 'aLI', 15, -96.61, -21.03, 5.24],
['15LI', 'aLI', 15, -103.25, -9.02, 5.24],
['15LI', 'aLI', 15, -102.55, -9.73, 5.07],
['15LI', 'aLI', 15, -102.54, -9.7, 5.64],
['15LI', 'aLI', 15, -102.4, -9.68, 5.54]], dtype=object)

相关内容

最新更新

热门标签：