有没有一种简单的方法,可以利用行索引把数据子集分别存入多个DataFrame,最后再用Python Pandas把它们合并并导出为CSV?



我正在学习pandas,并试图弄清楚如何把结构化格式的DF.txt转换成包含以下各列的DF.csv。下面是我到目前为止写出的代码。

import pandas as pd
import numpy as np
# Load the raw report with no header row.
# NOTE(review): read_csv splits on commas, so the code below assumes each
# line of DF.txt ends up whole in column 0 - confirm against the file.
df = pd.read_csv('DF.txt', header=None)

# Get Row Index for Starting Rows (lines containing 'kbytes').
# na=False keeps the mask strictly boolean when column 0 has missing values;
# a mask containing NaN cannot be used for boolean indexing.
header_list = df[df[0].str.contains('kbytes', na=False)].index.values

# Get Row Index for Ending Rows (lines starting with '#').
end_list = df[df[0].str.contains('^#', na=False)].index.values

# Create List of Lists (Start / End Row).
# Each entry is [start, end], where end is the first '#' row after start.
# A start row with no '#' row after it is stored as [start] only.
idx_list = []
for x in header_list:
    y = end_list[end_list > x]
    if len(y) > 0:
        idx_list.append([x, y[0]])  # We are using the first number in array y e.g. y[0]
    else:
        idx_list.append([x])  # End of file has no y[0]
print(idx_list)

for idxs in idx_list:
    if len(idxs) > 1:
        # df[idx_list[0][0]:idx_list[0][1]] - first block of rows df[2:137]
        # df[idx_list[1][0]:idx_list[1][1]] - second block of rows df[139:274]
        # df[idx_list[2][0]:idx_list[2][1]] - third block of rows df[276:417]
        print(idxs)
        # Need to extract following values
        # Filesystem
        # kbytes
        # used
        # avail
        # capacity
        # Mounted on
        # Date (e.g. 2 rows up and extract dt.strftime("%d/%m/%Y %H:%M:%S"))
        # Type (e.g. 2 rows up and extract the text between parentheses)
        # hostname (e.g. 1 row up and split by :)
        # serialno (e.g. 1 row up and split by :)

使用 df.loc[5:8, :] 可以取出第5行到第8行(.loc 按标签切片,首尾两端都包含在内)。

本质上,我是在寻找一种更简单的方法来实现以下目标:

# Preview the data rows of every block that has both a start and an end row.
for idxs in idx_list:
    if len(idxs) > 1:
        # print(idxs)                     # Print all lists in the list
        # print(df.loc[idxs[0]:idxs[1]])  # Return dataframe with rows from first list
        # print(df[idxs[0]:idxs[1]][0])   # Return an array for column (index 0) instead of dataframe
        # print() is called as a function so the snippet also runs on Python 3.
        print(df[idxs[0]:idxs[1]][0][1:])  # skip first item from array
        # lambda function to iterate each item split by whitespace (default) and return the first item (index 0 i.e. e.g. Filesystem)
        # df.loc[idxs[0]:idxs[1],'Filesystem'] = df[idxs[0]:idxs[1]][0][2:].apply(lambda x: str.split(x)[0])
# Fill in the per-row columns for every block listed in idx_list.
# Each idxs is [start] or [start, end]: start is the first row of a block and
# end is the row that closes it (missing when the block runs to the end).
field_names = ['Filesystem', 'kbytes', 'used', 'avail', 'capacity', 'Mounted on']

for idxs in idx_list:
    print(idxs)
    start = idxs[0]
    # A block without an end row extends to the last row of the frame.
    end = idxs[1] if len(idxs) > 1 else None
    # .loc slices by label and includes `end`, so the start row and the end
    # row are part of the target. They receive NaN for the split fields
    # because the values assigned below carry no entry for those rows.
    target = slice(start, end)

    # Data rows only: skip the row at `start`; iloc excludes `end`.
    # Each line is split on whitespace once and reused for all six fields.
    tokens = df[0].iloc[start + 1:end].str.split()
    for pos, name in enumerate(field_names):
        # .str[pos] gives NaN for a line with too few tokens instead of
        # raising IndexError.
        df.loc[target, name] = tokens.str[pos]

    # The row two above `start`: characters 6..25 are parsed as a timestamp.
    banner = df[0].iloc[start - 2:start - 1]
    df.loc[target, 'date'] = (
        pd.to_datetime(banner.str[6:26]).dt.strftime("%d/%m/%Y %H:%M:%S").values[0]
    )
    # NOTE(review): an unescaped '((.*?))' can only ever capture an empty
    # string. Escaped parentheses are assumed to be the intent, i.e. the type
    # is the text between '(' and ')' on that row - confirm against DF.txt.
    df.loc[target, 'ASUP_Type'] = (
        banner.str.extract(r'\((.*?)\)', expand=True)[0].values[0]
    )

    # The row directly above `start`: whitespace tokens 2 and 4 are stored
    # as hostname and serial number.
    ident = df[0].iloc[start - 1:start].str.split().values[0]
    df.loc[target, 'hostname'] = ident[2]
    df.loc[target, 'serial_no'] = ident[4]

最新更新