我该如何创建



我有一个文本文件,需要逐行读取,并转换为具有以下4列的数据帧

import re
import pandas as pd
with open('/Users/Desktop/Final Semester Fall 2022/archive/combined_data_1.txt',encoding='latin-1') as f:  
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})/gm", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
customerid = result.group(1)
rating = result.group(2)
date = result.group(3) 
else:
continue 
data_list = [customerid, rating, date, movie_id]    
df1 = pd.DataFrame(data_list)
df1.to_csv(r'/Users/Desktop/Final Semester Fall 2022/archive/combineddata1.csv')


我得到以下错误:

如何修复此错误???

提前感谢!!

这里有一种方法可以实现


# read the csv file using read_csv, using ":" as a separator
# since there is only one colon ":" per movie, you end up with a row for movie following by rows for the rest of the data.
df=pd.read_csv(r'c:csv.csv', sep=':', header=None, names=['col1', 'col2'])
# when there is no comma in a row, means its only a movie id, 
# so we populate the movieid column and downfill for all rows
df['MovieId'] = df['col1'].mask(df['col1'].str.contains(',')).ffill()
# split the data into CusotmerId, rating and date
df[['CustomerID','Rating','Date']] = df['col1'].str.split(',',expand=True)
# drop the unwanted columns and rows
df2=df[df['col1'].ne(df['MovieId'])].drop(columns=['col1','col2'])
df2
# sample created from the data you shared above as image
MovieId     CustomerID  Rating  Date
1         1     1488844       3     2005-09-06
2         1     822109        5     2005-05-13
3         1     885013        4     2005-10-19
4         1     30878         4     2005-12-26
5         1     823519        3     2004-05-03
6         1     893988        3     2005-11-17
7         1     124105        4     2004-08-05
8         1     1248629       3     2004-04-22
9         1     1842128       4     2004-05-09
10        1     2238063       3     2005-05-11
11        1     1503895       4     2005-05-19
13        2     1288844       3     2005-09-06
14        2     832109        5     2005-05-13

您可以很容易地解析该结构(无需正则表达式,使用几行可读性很强的普通Python(,并在读取数据文件时构建字典。然后,您可以一次性将字典转换为DataFrame。

import pandas as pd
df = {'MovieID':[], 'CustomerID':[],  'Rating':[],  'Date':[]}
with open('data.txt', 'r') as f:
for line in f:
line = line.strip()
if line: #skip empty lines
if line.endswith(':'): #MovieID
movie_id = line[:-1]
else:
customer_id, rating, date = line.split(',')
df['MovieID'].append(movie_id)
df['CustomerID'].append(customer_id)
df['Rating'].append(rating)
df['Date'].append(date)

df = pd.DataFrame(df)
print(df)
MovieID CustomerID Rating        Date
0        1    1488844      3  2005-09-06
1        1     822109      5  2005-05-13
2        1     885013      4  2005-10-19
3        1      30878      4  2005-12-26
4        2     823519      3  2004-05-03
5        2     893988      3  2005-11-17
6        2     124105      4  2004-08-05
7        2    1248629      3  2004-04-22
8        2    1842128      4  2004-05-09
9        3    2238063      3  2005-05-11
10       3    1503895      4  2005-05-19
11       3    1288844      3  2005-09-06
12       3     832109      5  2005-05-13

没有什么比这更容易的了。

正则表达式中的错误

因为在用于标识result的正则表达式中有/gm,所以得到了NameError

我想/gm是被错误地复制到这里的。在其他语言中,这可能是GLOBAL和MULTILINE匹配修饰符,顺便说一句,在这种情况下不需要它们。但在pythonre模块中,它们只有三个字符。由于没有包含/gm的行,所以result始终是None,因此elif result: ...块从未执行,变量customeridratingdate也未初始化。

使用变量时出错

如果从第一个匹配中删除/gm,则会出现另一个问题:变量customerid, rating, date, movie_id只是字符串,因此生成的数据帧将仅反映源文件的最后一条记录。

为了避免这种情况,我们必须使用类似列表的结构来处理它们。例如,在下面的代码中,它们是data字典中的键,每个键都指向一个单独的列表:

file_name = ...
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}    
with open(file_name, encoding='latin-1') as f:
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
data['movie_id'].append(movie_id)
data['customerid'].append(result.group(1))
data['rating'].append(result.group(2))
data['date'].append(result.group(3))
else:
continue 
df = pd.DataFrame(data)

带有测试数据的代码

import re
import pandas as pd
data = '''
1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
2:
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248629,3,2004-04-22
1842128,4,2004-05-09
3:
2238063,3,2005-05-11
1503895,4,2005-05-19
1288844,3,2005-09-06
832109,5,2005-05-13
'''
file_name = "data.txt"
with open(file_name, 'tw', encoding='latin-1') as f:
f.write(data)
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}    
with open(file_name, encoding='latin-1') as f:
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
data['movie_id'].append(movie_id)
data['customerid'].append(result.group(1))
data['rating'].append(result.group(2))
data['date'].append(result.group(3))
else:
continue 
df = pd.DataFrame(data)
df.to_csv(file_name[:-3] + 'csv', index=False)

另一种选择

df = pd.read_csv(file_name, names = ['customerid', 'rating', 'date'])
df.insert(0, 'movie_id', pd.NA)
isnot_movie_id = ~df['customerid'].str.endswith(':')
df['movie_id'] = df['customerid'].mask(isnot_movie_id).ffill().str[:-1]
df = df.dropna().reset_index(drop=True)

相关内容

  • 没有找到相关文章

最新更新