我有一个文本文件,需要逐行读取,并转换为具有以下4列的数据帧
import re
import pandas as pd
with open('/Users/Desktop/Final Semester Fall 2022/archive/combined_data_1.txt',encoding='latin-1') as f:
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})/gm", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
customerid = result.group(1)
rating = result.group(2)
date = result.group(3)
else:
continue
data_list = [customerid, rating, date, movie_id]
df1 = pd.DataFrame(data_list)
df1.to_csv(r'/Users/Desktop/Final Semester Fall 2022/archive/combineddata1.csv')
我得到以下错误:
如何修复此错误???
提前感谢!!
这里有一种方法可以实现
# read the csv file using read_csv, using ":" as a separator
# since there is only one colon ":" per movie, you end up with a row for movie following by rows for the rest of the data.
df=pd.read_csv(r'c:csv.csv', sep=':', header=None, names=['col1', 'col2'])
# when there is no comma in a row, means its only a movie id,
# so we populate the movieid column and downfill for all rows
df['MovieId'] = df['col1'].mask(df['col1'].str.contains(',')).ffill()
# split the data into CusotmerId, rating and date
df[['CustomerID','Rating','Date']] = df['col1'].str.split(',',expand=True)
# drop the unwanted columns and rows
df2=df[df['col1'].ne(df['MovieId'])].drop(columns=['col1','col2'])
df2
# sample created from the data you shared above as image
MovieId CustomerID Rating Date
1 1 1488844 3 2005-09-06
2 1 822109 5 2005-05-13
3 1 885013 4 2005-10-19
4 1 30878 4 2005-12-26
5 1 823519 3 2004-05-03
6 1 893988 3 2005-11-17
7 1 124105 4 2004-08-05
8 1 1248629 3 2004-04-22
9 1 1842128 4 2004-05-09
10 1 2238063 3 2005-05-11
11 1 1503895 4 2005-05-19
13 2 1288844 3 2005-09-06
14 2 832109 5 2005-05-13
您可以很容易地解析该结构(无需正则表达式,使用几行可读性很强的普通Python(,并在读取数据文件时构建字典。然后,您可以一次性将字典转换为DataFrame。
import pandas as pd
df = {'MovieID':[], 'CustomerID':[], 'Rating':[], 'Date':[]}
with open('data.txt', 'r') as f:
for line in f:
line = line.strip()
if line: #skip empty lines
if line.endswith(':'): #MovieID
movie_id = line[:-1]
else:
customer_id, rating, date = line.split(',')
df['MovieID'].append(movie_id)
df['CustomerID'].append(customer_id)
df['Rating'].append(rating)
df['Date'].append(date)
df = pd.DataFrame(df)
print(df)
MovieID CustomerID Rating Date
0 1 1488844 3 2005-09-06
1 1 822109 5 2005-05-13
2 1 885013 4 2005-10-19
3 1 30878 4 2005-12-26
4 2 823519 3 2004-05-03
5 2 893988 3 2005-11-17
6 2 124105 4 2004-08-05
7 2 1248629 3 2004-04-22
8 2 1842128 4 2004-05-09
9 3 2238063 3 2005-05-11
10 3 1503895 4 2005-05-19
11 3 1288844 3 2005-09-06
12 3 832109 5 2005-05-13
没有什么比这更容易的了。
正则表达式中的错误
因为在用于标识result
的正则表达式中有/gm
,所以得到了NameError
。
我想/gm
是被错误地复制到这里的。在其他语言中,这可能是GLOBAL和MULTILINE匹配修饰符,顺便说一句,在这种情况下不需要它们。但在pythonre
模块中,它们只有三个字符。由于没有包含/gm
的行,所以result
始终是None
,因此elif result: ...
块从未执行,变量customerid
、rating
、date
也未初始化。
使用变量时出错
如果从第一个匹配中删除/gm
,则会出现另一个问题:变量customerid, rating, date, movie_id
只是字符串,因此生成的数据帧将仅反映源文件的最后一条记录。
为了避免这种情况,我们必须使用类似列表的结构来处理它们。例如,在下面的代码中,它们是data
字典中的键,每个键都指向一个单独的列表:
file_name = ...
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}
with open(file_name, encoding='latin-1') as f:
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
data['movie_id'].append(movie_id)
data['customerid'].append(result.group(1))
data['rating'].append(result.group(2))
data['date'].append(result.group(3))
else:
continue
df = pd.DataFrame(data)
带有测试数据的代码
import re
import pandas as pd
data = '''
1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
2:
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248629,3,2004-04-22
1842128,4,2004-05-09
3:
2238063,3,2005-05-11
1503895,4,2005-05-19
1288844,3,2005-09-06
832109,5,2005-05-13
'''
file_name = "data.txt"
with open(file_name, 'tw', encoding='latin-1') as f:
f.write(data)
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}
with open(file_name, encoding='latin-1') as f:
for line in f:
result = re.search(r"^(d+),(d+),(d{4}-d{2}-d{2})", line)
if re.search(r"(^d+):", line) is not None:
movie_id = re.search(r"(^d+):", line).group(1)
elif result:
data['movie_id'].append(movie_id)
data['customerid'].append(result.group(1))
data['rating'].append(result.group(2))
data['date'].append(result.group(3))
else:
continue
df = pd.DataFrame(data)
df.to_csv(file_name[:-3] + 'csv', index=False)
另一种选择
df = pd.read_csv(file_name, names = ['customerid', 'rating', 'date'])
df.insert(0, 'movie_id', pd.NA)
isnot_movie_id = ~df['customerid'].str.endswith(':')
df['movie_id'] = df['customerid'].mask(isnot_movie_id).ffill().str[:-1]
df = df.dropna().reset_index(drop=True)