如何在 Python 中不使用 Pandas 编写一个读取并合并文件的函数



我有 3 个 .dat 文件。我想写一个函数,以这 3 个文件作为输入,在不使用 pandas 的情况下把合并后的数据写入一个文本文件(output.dat)。合并后的数据应按以下顺序包含这些列:

user_id 
movie_id 
rating 
timestamp 
gender 
age 
occupation 
zip 
title 
genres 
Year

不使用 Pandas 的解法:

from itertools import groupby, product
def read_data(filename, columns, encoding=None):
    """Load a ``::``-delimited text file into a list of dict records.

    Each non-blank line is split on ``'::'`` and the pieces are paired with
    ``columns`` positionally. Pairing uses ``zip``, so surplus fields on a
    line (or surplus column names) are silently dropped. Values are kept as
    strings; no type conversion is attempted.

    Args:
        filename: Path of the file to read.
        columns: Column names, in the order the fields appear on each line.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one ``{column: value}`` dict per non-blank line.
    """
    with open(filename, mode='r', encoding=encoding) as f:
        lines = f.read().splitlines()
    # Blank lines (e.g. a stray empty line in the middle or at the end of the
    # file) would otherwise turn into bogus one-field records.
    return [
        dict(zip(columns, line.split('::')))
        for line in lines
        if line.strip()
    ]
def merge(a, b, on):
    """Inner-join two lists of dict records on a shared key.

    Records are bucketed by ``record.get(on)``; for every key value present
    on both sides, each left record is combined with each right record
    (values from ``b`` win when both records carry the same field).

    Records that lack the ``on`` field are bucketed under ``None``, so they
    only pair with records on the other side that also lack it.

    Args:
        a: Left-hand records (iterable of dicts).
        b: Right-hand records (iterable of dicts).
        on: Name of the field to join on; its values must be hashable.

    Returns:
        A list of merged dicts, ordered by first appearance of the key in
        ``a``, then by the original order of the records within each side.
    """
    def bucket(records):
        # Plain dict bucketing: no sorting needed, so key values never have
        # to be mutually comparable (mixing None and str is fine).
        groups = {}
        for record in records:
            groups.setdefault(record.get(on), []).append(record)
        return groups

    left = bucket(a)
    right = bucket(b)
    return [
        {**left_row, **right_row}
        for key, left_rows in left.items()
        if key in right
        for left_row, right_row in product(left_rows, right[key])
    ]
def write_merge(fp, merged, sortby=None, fields=None, sep=None):
    """Write records to ``fp`` as ``name=value`` lines, one field per line.

    Args:
        fp: Writable text stream (an open file, ``sys.stdout``, ...).
        merged: Sequence of dict records to write.
        sortby: Optional sequence of field names; records are sorted by the
            tuple of those values. Every record must contain each of these
            fields (a missing one raises ``KeyError``).
        fields: Field names to emit, in order. Defaults to the keys of the
            first record (after sorting). A field missing from a record is
            written as ``name=None``.
        sep: Optional separator line printed between consecutive records.
            ``None`` prints nothing; an empty string prints a blank line.
    """
    if sortby is not None:
        merged = sorted(merged, key=lambda e: tuple(e[k] for k in sortby))
    if merged and fields is None:
        fields = list(merged[0].keys())
    for i, row in enumerate(merged):
        # The separator goes between records, never before the first one.
        if i > 0 and sep is not None:
            print(sep, file=fp)
        for k in fields:
            print(f'{k}={row.get(k)}', file=fp)

应用:

# Column names of each input file, in the order the fields appear on a line.
unames = 'user_id gender age occupation zip'.split()
rnames = 'user_id movie_id rating timestamp'.split()
mnames = 'movie_id title genres'.split()

users = read_data('users.dat', unames)
ratings = read_data('ratings.dat', rnames)
movies = read_data('movies.dat', mnames)

# Join users with their ratings first, then attach the movie details.
merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')

# Output column order; the same list doubles as the sort key. All values are
# strings, so the sort is lexicographic ('10' sorts before '2').
fields = 'user_id movie_id rating timestamp gender age occupation zip title genres'.split()
sortby = fields
with open('output.dat', mode='w') as f:
    # sep='' puts one blank line between consecutive records.
    write_merge(f, merged, sortby=sortby, fields=fields, sep='')

简单测试(稍微修改一下数据,使内连接(inner join)的结果不为空):

准备:

# Sample movies: movie_id::title::genres
mov = """1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
"""
# Sample ratings: user_id::movie_id::rating::timestamp
rat = """1::1::5::978300760
1::2::3::978302109
1::914::3::978301968
2::1::4::978300275
2::3::5::978824291
"""
# Sample users: user_id::gender::age::occupation::zip
usr = """1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
"""
# Write each sample to the file name the loading code reads from.
for path, text in (('users.dat', usr), ('ratings.dat', rat), ('movies.dat', mov)):
    with open(path, mode='w') as f:
        f.write(text)

测试:

import sys

import pandas as pd  # needed only for the optional visual check below

# Column names of each input file, in the order the fields appear on a line.
unames = 'user_id gender age occupation zip'.split()
rnames = 'user_id movie_id rating timestamp'.split()
mnames = 'movie_id title genres'.split()
# Output column order; the same list is used as the sort key.
fields = 'user_id movie_id rating timestamp gender age occupation zip title genres'.split()
sortby = fields

users = read_data('users.dat', unames)
ratings = read_data('ratings.dat', rnames)
movies = read_data('movies.dat', mnames)
merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')
# optional: quick vis using Pandas, just to test:
# NOTE(review): `display` is not defined anywhere in this file; presumably the
# Jupyter/IPython builtin. Confirm before running this as a plain script.
display(pd.DataFrame(merged).sort_values(by=sortby))
# save to stdout for inspection
write_merge(sys.stdout, merged, sortby=sortby, fields=fields, sep='------')

输出:

user_id=1
movie_id=1
rating=5
timestamp=978300760
gender=F
age=1
occupation=10
zip=48067
title=Toy Story (1995)
genres=Animation|Children's|Comedy
------
user_id=1
movie_id=2
rating=3
timestamp=978302109
gender=F
age=1
occupation=10
zip=48067
title=Jumanji (1995)
genres=Adventure|Children's|Fantasy
------
(...)

假设您的文件保存在本地目录 local_drive = /home/your/path/to/drive 中。另外,三类 .dat 文件的文件名中应各自带有唯一的标识符,例如 unames.dat、rnames.dat 等。

import os
import pandas as pd
# Directory that holds the .dat files (placeholder; point it at your data).
local_drive = '/home/your/path/to/drive'

unames = ["user_id", "gender", "age", "occupation", "zip"]
rnames = ["user_id", "movie_id", "rating", "timestamp"]
df_f = []
# Loop block
files_uname = [
    f for f in os.listdir(local_drive)
    if 'uname' in f and f.endswith('.dat')
]
for f in files_uname:
    # os.listdir() yields bare file names, so join them with the directory.
    # A multi-character separator is only supported by the Python parser
    # engine, so request it explicitly instead of relying on the fallback.
    df = pd.read_csv(
        os.path.join(local_drive, f),
        sep='::',
        names=unames,
        engine='python',
    )
    df_f.append(df)

对所有三种类型的文件分别运行上面的 Loop block,最后执行:

# Stack every loaded frame row-wise into a single DataFrame.
df_final = pd.concat(objs=df_f, axis='index')

最新更新