如何将pickle文件的文件夹转换为单个csv文件



我有一个目录,里面有大约1700个pickle文件,每个文件都是用户的推特帖子,我想把它转换成CSV文件的文件夹,每个CSV文件名都是pickle的文件名,每行都包含一条用户的推文。。。在那之后,我只想要前20个CSV,比其他人有更多的样本。。。我该怎么做?

# khabarlist = open_file_linebyline(pkl_path)
def open_dir_in_dict(input_path):
files = os.scandir(input_path)
my_dict = {}
for file in files:
# if len(file.name.split()) > 1:
#     continue
# if file.split('.')[-1] != "pkl":
with open(file, 'r', encoding='utf8') as f:
items = [i.strip() for i in f.read().split(",")]
my_dict[file.replace(".pkl", "")] = items
df = pd.DataFrame(my_dict)
df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")

我为它编写了示例代码,但它不起作用。。。

def open_dir_in_dict(input_path):
files = os.scandir(input_path)
my_dict = {}
for file in files:
if len(file.name.split()) > 1:
continue
if file.split('.')[-1] != "pkl":
with open(file, 'r', encoding='utf-8', errors='replace') as f:
print(f.readlines())
items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
my_dict[file.replace(".pkl", "")] = items
df = pd.DataFrame(my_dict)
df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")

还有更好的答案。。。

import os
import pandas as pd
import regex as re
data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
my_tweets = []
df = pd.read_pickle(data_path + path)
for tweet in df.tweet:
url = re.findall(r"httpS+", tweet)
if url == []:
my_tweets.append(tweet)
new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a", )

最新更新