Python - reset BytesIO so the next file is not appended



I'm running into a problem with BytesIO in Python. I want to take PDF files retrieved from an S3 bucket and convert each one to a dataframe with a custom function convert_bytes_to_df. The first PDF converts to a CSV file correctly, but the subsequent CSVs look as if they have been appended to one another. I tried resetting the IO with seek and truncate, but it doesn't seem to work. What am I doing wrong?


import json
import logging

import boto3
from io import BytesIO, StringIO

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)
logging.getLogger(__name__).setLevel(logging.DEBUG)

session = boto3.Session()
s3 = session.resource('s3')
src_bucket = s3.Bucket('input-bucket')
dest_bucket = s3.Bucket('output-bucket')
csv_buffer = StringIO()

def lambda_handler(event, context):
    msg = event['Records'][0]['Sns']['Message']
    pdf_files = json.loads(msg)['pdf_files']
    location = json.loads(msg)['location']
    total_files = len(pdf_files)
    LOGGER.info('Processing: {}'.format(json.dumps(pdf_files)))
    for pdf_file in pdf_files:
        file_name = pdf_file['key']
        obj = s3.Object(src_bucket.name, file_name)
        fs = BytesIO(obj.get()['Body'].read())
        df = convert_bytes_to_df(fs)
        df.to_csv(csv_buffer, index=False)
        s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
        fs.seek(0)
        fs.truncate(0)
        LOGGER.info('Processed: {} in {}'.format(file_name, location))
    LOGGER.info('Converted {} files: {}'.format(total_files, json.dumps(pdf_files)))

    src_bucket.objects.all().delete()
    LOGGER.info('Deleted all files from {}'.format(src_bucket.name))

Move csv_buffer = StringIO() inside the for loop. csv_buffer is only initialized once; you need to create it inside the loop so that a fresh buffer is used for each file, instead of every CSV being written into the same buffer.

Example:

for pdf_file in pdf_files:
    csv_buffer = StringIO()
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
    fs.seek(0)
    fs.truncate(0)
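
Note that your original code resets the input stream fs rather than the output buffer csv_buffer, which is why the CSVs keep accumulating. If you prefer to reuse a single buffer instead of creating a new StringIO per file, a rough sketch (untested against your setup) would be to clear csv_buffer between iterations with truncate and seek:

for pdf_file in pdf_files:
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
    # Reset the *output* buffer, not the input stream:
    csv_buffer.truncate(0)  # discard the CSV text that was just uploaded
    csv_buffer.seek(0)      # move the write position back to the start

Either approach works; creating a new StringIO per iteration is simpler and avoids accidentally reusing stale state in a Lambda container.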
