如何阻止Pytest附加到初始测试后创建的CSV



我正在测试一类函数,这些函数将特定的转换应用于从S3存储桶中检索到的csv文件的列。测试函数应该从使用levels_etl和levels_etl_with_test_csv_data fixture创建的S3 bucket中检索"test_data.csv"文件。使用应用的转换创建一个新的csv。

我遇到的问题是,当单独运行时,每个测试函数都通过了,但当作为类的一部分运行时,第一个测试成功运行,但所有其他测试都失败了,因为出于某种原因,CSV输出没有创建应用了转换的新CSV,而是附加到上一个测试中创建的CSV,从而导致断言失败,每个连续的测试都附加到CSV。

设置代码:

@pytest.fixture
def levels_etl():
    """Yield a Levels_ETL instance wired to a moto-mocked S3 bucket.

    Starts the S3 mock, creates the test bucket, builds the connector and the
    ETL object under test, and stops the mock again on teardown.
    """
    # Start the moto S3 mock so no real AWS calls are made.
    s3_mock = mock_s3()
    s3_mock.start()

    # Connection parameters for the mocked bucket.
    access_key_var = 'AWS_ACCESS_KEY_ID'
    secret_key_var = 'AWS_SECRET_ACCESS_KEY'
    endpoint_url = 'https://s3.us-east-2.amazonaws.com'
    bucket_name = 'test-bucket'

    # Expose dummy credentials through environment variables.
    os.environ[access_key_var] = 'KEY1'
    os.environ[secret_key_var] = 'KEY2'

    s3_resource = boto3.resource(service_name='s3', endpoint_url=endpoint_url)
    s3_resource.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': 'us-east-2'},
    )

    # Object under test, backed by the mocked bucket.
    bucket_connector = S3BucketConnector(access_key_var, secret_key_var,
                                         endpoint_url, bucket_name)
    etl_instance = Levels_ETL(bucket_connector)
    yield etl_instance

    # Teardown: shut the S3 mock down.
    s3_mock.stop()
@pytest.fixture
def levels_etl_with_test_csv_data(tmpdir_factory, levels_etl):
    """Upload a fixed 'test_data.csv' into the mocked bucket; delete it on teardown.

    Builds a CSV of representative salary rows in a pytest temp directory,
    uploads it as 'test_data.csv', yields the Levels_ETL instance, then removes
    the object from the bucket after the test.
    """
    header = ['date', 'company', 'location', 'title', 'level', 'specialisation',
              'gender', 'years_of_experience', 'years_at_company', 'base_salary',
              'stock', 'bonus']
    # Test rows deliberately include duplicates, blanks, and zero salaries so the
    # transforms have something to clean up.
    rows = [
        ['1/1/2017 11:33:27', 'Google', 'Sunnyvale, CA', 'Software Engineer', 'L3',
         'android', 'male', '1', '0', '120000', '40000', '15000'],
        ['4/20/2017 11:33:27', 'Apple', 'Austin, TX', 'Software Engineer', 'ICT2',
         'iOS Development', 'female', '1', '0', '90', '30', '20'],
        ['4/20/2017 11:33:27', 'Microsoft', 'Bellevue, WA', 'Product Manager', '59',
         'UX/UI', 'Male', '0', '0', '0', '0', '0'],
        ['7/15/2017 11:33:27', 'Hubspot', 'Cambridge, MA, United States',
         'Software Engineer', 'Junior', 'Site Reliability (SRE)', '', '', '',
         '135', '5', '0'],
        ['10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA', 'Software Engineer',
         'E5', 'production', 'male', '11', '2', '215', '100', '40'],
        ['10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA', 'Software Engineer',
         'E5', 'production', 'male', '11', '2', '215', '100', '40'],
        ['12/11/2017 11:33:27', 'spotify', 'New York, NY', 'Software Engineer',
         'Engineer 1', 'fullstack developer', 'male', '4', '0', '180', '37.5', '0'],
        ['1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer',
         'grade 9', 'augmented reality', 'male', '20', '5', '204', '50', '20'],
        ['1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer',
         'grade 9', 'virtual reality', 'male', '20', '5', '204', '50', '20'],
        ['3/30/2018 11:33:27', 'Netflix', 'Denver, CO', 'Software Engineer', 'E5',
         'Web Development (front-end)', 'male', '20', '2', '591', '0', '0'],
        ['4/7/2018 11:33:27', 'Sony Interactive Entertainment', 'San Francisco, CA',
         'Software Engineer', 'L4', 'backend tools', 'male', '6', '6', '103', '5', '32'],
        ['5/9/2018 11:33:27', 'Lyft', 'New York, NY', 'Data Scientist', 't6',
         'algorithms', 'male', '6', '3', '200', '200', '0'],
        ['11/11/2018 11:33:27', 'Hudson River Trading', 'New York, NY',
         'Software Engineer', 'L4', 'algorithm', 'male', '6', '4', '431', '0', '1700'],
        ['4/7/2019 11:33:27', 'Facebook', 'Chicago, IL', 'Product Designer', 'IC4',
         'user experience', 'female', '7', '0', '143', '40', '22.7'],
        ['4/7/2019 11:33:27', 'Facebook', 'New York, NY', 'Product Designer', 'IC4',
         'ux', 'female', '7', '2', '173', '40', '0'],
        ['4/7/2019 11:33:27', 'Mango Voice', 'Salt Lake City, UT', 'Product Designer',
         'l3', 'ui', 'female', '5', '3', '74.5', '0', '0'],
        ['9/13/2020 11:33:27', 'No Salary Startup', 'Chicago, IL', 'Product Designer',
         '', 'user interface', 'female', '0', '0', '0', '100', '0'],
        ['4/7/2021 11:33:27', '', 'Chicago, IL', '', 'IC4', 'user experience',
         'female', '7', '0', '143', '40', '22.7'],
        ['4/7/2021 11:33:27', 'twitter', 'Washington, DC', 'software engineer',
         'swe II', 'data', 'male', '2', '2', '150', '60', '0'],
    ]

    csv_path = str(tmpdir_factory.mktemp('data').join('test_data.csv'))
    with open(csv_path, 'w', encoding='UTF-8', newline='') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)

    levels_etl.s3_bucket._bucket.upload_file(Filename=csv_path, Key='test_data.csv')
    yield levels_etl

    # Teardown: remove the uploaded source object from the bucket.
    levels_etl.s3_bucket._bucket.delete_objects(Delete={
        'Objects': [
            {'Key': 'test_data.csv'}
        ]
    })

测试类函数(2个)

def test_transform_job_data(self, levels_etl_with_test_csv_data):
    """transform_job_data should write a cleaned 'job_data.csv' to the bucket."""
    etl = levels_etl_with_test_csv_data
    source_key = 'test_data.csv'
    etl.transform_job_data(key=source_key)

    # Read the transformed CSV back from the mocked bucket.
    jobdata_csv = (etl.s3_bucket._bucket.Object(key='job_data.csv')
                   .get().get('Body').read().decode('UTF-8'))
    print('jobdata_csv', jobdata_csv)
    job_data_df = pd.read_csv(StringIO(jobdata_csv))

    # Numeric columns were converted to floats.
    expected_float_cols = ['years_of_experience', 'years_at_company',
                           'base_salary', 'stock', 'bonus']
    assert list(job_data_df.select_dtypes(include=['float']).columns) == expected_float_cols

    # Duplicates and degenerate rows were dropped.
    assert not job_data_df.duplicated().any()
    assert not ((job_data_df['base_salary'] == 0) & (job_data_df['stock'] == 0)).any()
    assert not ((job_data_df['company'] == '') & (job_data_df['title'] == '')).any()

    # Compensation figures were normalised to absolute dollar amounts.
    google = job_data_df[job_data_df['company'] == 'Google']
    assert google['base_salary'].values[0] == 120000.00
    assert google['stock'].values[0] == 40000.00
    assert google['bonus'].values[0] == 15000.00
    apple = job_data_df[job_data_df['company'] == 'Apple']
    assert apple['base_salary'].values[0] == 90000.00
    assert apple['stock'].values[0] == 30000.00
    assert apple['bonus'].values[0] == 10000.00

def test_transform_dates(self, levels_etl_with_test_csv_data):
    """transform_dates should write 'date.csv' with date/year/month/quarter columns.

    Reads the transformed object back from the mocked bucket and checks every
    derived column against the values expected from the fixture rows.
    """
    key_exp = 'test_data.csv'
    levels_etl_with_test_csv_data.transform_dates(key=key_exp)

    # Read the transformed CSV back from the mocked bucket.
    date_csv = (levels_etl_with_test_csv_data.s3_bucket._bucket
                .Object(key='date.csv').get().get('Body').read().decode('UTF-8'))
    print('date_csv', date_csv)
    date_df = pd.read_csv(StringIO(date_csv))

    assert list(date_df.columns) == ['date', 'year', 'month', 'quarter']
    assert date_df['date'].tolist() == ['2017-01-01', '2017-04-20', '2017-04-20',
        '2017-07-15', '2017-10-11', '2017-10-11', '2017-12-11', '2018-01-30',
        '2018-01-30', '2018-03-30', '2018-04-07', '2018-05-09', '2018-11-11',
        '2019-04-07', '2019-04-07', '2019-04-07', '2020-09-13', '2021-04-07',
        '2021-04-07']
    assert date_df['year'].tolist() == [2017, 2017, 2017, 2017, 2017, 2017, 2017,
        2018, 2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2020, 2021, 2021]
    # BUG FIX: the original line was a bare expression (missing `assert`), so the
    # month check could never fail.
    assert date_df['month'].tolist() == [1, 4, 4, 7, 10, 10, 12, 1, 1, 3, 4, 5,
        11, 4, 4, 4, 9, 4, 4]
    assert date_df['quarter'].tolist() == [1, 2, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 4,
        2, 2, 2, 3, 2, 2]

transform_job_data和transform_dates函数都从S3存储桶中检索"test_data.csv"文件,应用panda数据帧转换,然后转换回csv并将新的csv上传到S3。

通过第一次测试,我得到了预期的CSV输出:

jobdata_csv日期,公司,地点,职位,级别,专业,性别,经验年限,公司年限,基本工资,股票,奖金2017年1月1日11:33:27,谷歌,";加利福尼亚州桑尼维尔";,软件工程师,L3,安卓系统,男,1.0,0.0120000.040000.015000.02017年4月20日11:33:27,苹果,";德克萨斯州奥斯汀";,软件工程师,ICT2,iOS开发,女性,1.0,0.09000.030000.020000.02017年7月15日11:33:27,Hubspot,";Cambridge,MA,United States";,软件工程师,初级,现场可靠性(SRE),,,,13500005000.0,0.02017年11月10日11:33:27,Facebook,";Menlo Park,CA";,软件工程师,E5,生产,男,11.0,2.0215000.0100000.040000.012/11/2017 11:33:27,spotify,";纽约";,软件工程师,工程师1,全栈开发人员,男,4.0,0.0180000.037500.0,0.02018年1月30日11:33:27,英特尔,";加利福尼亚州Santa Clara";,软件工程师,九年级,增强现实,男,20.0,5.0204000.050000.020.002018年1月30日11:33:27,英特尔,";加利福尼亚州Santa Clara";,软件工程师,9年级,虚拟现实,男,20.0,5.0204000.050000.020.002018年3月30日11:33:27,Netflix,";科罗拉多州丹佛市;,软件工程师,E5,Web开发(前端),男,20.0,2.05910.0,0.0,0.04/7/2018 11:33:27,索尼互动娱乐,";加利福尼亚州旧金山;,软件工程师,L4,后端工具,男,6.0,6.0103000.05000.032000.02018年5月9日11:33:27,Lyft,";纽约";,数据科学家,t6,算法,男性,6.0,3.0200000.0200000.0,0.02018年11月11日11:33:27,Hudson River Trading,";纽约";,软件工程师,L4,算法,男,6.0,4.0431000.0,0.01700000.004/7/2019 11:33:27,脸书,";伊利诺伊州芝加哥";,产品设计师,IC4,用户体验,女性,7.0,0.0143000.040000.02270004/7/2019 11:33:27,脸书,";纽约";,产品设计师,IC4,ux,女性,7.0,2.0173000.040000,0.04/7/2019 11:33:27,芒果之声,"犹他州盐湖城";,产品设计师,l3,ui,女性,5.0,3.074500.0,0.0,0.02020年9月13日11:33:27,无工资启动,";伊利诺伊州芝加哥";,产品设计师,,用户界面,女性,0.0,0.0,0.0100000.0,0.02021年4月7日11:33:27;伊利诺伊州芝加哥";,,IC4,用户体验,女性,7.0,0.0143000.040000.022700.02021年4月7日11:33:27,推特,";华盛顿特区";,软件工程师,swe II,数据,男,2.0,2.01500000.060000.0,0.0

但对于第二个,它附加到之前测试的CSV中,而不是创建带有日期、年份、月份和季度列的CSV:

date_csv日期,公司,地点,职位,级别,专业,性别,经验年限,公司年限,基本工资,股票,奖金2017年1月1日11:33:27,谷歌,";加利福尼亚州桑尼维尔";,软件工程师,L3,安卓系统,男,1.0,0.0120000.040000.015000.02017年4月20日11:33:27,苹果,";德克萨斯州奥斯汀";,软件工程师,ICT2,iOS开发,女性,1.0,0.09000.030000.020000.02017年7月15日11:33:27,Hubspot,";Cambridge,MA,United States";,软件工程师,初级,现场可靠性(SRE),,,,13500005000.0,0.02017年11月10日11:33:27,Facebook,";Menlo Park,CA";,软件工程师,E5,生产,男,11.0,2.0215000.0100000.040000.012/11/2017 11:33:27,spotify,";纽约";,软件工程师,工程师1,全栈开发人员,男,4.0,0.0180000.037500.0,0.02018年1月30日11:33:27,英特尔,";加利福尼亚州Santa Clara";,软件工程师,九年级,增强现实,男,20.0,5.0204000.050000.020.002018年1月30日11:33:27,英特尔,";加利福尼亚州Santa Clara";,软件工程师,9年级,虚拟现实,男,20.0,5.0204000.050000.020.002018年3月30日11:33:27,Netflix,";科罗拉多州丹佛市;,软件工程师,E5,Web开发(前端),男,20.0,2.05910.0,0.0,0.04/7/2018 11:33:27,索尼互动娱乐,";加利福尼亚州旧金山;,软件工程师,L4,后端工具,男,6.0,6.0103000.05000.032000.02018年5月9日11:33:27,Lyft,";纽约";,数据科学家,t6,算法,男性,6.0,3.0200000.0200000.0,0.02018年11月11日11:33:27,Hudson River Trading,";纽约";,软件工程师,L4,算法,男,6.0,4.0431000.0,0.01700000.004/7/2019 11:33:27,脸书,";伊利诺伊州芝加哥";,产品设计师,IC4,用户体验,女性,7.0,0.0143000.040000.02270004/7/2019 11:33:27,脸书,";纽约";,产品设计师,IC4,ux,女性,7.0,2.0173000.040000,0.04/7/2019 11:33:27,芒果之声,"犹他州盐湖城";,产品设计师,l3,ui,女性,5.0,3.074500.0,0.0,0.02020年9月13日11:33:27,无工资启动,";伊利诺伊州芝加哥";,产品设计师,,用户界面,女性,0.0,0.0,0.0100000.0,0.02021年4月7日11:33:27;伊利诺伊州芝加哥";,,IC4,用户体验,女性,7.0,0.0143000.040000.022700.02021年4月7日11:33:27,推特,";华盛顿特区";,软件工程师,swe II,数据,男,2.0,2.01500000.060000.0,0.0日期、年份、月份、季度2017-01-012017,1,12017-04-202017,4,22017-04-202017,4,22017-07-152017,7,32017-10-112017,10,42017-10-112017,10,42017-12-112017,12,42018-01-302018,1,12018-01-302018,1,12018-03-302018,3,12018-04-072018,4,22018-05-092018,5,22018-11-112018,11,42019-04-072019,4,22019-04-072019,4,22019-04-072019,4,22020-09-132020,9,32021-04-072021,4,22021-04-072021年4月2日

我已经尝试在函数、类和会话之间修改 pytest fixture 的作用域,但没有得到所需的结果。我也在 levels_etl_with_test_csv_data fixture 中添加了拆卸代码,在每个测试之后删除 "test_data.csv" 对象,但这同样没有效果。

我的问题从哪里来?

我建议为每个测试的输出使用不同的 csv 文件名(例如以测试名称命名)。另外,这种"追加"行为通常意味着转换函数在写出 CSV 时复用了某个在测试之间未被重置的共享缓冲区(例如定义在类或模块层级的 StringIO 或列表)——建议检查 transform_job_data / transform_dates 内部的写出逻辑是否每次都新建缓冲区。

相关内容

  • 没有找到相关文章

最新更新