I am testing a class of functions that apply specific transformations to the columns of a CSV file retrieved from an S3 bucket. Each test function should retrieve the 'test_data.csv' file from the S3 bucket set up by the levels_etl and levels_etl_with_test_csv_data fixtures, and create a new CSV with the transformations applied.
The problem I am having is that each test function passes when run individually, but when they are run together as part of a class, only the first test succeeds. All the other tests fail because, for some reason, instead of creating a new CSV with the transformations applied, the CSV output is appended to the CSV created in the previous test, which causes the assertions to fail; each successive test appends further to the CSV.
Setup Code:
@pytest.fixture
def levels_etl():
    """Yield a Levels_ETL instance backed by a moto-mocked S3 bucket.

    Setup starts the S3 mock, exports dummy credentials through the
    environment variables whose names are handed to S3BucketConnector,
    creates the 'test-bucket' bucket and builds the Levels_ETL under test.

    Teardown always stops the mock and puts the two credential variables
    back to the values they had before the fixture ran, even when setup
    itself raises part-way through.
    """
    # Defining Class Arguments
    s3_access_key = 'AWS_ACCESS_KEY_ID'
    s3_secret_key = 'AWS_SECRET_ACCESS_KEY'
    s3_endpoint_url = 'https://s3.us-east-2.amazonaws.com'
    s3_bucket_name = 'test-bucket'

    # Remember the caller's credentials so the dummy keys do not leak into
    # code that runs after this fixture.
    saved_env = {
        name: os.environ.get(name)
        for name in (s3_access_key, s3_secret_key)
    }

    # Mocking S3 connection start
    mock_bucket = mock_s3()
    mock_bucket.start()
    try:
        # Creating s3 access keys as environment variables
        os.environ[s3_access_key] = 'KEY1'
        os.environ[s3_secret_key] = 'KEY2'
        s3 = boto3.resource(service_name='s3', endpoint_url=s3_endpoint_url)
        s3.create_bucket(
            Bucket=s3_bucket_name,
            CreateBucketConfiguration={'LocationConstraint': 'us-east-2'},
        )
        # Creating Test instance
        s3_bucket_conn = S3BucketConnector(
            s3_access_key, s3_secret_key, s3_endpoint_url, s3_bucket_name
        )
        yield Levels_ETL(s3_bucket_conn)
    finally:
        # Teardown: runs whether setup failed or the test has finished.
        mock_bucket.stop()
        for name, previous in saved_env.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
@pytest.fixture
def levels_etl_with_test_csv_data(tmpdir_factory, levels_etl):
    """Yield ``levels_etl`` after uploading a sample 'test_data.csv' to its bucket.

    The sample file (one header row plus 19 data rows) is written to a
    fresh temporary directory and uploaded under the key 'test_data.csv'.
    On teardown only that key is deleted from the bucket; any other
    objects written during the test are left in place.
    """
    header = ['date', 'company', 'location', 'title', 'level', 'specialisation', 'gender',
              'years_of_experience', 'years_at_company', 'base_salary', 'stock', 'bonus']
    records = [
        ['1/1/2017 11:33:27', 'Google', 'Sunnyvale, CA', 'Software Engineer', 'L3', 'android',
         'male', '1', '0', '120000', '40000', '15000'],
        ['4/20/2017 11:33:27', 'Apple', 'Austin, TX', 'Software Engineer', 'ICT2', 'iOS Development',
         'female', '1', '0', '90', '30', '20'],
        ['4/20/2017 11:33:27', 'Microsoft', 'Bellevue, WA', 'Product Manager', '59', 'UX/UI',
         'Male', '0', '0', '0', '0', '0'],
        ['7/15/2017 11:33:27', 'Hubspot', 'Cambridge, MA, United States', 'Software Engineer', 'Junior',
         'Site Reliability (SRE)', '', '', '', '135', '5', '0'],
        ['10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA', 'Software Engineer', 'E5', 'production',
         'male', '11', '2', '215', '100', '40'],
        ['10/11/2017 11:33:27', 'Facebook', 'Menlo Park, CA', 'Software Engineer', 'E5', 'production',
         'male', '11', '2', '215', '100', '40'],
        ['12/11/2017 11:33:27', 'spotify', 'New York, NY', 'Software Engineer', 'Engineer 1',
         'fullstack developer', 'male', '4', '0', '180', '37.5', '0'],
        ['1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer', 'grade 9',
         'augmented reality', 'male', '20', '5', '204', '50', '20'],
        ['1/30/2018 11:33:27', 'Intel', 'Santa Clara, CA', 'Software Engineer', 'grade 9',
         'virtual reality', 'male', '20', '5', '204', '50', '20'],
        ['3/30/2018 11:33:27', 'Netflix', 'Denver, CO', 'Software Engineer', 'E5',
         'Web Development (front-end)', 'male', '20', '2', '591', '0', '0'],
        ['4/7/2018 11:33:27', 'Sony Interactive Entertainment', 'San Francisco, CA', 'Software Engineer', 'L4',
         'backend tools', 'male', '6', '6', '103', '5', '32'],
        ['5/9/2018 11:33:27', 'Lyft', 'New York, NY', 'Data Scientist', 't6', 'algorithms',
         'male', '6', '3', '200', '200', '0'],
        ['11/11/2018 11:33:27', 'Hudson River Trading', 'New York, NY', 'Software Engineer', 'L4',
         'algorithm', 'male', '6', '4', '431', '0', '1700'],
        ['4/7/2019 11:33:27', 'Facebook', 'Chicago, IL', 'Product Designer', 'IC4',
         'user experience', 'female', '7', '0', '143', '40', '22.7'],
        ['4/7/2019 11:33:27', 'Facebook', 'New York, NY', 'Product Designer', 'IC4',
         'ux', 'female', '7', '2', '173', '40', '0'],
        ['4/7/2019 11:33:27', 'Mango Voice', 'Salt Lake City, UT', 'Product Designer', 'l3',
         'ui', 'female', '5', '3', '74.5', '0', '0'],
        ['9/13/2020 11:33:27', 'No Salary Startup', 'Chicago, IL', 'Product Designer', '',
         'user interface', 'female', '0', '0', '0', '100', '0'],
        ['4/7/2021 11:33:27', '', 'Chicago, IL', '', 'IC4', 'user experience',
         'female', '7', '0', '143', '40', '22.7'],
        ['4/7/2021 11:33:27', 'twitter', 'Washington, DC', 'software engineer', 'swe II',
         'data', 'male', '2', '2', '150', '60', '0'],
    ]

    # Write the sample data to a throwaway local file first; upload_file
    # takes a filename rather than in-memory content.
    csv_path = str(tmpdir_factory.mktemp('data').join('test_data.csv'))
    with open(csv_path, 'w', encoding='UTF-8', newline='') as handle:
        sink = csv.writer(handle)
        sink.writerow(header)
        sink.writerows(records)

    levels_etl.s3_bucket._bucket.upload_file(Filename=csv_path, Key='test_data.csv')

    yield levels_etl

    # Teardown: remove only the uploaded source object.
    source_object = {'Key': 'test_data.csv'}
    levels_etl.s3_bucket._bucket.delete_objects(Delete={'Objects': [source_object]})
Test Class Functions (2 of many)
def test_transform_job_data(self, levels_etl_with_test_csv_data):
    """Check the 'job_data.csv' object produced by ``transform_job_data``.

    Runs the transformation on 'test_data.csv', reads 'job_data.csv' back
    from the bucket and asserts that:

    * the five numeric columns are parsed as floats,
    * no fully duplicated rows remain,
    * no row has both base_salary and stock equal to zero,
    * the Google and Apple compensation values match the fixture data
      (Apple's values are entered in thousands in the fixture).
    """
    key_exp = 'test_data.csv'
    levels_etl_with_test_csv_data.transform_job_data(key=key_exp)
    jobdata_csv = (
        levels_etl_with_test_csv_data.s3_bucket._bucket
        .Object(key='job_data.csv').get().get('Body').read().decode('UTF-8')
    )
    print('jobdata_csv', jobdata_csv)
    job_data_df = pd.read_csv(StringIO(jobdata_csv))

    assert list(job_data_df.select_dtypes(include=['float']).columns) == [
        'years_of_experience', 'years_at_company', 'base_salary', 'stock', 'bonus']
    assert not job_data_df.duplicated().any()
    assert not ((job_data_df['base_salary'] == 0) & (job_data_df['stock'] == 0)).any()
    # NOTE(review): pd.read_csv parses empty fields as NaN by default, so
    # comparing against '' is unlikely to match any row and this check may
    # be vacuous -- confirm whether .isna() was intended here.
    assert not ((job_data_df['company'] == '') & (job_data_df['title'] == '')).any()

    google = job_data_df[job_data_df['company'] == 'Google']
    assert google['base_salary'].values[0] == 120000.00
    assert google['stock'].values[0] == 40000.00
    assert google['bonus'].values[0] == 15000.00

    apple = job_data_df[job_data_df['company'] == 'Apple']
    assert apple['base_salary'].values[0] == 90000.00
    assert apple['stock'].values[0] == 30000.00
    # The fixture row for Apple has bonus '20', i.e. 20000 after scaling
    # (the previous expectation of 10000 contradicted the fixture data).
    assert apple['bonus'].values[0] == 20000.00
def test_transform_dates(self, levels_etl_with_test_csv_data):
    """Check the 'date.csv' object produced by ``transform_dates``.

    Runs the transformation on 'test_data.csv', reads 'date.csv' back from
    the bucket and asserts that it has exactly the columns date, year,
    month and quarter, with the expected values for each of the 19
    fixture rows.
    """
    key_exp = 'test_data.csv'
    levels_etl_with_test_csv_data.transform_dates(key=key_exp)
    date_csv = (
        levels_etl_with_test_csv_data.s3_bucket._bucket
        .Object(key='date.csv').get().get('Body').read().decode('UTF-8')
    )
    print('date_csv', date_csv)
    date_df = pd.read_csv(StringIO(date_csv))

    assert list(date_df.columns) == ['date', 'year', 'month', 'quarter']
    assert date_df['date'].tolist() == [
        '2017-01-01', '2017-04-20', '2017-04-20', '2017-07-15',
        '2017-10-11', '2017-10-11', '2017-12-11', '2018-01-30', '2018-01-30',
        '2018-03-30', '2018-04-07', '2018-05-09', '2018-11-11', '2019-04-07',
        '2019-04-07', '2019-04-07', '2020-09-13', '2021-04-07', '2021-04-07']
    assert date_df['year'].tolist() == [
        2017, 2017, 2017, 2017, 2017, 2017, 2017, 2018, 2018, 2018, 2018, 2018, 2018,
        2019, 2019, 2019, 2020, 2021, 2021]
    # This comparison previously lacked `assert`, so the month column was
    # computed but never actually checked.
    assert date_df['month'].tolist() == [
        1, 4, 4, 7, 10, 10, 12, 1, 1, 3, 4, 5, 11, 4, 4, 4, 9, 4, 4]
    assert date_df['quarter'].tolist() == [
        1, 2, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 4, 2, 2, 2, 3, 2, 2]
The transform_job_data and transform_dates functions both retrieve the 'test_data.csv' file from the S3 bucket, apply pandas DataFrame transformations, convert the result back to CSV, and upload the new CSV to S3.
With the first test I get the expected CSV output:
jobdata_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0
But for the second one, it appends to the CSV from the prior test instead of creating a CSV with only the date, year, month and quarter columns:
date_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0 date,year,month,quarter 2017-01-01,2017,1,1 2017-04-20,2017,4,2 2017-04-20,2017,4,2 2017-07-15,2017,7,3 2017-10-11,2017,10,4 2017-10-11,2017,10,4 2017-12-11,2017,12,4 2018-01-30,2018,1,1 2018-01-30,2018,1,1 2018-03-30,2018,3,1 2018-04-07,2018,4,2 2018-05-09,2018,5,2 2018-11-11,2018,11,4 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2020-09-13,2020,9,3 2021-04-07,2021,4,2 2021-04-07,2021,4,2
I have tried modifying the scopes of the pytest fixtures between class, session and function but I am not getting the desired result. I added teardown code that deletes the 'test_data.csv' object after each test in the levels_etl_with_test_csv_data fixture but that has had no impact either.
Where is my issue coming from?