diff --git a/pycode/memilio-epidata/memilio/epidata/README.rst b/pycode/memilio-epidata/memilio/epidata/README.rst index 5b695cab65..75d0eab8c1 100644 --- a/pycode/memilio-epidata/memilio/epidata/README.rst +++ b/pycode/memilio-epidata/memilio/epidata/README.rst @@ -89,39 +89,52 @@ optional arguments working for all are: +---------------------------------------------+-----------------------------------------------------------+ | -n, --no-raw | Defines if raw data will be stored for further use. | +---------------------------------------------+-----------------------------------------------------------+ +| --no-progress-indicators | Disables all progress indicators (used for downloads etc.)| ++---------------------------------------------+-----------------------------------------------------------+ +| --interactive | Interactive download (Handle warnings, passwords etc.). | ++---------------------------------------------+-----------------------------------------------------------+ +| -v, --verbose | Increases verbosity level. | ++---------------------------------------------+-----------------------------------------------------------+ +| --skip-checks | Skips sanity checks etc. | ++---------------------------------------------+-----------------------------------------------------------+ optional arguments working for some are: +---------------------------------------------+-----------------------------------------------------------+ | -p, --make-plot | Plots the data. 
| +---------------------------------------------+-----------------------------------------------------------+ -| -ed, --end-date | Changes date for which data collection is stopped [divi] | +| -ed, --end-date | Changes date for which data collection is stopped | +---------------------------------------------+-----------------------------------------------------------+ -| -sd, --start-date | Changes date for which data collection is started [divi] | +| -sd, --start-date | Changes date for which data collection is started | +---------------------------------------------+-----------------------------------------------------------+ | -i, --impute-dates | Returns dataframes with all dates instead of only dates | | | where new cases have been reported. | | | | | | Note that this option will have a negative impact | | | on performance as well as on the storage space needed. | -| | [cases] | +| | | +---------------------------------------------+-----------------------------------------------------------+ | -m N, --moving-average N | The central N days moving average is computed for the | | | data. | | | | | | Note that the --impute_dates option will be implicitly | | | turned on, as computing the moving average requires all | -| | dates to be available. [cases] | +| | dates to be available. | +---------------------------------------------+-----------------------------------------------------------+ | -sb, --split-berlin | Berlin data is split into different counties, | | | instead of having only one county for Berlin. [cases] | +---------------------------------------------+-----------------------------------------------------------+ -| -- rep-date | The reporting date will be prefered over possibly given | +| --rep-date | The reporting date will be prefered over possibly given | | | dates of disease onset. 
[cases] | +---------------------------------------------+-----------------------------------------------------------+ -| -- sanitize-data | Different ways to distribute vaccinations to home | +| --sanitize-data | Different ways to distribute vaccinations to home | | | locations of vaccinated persons[vaccination] | +---------------------------------------------+-----------------------------------------------------------+ +| --username | Username for regionalstatistik.de [population] | ++---------------------------------------------+-----------------------------------------------------------+ +| --password | Password for regionalstatistik.de [population] | ++---------------------------------------------+-----------------------------------------------------------+ + Hint: When using the "--make-plot" option close one figure-window to get the next one. diff --git a/pycode/memilio-epidata/memilio/epidata/cleanData.py b/pycode/memilio-epidata/memilio/epidata/cleanData.py index 965169503b..179a3ec6e6 100644 --- a/pycode/memilio-epidata/memilio/epidata/cleanData.py +++ b/pycode/memilio-epidata/memilio/epidata/cleanData.py @@ -36,6 +36,7 @@ import os from memilio.epidata import defaultDict as dd +from memilio.epidata import getDataIntoPandasDataFrame as gd def clean_data( @@ -87,7 +88,8 @@ def clean_data( for item in files: if item.endswith(".json") or item.endswith(".h5"): - print("Deleting file ", os.path.join(directory, item)) + gd.default_print("Info", "Deleting file " + + os.path.join(directory, item)) os.remove(os.path.join(directory, item)) # delete directories if empty @@ -95,7 +97,7 @@ def clean_data( os.rmdir(directory) except OSError: continue - print("Deleting directory ", directory) + gd.default_print("Info", "Deleting directory " + directory) # delete further jh files files = [] @@ -106,7 +108,8 @@ def clean_data( for item in files: if item.endswith(".json") or item.endswith(".h5"): - print("Deleting file ", os.path.join(out_path, item)) + 
gd.default_print("Info", "Deleting file " + + os.path.join(out_path, item)) os.remove(os.path.join(out_path, item)) else: @@ -128,8 +131,8 @@ def clean_data( for item in files: if item.endswith(ending) and "_jh" in item: - print("Deleting file ", - os.path.join(directory, item)) + gd.default_print("Info", "Deleting file " + + os.path.join(directory, item)) os.remove(os.path.join(directory, item)) # delete directories @@ -138,7 +141,7 @@ def clean_data( except OSError: continue - print("Deleting directory ", directory) + gd.default_print("Info", "Deleting directory " + directory) # delete further jh files files = [] @@ -150,7 +153,8 @@ def clean_data( for item in files: if item.endswith(ending): if "_jh" in item or "JohnHopkins" in item: - print("Deleting file ", os.path.join(out_path, item)) + gd.default_print( + "Info", "Deleting file " + os.path.join(out_path, item)) os.remove(os.path.join(out_path, item)) # other data is stored in the same folder @@ -193,20 +197,21 @@ def clean_data( for file in filenames: if file in item: - print( - "Deleting file ", os.path.join( - directory, item)) + gd.default_print("Info", + "Deleting file " + os.path.join( + directory, item)) os.remove(os.path.join(directory, item)) # delete directory if empty try: os.rmdir(directory) - print("Deleting directory ", directory) + gd.default_print("Info", "Deleting directory " + directory) except OSError: pass if filenames == []: - print("Please specify what should be deleted. See --help for details.") + gd.default_print( + "Info", "Please specify what should be deleted. 
See --help for details.") def cli(): @@ -220,8 +225,8 @@ def cli(): - choose file format: json or hdf5 - define path to files """ - - out_path_default = dd.defaultDict['out_folder'] + conf = gd.Conf(dd.defaultDict['out_folder']) + out_path_default = conf.path_to_use parser = argparse.ArgumentParser() diff --git a/pycode/memilio-epidata/memilio/epidata/customPlot.py b/pycode/memilio-epidata/memilio/epidata/customPlot.py index a9116a1545..5d9bfd5a25 100644 --- a/pycode/memilio-epidata/memilio/epidata/customPlot.py +++ b/pycode/memilio-epidata/memilio/epidata/customPlot.py @@ -89,4 +89,4 @@ def plot_multiple_series( gd.check_dir(path_rel) plt.savefig(path_rel + fig_name + '.png', bbox_inches='tight', dpi=dpi) - print('Plot saved to ' + path_rel + fig_name + '.png') + gd.default_print("Info", 'Plot saved to ' + path_rel + fig_name + '.png') diff --git a/pycode/memilio-epidata/memilio/epidata/download_config.conf b/pycode/memilio-epidata/memilio/epidata/download_config.conf new file mode 100644 index 0000000000..b5e395c13b --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/download_config.conf @@ -0,0 +1,34 @@ +[SETTINGS] +# Set verbosity level +# Off, Critical, Error, Warning, Info, Debug, Trace +# Default: Info +verbosity_level = Info + +# Whether to show Progress Indicator (Spinner, download bars etc.) +# True or False +show_progress = True + +# Whether to run sanity checks etc. +# Will improve performance but may lead to unpredicted behaviour if something in the file format has changed +# True or False +run_checks = True + +# Interactivity of the download functions +# Programm will exit with error if a user choice is needed and interactive is False. +# True or False +interactive = False + +# Defines if plots are generated with matplotlib +# True or False +make_plot = False + +# If out_folder or a different path should be used. 
+# default uses the definition of default_dict +path_to_use = default + +# If raw data from source should be prevented from being written into the download directory +# True or False +no_raw = False + +# matplotlib backend to use +mpl_backend = TkAgg diff --git a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py index ab47494a9b..231a7a9c75 100644 --- a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py +++ b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py @@ -210,16 +210,19 @@ def check_for_all_counties( missing = len(get_county_ids(merge_berlin, merge_eisenach) )-len(unique_county_list) if missing != 0: - print("Downloaded data is not complete. Missing " + - str(missing) + " counties.") if missing < 0: # Returning True if source data file contains more counties than list - print('Source data frame contains more counties than official ' - 'county list. This could be OK, please verify yourself.') + gd.default_print('Warning', 'Source data frame contains ' + str(abs(missing)) + + ' more counties than official county list. ' + 'This could be OK, please verify yourself.') return True - elif missing < 10: - print('Missing counties: ' + str(list(set(get_county_ids(merge_berlin, - merge_eisenach)).difference(unique_county_list).difference(set({11000}))))) + else: + gd.default_print('Error', "Downloaded data is not complete. 
Missing " + + str(missing) + " counties.") + if missing < 10: + gd.default_print('Info', 'Missing counties: ' + + str(list(set(get_county_ids(merge_berlin, + merge_eisenach)).difference(unique_county_list).difference(set({11000}))))) # Returning False if source data file lacks at least one county return False diff --git a/pycode/memilio-epidata/memilio/epidata/getCaseData.py b/pycode/memilio-epidata/memilio/epidata/getCaseData.py index 49857b3e2b..859a5f8bcc 100644 --- a/pycode/memilio-epidata/memilio/epidata/getCaseData.py +++ b/pycode/memilio-epidata/memilio/epidata/getCaseData.py @@ -46,7 +46,7 @@ pd.options.mode.copy_on_write = True -def check_for_completeness(df, merge_berlin=False, merge_eisenach=True): +def check_for_completeness(df, run_checks, merge_berlin=False, merge_eisenach=True): """! Checks if all counties are mentioned in the case data set This check had to be added due to incomplete data downloads @@ -57,27 +57,32 @@ def check_for_completeness(df, merge_berlin=False, merge_eisenach=True): @param df pandas dataframe to check @return Boolean to say if data is complete or not """ - - if not df.empty: - return geoger.check_for_all_counties( - df["IdLandkreis"].unique(), - merge_berlin, merge_eisenach) - # if it is empty - return False + if run_checks: + if not df.empty: + return geoger.check_for_all_counties( + df["IdLandkreis"].unique(), + merge_berlin, merge_eisenach) + # if it is empty + return False + else: + # skip checks, return True + # only done if default value in download_config.conf is changed + gd.default_print( + "Warning", "DataFrame has not been checked for completeness.") + return True def get_case_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - 
make_plot=dd.defaultDict['make_plot'], split_berlin=dd.defaultDict['split_berlin'], rep_date=dd.defaultDict['rep_date'], - files='All' + files='All', + **kwargs ): """! Downloads the case data and provides different kind of structured data @@ -117,17 +122,19 @@ def get_case_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default 2020-01-01. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. @param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. @param split_berlin True or False. Defines if Berlin's disctricts are kept separated or get merged. Default defined in defaultDict. @param rep_date True or False. Defines if reporting date or reference date is taken into dataframe. Default defined in defaultDict. @param files List of strings or 'All' or 'Plot'. Defnies which files should be provided (and plotted). Default 'All'. 
""" + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + run_checks = conf.checks if files == 'All': files = ['infected', 'deaths', 'all_germany', 'infected_state', @@ -150,8 +157,9 @@ def get_case_data(read_data=dd.defaultDict['read_data'], try: url = "https://media.githubusercontent.com/media/robert-koch-institut/" + \ "SARS-CoV-2-Infektionen_in_Deutschland/main/Aktuell_Deutschland_SarsCov2_Infektionen.csv" - df = gd.get_file(path, url, read_data, param_dict={}, interactive=True) - complete = check_for_completeness(df, merge_eisenach=True) + df = gd.get_file(path, url, read_data, param_dict={}, + interactive=conf.interactive) + complete = check_for_completeness(df, run_checks, merge_eisenach=True) except: pass if complete: @@ -162,26 +170,30 @@ def get_case_data(read_data=dd.defaultDict['read_data'], df["IdBundesland"] = df["IdLandkreis"].map(county_to_state_map) else: # try another possibility if df was empty or incomplete - print("Note: Case data is incomplete. Trying another source.") + gd.default_print( + "Info", "Case data is incomplete. Trying another source.") try: url = "https://opendata.arcgis.com/datasets/66876b81065340a4a48710b062319336_0.csv" # if this file is encoded with utf-8 German umlauts are not displayed correctly because they take two bytes # utf_8_sig can identify those bytes as one sign and display it correctly df = gd.get_file(path, url, False, param_dict={ - "encoding": 'utf_8_sig'}, interactive=True) - complete = check_for_completeness(df, merge_eisenach=True) + "encoding": 'utf_8_sig'}, interactive=conf.interactive) + complete = check_for_completeness( + df, run_checks, merge_eisenach=True) except: pass if not complete: - print("Note: Case data is still incomplete. Trying a thrid source.") + gd.default_print( + "Info", "Case data is still incomplete. 
Trying a third source.") try: # If the data on github is not available we download the case data from rki from covid-19 datahub url = "https://npgeo-de.maps.arcgis.com/sharing/rest/content/" +\ "items/f10774f1c63e40168479a1feb6c7ca74/data" df = gd.get_file(path, url, False, param_dict={ - "encoding": 'utf_8_sig'}, interactive=True) + "encoding": 'utf_8_sig'}, interactive=conf.interactive) df.rename(columns={'FID': "OBJECTID"}, inplace=True) - complete = check_for_completeness(df, merge_eisenach=True) + complete = check_for_completeness( + df, run_checks, merge_eisenach=True) except: pass if not complete: @@ -275,12 +287,14 @@ def get_case_data(read_data=dd.defaultDict['read_data'], 'infected_state': [[dateToUse, IdBundesland], {AnzahlFall: "sum"}, [IdBundesland], {dd.EngEng["idState"]: geoger.get_state_ids()}, ['Confirmed']], 'all_state': [[dateToUse, IdBundesland], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"}, - [IdBundesland], {dd.EngEng["idState"]: geoger.get_state_ids()}, + [IdBundesland], {dd.EngEng["idState"] + : geoger.get_state_ids()}, ['Confirmed', 'Deaths', 'Recovered']], 'infected_county': [[dateToUse, IdLandkreis], {AnzahlFall: "sum"}, [IdLandkreis], {dd.EngEng["idCounty"]: df[dd.EngEng["idCounty"]].unique()}, ['Confirmed']], 'all_county': [[dateToUse, IdLandkreis], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"}, - [IdLandkreis], {dd.EngEng["idCounty"]: df[dd.EngEng["idCounty"]].unique()}, + [IdLandkreis], {dd.EngEng["idCounty"] + : df[dd.EngEng["idCounty"]].unique()}, ['Confirmed', 'Deaths', 'Recovered']], 'all_gender': [[dateToUse, Geschlecht], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"}, [Geschlecht], {dd.EngEng["gender"]: list( @@ -299,7 +313,8 @@ def get_case_data(read_data=dd.defaultDict['read_data'], ), dd.EngEng["gender"]: list(df[dd.EngEng["gender"]].unique())}, ['Confirmed', 'Deaths', 'Recovered']], 'all_age': [[dateToUse, Altersgruppe], {AnzahlFall: "sum", AnzahlTodesfall: "sum", 
AnzahlGenesen: "sum"}, - [Altersgruppe], {dd.EngEng["ageRKI"]: df[dd.EngEng["ageRKI"]].unique()}, + [Altersgruppe], {dd.EngEng["ageRKI"] + : df[dd.EngEng["ageRKI"]].unique()}, ['Confirmed', 'Deaths', 'Recovered']], 'all_state_age': [[dateToUse, IdBundesland, Altersgruppe], {AnzahlFall: "sum", AnzahlTodesfall: "sum", AnzahlGenesen: "sum"}, [ @@ -353,7 +368,7 @@ def get_case_data(read_data=dd.defaultDict['read_data'], df_local_cs, start_date, end_date) gd.write_dataframe(df_local_cs, directory, filename, file_format) - if make_plot: + if conf.plot: if file == 'infected': # make plot df_local_cs.plot(title='COVID-19 infections', grid=True, diff --git a/pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py b/pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py index 0c1549f761..30b9e7b253 100644 --- a/pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py +++ b/pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py @@ -43,14 +43,13 @@ def get_case_data_with_estimations( read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot'], split_berlin=dd.defaultDict['split_berlin'], - rep_date=dd.defaultDict['rep_date'] + rep_date=dd.defaultDict['rep_date'], + **kwargs ): """! Function to estimate recovered and deaths from combination of case data from RKI and JH data WARNING: This file is experimental and has not been tested. @@ -62,33 +61,33 @@ def get_case_data_with_estimations( @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. 
@param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default 2020-01-01. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict.data_path @param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. @param split_berlin True or False. Defines if Berlin's disctricts are kept separated or get merged. Default defined in defaultDict. @param rep_date True or False. Defines if reporting date or reference date is taken into dataframe. Default defined in defaultDict. 
""" + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + make_plot = conf.plot data_path = os.path.join(out_folder, 'Germany/') if not read_data: - make_plot_cases = False - make_plot_jh = False # get case data gcd.get_case_data( read_data, file_format, out_folder, no_raw, start_date, end_date, - impute_dates, moving_average, make_plot_cases, split_berlin, + impute_dates, moving_average, make_plot, split_berlin, rep_date) # get data from John Hopkins University gjd.get_jh_data( read_data, file_format, out_folder, no_raw, start_date, end_date, - impute_dates, moving_average, make_plot_jh) + impute_dates, moving_average, make_plot) # Now we now which data is generated and we can use it # read in jh data @@ -132,7 +131,8 @@ def get_case_data_with_estimations( df_cases = pd.read_json(case_data_file) # pandas>1.5 raise FileNotFoundError instead of ValueError except (ValueError, FileNotFoundError): - print("WARNING: The file ", file_to_change + ".json does not exist.") + gd.default_print("Warning", "The file " + + file_to_change + ".json does not exist.") continue # generate new columns to store estimated values @@ -168,7 +168,7 @@ def get_case_data_with_estimations( # check if calculation is meaningful # TODO Add jh data to whole germany plot - if make_plot: + if conf.plot: df_cases.plot( x=date, y=[recovered, recovered_estimated], title='COVID-19 check recovered for ' + file_to_change, diff --git a/pycode/memilio-epidata/memilio/epidata/getCommuterMobility.py b/pycode/memilio-epidata/memilio/epidata/getCommuterMobility.py index a184ce0ada..f87cd78b77 100644 --- a/pycode/memilio-epidata/memilio/epidata/getCommuterMobility.py +++ b/pycode/memilio-epidata/memilio/epidata/getCommuterMobility.py @@ -46,11 +46,11 @@ def verify_sorted(countykey_list): if countykey_list_is_sorted: return True else: - print('Error. 
Input list not sorted.') + gd.default_print('Error', 'Input list not sorted.') return False -def assign_geographical_entities(countykey_list, govkey_list): +def assign_geographical_entities(countykey_list, govkey_list, run_checks): """! Assigns counties to governing regions based on key comparison and creates list of governing regions per state. Only works with sorted key lists. @@ -65,9 +65,12 @@ def assign_geographical_entities(countykey_list, govkey_list): @return gov_county_table Table of county regional keys per governing region. @return state_gov_table Table of governing region regional keys per federal state. """ - - if verify_sorted(countykey_list) == False: - raise gd.DataError("Error. Input list not sorted.") + if run_checks: + if verify_sorted(countykey_list) == False: + raise gd.DataError("Error. Input list not sorted.") + else: + gd.default_print( + 'Warning', 'List of county regional keys has not been verified to be sorted.') # Create list of government regions with lists of counties that belong to them and list of states with government regions that belong to them; only works with sorted lists of keys. gov_county_table = [] @@ -96,7 +99,7 @@ def assign_geographical_entities(countykey_list, govkey_list): gov_county_table.append(col_list) if len(gov_county_table) != len(govkey_list): - print('Error. Number of government regions wrong.') + gd.default_print('Error', 'Number of government regions wrong.') # create a unique hash map from county key to its government region and # a global key to local (in gov region) key ordering @@ -132,10 +135,9 @@ def assign_geographical_entities(countykey_list, govkey_list): def get_commuter_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], - make_plot=dd.defaultDict['make_plot'], setup_dict='', - ref_year=2022): + ref_year=2022, + **kwargs): """! 
Computes DataFrame of commuter migration patterns based on the Federal Agency of Work data. @@ -144,8 +146,6 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'], Only for population data. Commuter data is always downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw [Currently not used] True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. - @param make_plot [Currently not used] True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. @param setup_dict dictionary with necessary values: 'path': String with datapath where migration files can be found 'abs_tol': tolerated undetected people @@ -158,6 +158,10 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'], The printed errors are refering to the absolute and relative errors from included numbers per county in DataFrame and this cumulative values. 
""" + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + if setup_dict == '': abs_tol = 100 # maximum absolute error allowed per county migration rel_tol = 0.01 # maximum relative error allowed per county migration @@ -195,7 +199,7 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'], states[state_id_file] + '_' + str(ref_year) filepath = os.path.join(mobility_dir) + filename + '.json' commuter_migration_files[state_id_file] = gd.get_file( - filepath, url, read_data, param_dict, interactive=True) + filepath, url, read_data, param_dict, interactive=conf.interactive) if not no_raw: gd.write_dataframe( commuter_migration_files[state_id_file], mobility_dir, filename, 'json') @@ -219,7 +223,7 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'], zip(govkey_list, list(range(len(govkey_list))))) (countykey2govkey, countykey2localnumlist, gov_county_table, - state_gov_table) = assign_geographical_entities(countykey_list, govkey_list) + state_gov_table) = assign_geographical_entities(countykey_list, govkey_list, conf.checks) mat_commuter_migration = np.zeros( [len(countykey_list), len(countykey_list)]) @@ -437,19 +441,19 @@ def get_commuter_data(read_data=dd.defaultDict['read_data'], if abs_err < setup_dict['abs_tol'] and abs_err / checksum < setup_dict['rel_tol']: checksum = 0 else: - print('Error in calculations for county ', curr_county_migratedto, - '\nAccumulated values:', checksum, - ', correct sum:', commuter_migration_file.iloc[i, 4]) - print('Absolute error:', abs_err, - ', relative error:', abs_err / checksum) + gd.default_print('Warning', 'Error in calculations for county ' + str(curr_county_migratedto) + + '\nAccumulated values:' + str(checksum) + + ', correct sum:' + str(commuter_migration_file.iloc[i, 4])) + gd.default_print('Debug', 'Absolute error:' + str(abs_err) + + ', relative error:' + str(abs_err / checksum)) if np.isnan(mat_commuter_migration).any(): raise gd.DataError( 'NaN encountered in 
mobility matrix, exiting ' 'getCommuterMobility(). Mobility data will be incomplete.') - print('Maximum absolute error:', max_abs_err) - print('Maximum relative error:', max_rel_err) + gd.default_print('Debug', 'Maximum absolute error:' + str(max_abs_err)) + gd.default_print('Debug', 'Maximum relative error:' + str(max_rel_err)) countykey_list = [int(id) for id in countykey_list] df_commuter_migration = pd.DataFrame( @@ -504,7 +508,7 @@ def commuter_sanity_checks(df): def get_neighbors_mobility( countyid, direction='both', abs_tol=0, rel_tol=0, tol_comb='or', - out_folder=dd.defaultDict['out_folder'], ref_year=2022): + out_folder=dd.defaultDict['out_folder'], ref_year=2022, **kwargs): '''! Returns the neighbors of a particular county ID depening on the commuter mobility and given absolute and relative thresholds on the number of commuters. @@ -540,8 +544,10 @@ def get_neighbors_mobility( commuter = gd.get_file(os.path.join( directory, "migration_bfa_"+str(ref_year)+"_dim400.json"), read_data=True) except FileNotFoundError: - print("Commuter data was not found. Download and process it from the internet.") - commuter = get_commuter_data(out_folder=out_folder, ref_year=ref_year) + gd.default_print( + "Info", "Commuter data was not found. 
Download and process it from the internet.") + commuter = get_commuter_data( + out_folder=out_folder, ref_year=ref_year, **kwargs) countykey_list = commuter.columns commuter.index = countykey_list diff --git a/pycode/memilio-epidata/memilio/epidata/getDIVIData.py b/pycode/memilio-epidata/memilio/epidata/getDIVIData.py index e91054e423..8e7b5317df 100644 --- a/pycode/memilio-epidata/memilio/epidata/getDIVIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getDIVIData.py @@ -45,19 +45,15 @@ from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import modifyDataframeSeries as mdfs -# activate CoW for more predictable behaviour of pandas DataFrames -pd.options.mode.copy_on_write = True - def get_divi_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=date(2020, 4, 24), end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot'] + **kwargs ): """! Downloads or reads the DIVI ICU data and writes them in different files. @@ -77,20 +73,22 @@ def get_divi_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default defined in defaultDict. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. 
@param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot [Currently not used] True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + # First csv data on 24-04-2020 if start_date < date(2020, 4, 24): - print("Warning: First data available on 2020-04-24. " - "You asked for " + start_date.strftime("%Y-%m-%d") + - ". Changed it to 2020-04-24.") + gd.default_print('Warning', "First data available on 2020-04-24. " + "You asked for " + start_date.strftime("%Y-%m-%d") + + ". Changed it to 2020-04-24.") start_date = date(2020, 4, 24) directory = os.path.join(out_folder, 'Germany/') @@ -101,14 +99,19 @@ def get_divi_data(read_data=dd.defaultDict['read_data'], "Intensivkapazitaeten_und_COVID-19-Intensivbettenbelegung_in_Deutschland/"\ "main/Intensivregister_Landkreise_Kapazitaeten.csv" path = os.path.join(directory + filename + ".json") - df_raw = gd.get_file(path, url, read_data, param_dict={}, interactive=True) + df_raw = gd.get_file(path, url, read_data, param_dict={}, + interactive=conf.interactive) if not df_raw.empty: if not no_raw: gd.write_dataframe(df_raw, directory, filename, file_format) else: raise gd.DataError("Something went wrong, dataframe is empty.") - divi_data_sanity_checks(df_raw) + if conf.checks == True: + divi_data_sanity_checks(df_raw) + else: + gd.default_print( + "Warning", "Sanity checks for DIVI data have not been executed.") df = df_raw.rename(dd.GerEng, axis=1, inplace=False) try: diff --git a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py index bbc4236e36..d07e6dc019 100644 --- a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py +++ 
b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py @@ -28,24 +28,112 @@ - check if directory exists and if not creates it - writes pandas dataframe to file of three different formats """ - +import sys import os import argparse +import configparser import datetime import requests import magic import urllib3 +import warnings +import matplotlib from io import BytesIO from zipfile import ZipFile -from warnings import warn +from enum import Enum import pandas as pd from memilio.epidata import defaultDict as dd from memilio.epidata import progress_indicator -# activate CoW for more predictable behaviour of pandas DataFrames -pd.options.mode.copy_on_write = True + +class VerbosityLevel(Enum): + Off = 0 + Critical = 1 + Error = 2 + Warning = 3 + Info = 4 + Debug = 5 + Trace = 6 + + +class Conf: + """Configures all relevant download outputs etc.""" + + v_level = 'Info' + show_progr = False + + def __init__(self, out_folder, **kwargs): + + # change v_level from int to str + if 'verbosity_level' in kwargs.keys(): + if isinstance(kwargs['verbosity_level'], int): + kwargs['verbosity_level'] = VerbosityLevel( + kwargs['verbosity_level']).name + + path = os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'download_config.conf') + + # activate CoW for more predictable behaviour of pandas DataFrames + pd.options.mode.copy_on_write = True + + # read in config file + # if no config file is given, use default values + if os.path.exists(path): + parser = configparser.ConfigParser() + parser.read(path) + # all values will be read in as string + + if parser['SETTINGS']['path_to_use'] == 'default': + self.path_to_use = out_folder + else: + self.path_to_use = parser['SETTINGS']['path_to_use'] + + matplotlib.use(str(parser['SETTINGS']['mpl_backend'])) + + # merge kwargs with config data + # Do not overwrite kwargs, just add from parser + for key in parser['SETTINGS']: + if key not in kwargs: + kwargs.update({key: parser['SETTINGS'][key]}) + + Conf.show_progr = 
True if kwargs['show_progress'] == 'True' else False + Conf.v_level = str(kwargs['verbosity_level']) + self.checks = True if kwargs['run_checks'] == 'True' else False + self.interactive = True if kwargs['interactive'] == 'True' else False + self.plot = True if kwargs['make_plot'] == 'True' else False + self.no_raw = True if kwargs['no_raw'] == 'True' else False + else: + # default values: + Conf.show_progr = kwargs['show_progress'] if 'show_progress' in kwargs.keys( + ) else Conf.show_progr + Conf.v_level = kwargs['verbosity_level'] if 'verbosity_level' in kwargs.keys( + ) else Conf.v_level + self.checks = kwargs['run_checks'] if 'run_checks' in kwargs.keys( + ) else True + self.interactive = kwargs['interactive'] if 'interactive' in kwargs.keys( + ) else False + self.plot = kwargs['make_plot'] if 'make_plot' in kwargs.keys( + ) else dd.defaultDict['make_plot'] + self.no_raw = kwargs['no_raw'] if 'no_raw' in kwargs.keys( + ) else dd.defaultDict['no_raw'] + self.path_to_use = out_folder + + # suppress Future & DeprecationWarnings + if VerbosityLevel[Conf.v_level].value <= 2: + warnings.simplefilter(action='ignore', category=FutureWarning) + warnings.simplefilter(action='ignore', category=DeprecationWarning) + # deactivate (or activate progress indicator) + if Conf.show_progr == True: + progress_indicator.ProgressIndicator.disable_indicators(False) + else: + progress_indicator.ProgressIndicator.disable_indicators(True) + + +def default_print(verbosity_level, message): + if VerbosityLevel[verbosity_level].value <= VerbosityLevel[Conf.v_level].value: + print(verbosity_level + ": " + message) def user_choice(message, default=False): @@ -80,9 +168,9 @@ def download_file( @return File as BytesIO """ if verify not in [True, False, "interactive"]: - warn('Invalid input for argument verify. Expected True, False, or' ' "interactive", got ' + str(verify) + '.' ' Proceeding with "verify=True".', category=RuntimeWarning) + warnings.warn('Invalid input for argument verify. 
Expected True, False, or' + ' "interactive", got ' + str(verify) + '.' + ' Proceeding with "verify=True".', category=RuntimeWarning) verify = True # send GET request as stream so the content is not downloaded at once try: @@ -237,11 +325,9 @@ def cli(what): If the key is not part of the dictionary the program is stopped. The following default arguments are added to the parser: - - read-from-disk + - read-file - file-format, choices = ['json', 'hdf5', 'json_timeasstring'] - out_path - - no_raw - - no_progress_indicators (excluded from dict) The default values are defined in default dict. Depending on what following parser can be added: @@ -253,6 +339,13 @@ def cli(what): - split_berlin - rep_date - sanitize_data + - no_progress_indicator + - interactive + - verbose + - skip_checks + - no_raw + - username + - password @param what Defines what packages calls and thus what kind of command line arguments should be defined. """ @@ -263,16 +356,16 @@ def cli(what): # "plot": ['cases'], # "start_date": ['divi'] } - cli_dict = {"divi": ['Downloads data from DIVI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'], - "cases": ['Download case data from RKI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'split_berlin', 'rep_date'], - "cases_est": ['Download case data from RKI and JHU and estimate recovered and deaths', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'split_berlin', 'rep_date'], + cli_dict = {"divi": ['Downloads data from DIVI', 'start_date', 'end_date', 'impute_dates', 'moving_average'], + "cases": ['Download case data from RKI', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date'], + "cases_est": ['Download case data from RKI and JHU and estimate recovered and deaths', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date'], "population": ['Download population data from official sources', 'username'], - 
"commuter_official": ['Download commuter data from official sources', 'make_plot'], - "vaccination": ['Download vaccination data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'sanitize_data'], - "testing": ['Download testing data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'], - "jh": ['Downloads data from Johns Hopkins University', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'], - "hospitalization": ['Download hospitalization data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot'], - "sim": ['Download all data needed for simulations', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'make_plot', 'split_berlin', 'rep_date', 'sanitize_data']} + "commuter_official": ['Download commuter data from official sources'], + "vaccination": ['Download vaccination data', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'sanitize_data'], + "testing": ['Download testing data', 'start_date', 'end_date', 'impute_dates', 'moving_average'], + "jh": ['Downloads data from Johns Hopkins University', 'start_date', 'end_date', 'impute_dates', 'moving_average'], + "hospitalization": ['Download hospitalization data', 'start_date', 'end_date', 'impute_dates', 'moving_average'], + "sim": ['Download all data needed for simulations', 'start_date', 'end_date', 'impute_dates', 'moving_average', 'split_berlin', 'rep_date', 'sanitize_data']} try: what_list = cli_dict[what] @@ -300,10 +393,6 @@ def cli(what): parser.add_argument('-o', '--out-folder', type=str, default=out_path_default, help='Defines folder for output.') - parser.add_argument( - '-n', '--no-raw', default=dd.defaultDict['no_raw'], - help='Defines if raw data will be stored for further use.', - action='store_true') if 'start_date' in what_list: if what == 'divi': @@ -334,9 +423,6 @@ def cli(what): parser.add_argument( '-m', '--moving-average', type=int, default=dd.defaultDict['moving_average'], help='Compute 
a moving average of N days over the time series. Default is ' + str(dd.defaultDict['moving_average'])) - if 'make_plot' in what_list: - parser.add_argument('-p', '--make-plot', default=dd.defaultDict['make_plot'], help='Plots the data.', - action='store_true') if 'split_berlin' in what_list: parser.add_argument( '-b', '--split-berlin', default=dd.defaultDict['split_berlin'], @@ -351,14 +437,42 @@ def cli(what): action='store_true') if 'sanitize_data' in what_list: parser.add_argument( - '-sd', '--sanitize_data', type=int, default=dd.defaultDict['sanitize_data'], + '-sd', '--sanitize-data', type=int, default=dd.defaultDict['sanitize_data'], dest='sanitize_data', help='Redistributes cases of every county either based on regions ratios or on thresholds and population' ) - parser.add_argument( - '--no-progress-indicators', - help='Disables all progress indicators (used for downloads etc.).', - action='store_true') + # add optional download options + if '--no-progress-indicators' in sys.argv: + parser.add_argument( + '--no-progress-indicators', dest='show_progress', + help='Disables all progress indicators (used for downloads etc.).', + action='store_false') + + if not {'--no-raw', '-n'}.isdisjoint(sys.argv): + parser.add_argument( + '-n', '--no-raw', + help='Defines if raw data will be stored for further use.', + action='store_true') + + if not {'--make_plot', '-p'}.isdisjoint(sys.argv): + parser.add_argument('-p', '--make-plot', + help='Plots the data.', action='store_true') + + if '--interactive' in sys.argv: + parser.add_argument( + '--interactive', + help='Interactive download (Handle warnings, passwords etc.).', action='store_true') + + if not {'--verbose', '-v', '-vv', '-vvv', '-vvvv', '-vvvvv', '-vvvvvv'}.isdisjoint(sys.argv): + parser.add_argument( + '-v', '--verbose', dest='verbosity_level', + help='Increases verbosity level (Trace, Debug, Info, Warning, Error, Critical, Off).', + action='count', default=0) + + if '--skip-checks' in sys.argv: + 
parser.add_argument( + '--skip-checks', dest='run_checks', action='store_false', + help='Skips sanity checks etc.') if 'username' in what_list: parser.add_argument( @@ -369,12 +483,6 @@ def cli(what): '--password', type=str ) args = vars(parser.parse_args()) - # disable progress indicators globally, if the argument --no-progress-indicators was specified - progress_indicator.ProgressIndicator.disable_indicators( - args["no_progress_indicators"]) - # remove the no_progress_indicators entry from the dict - # (after disabling indicators, its value is no longer usefull) - args.pop("no_progress_indicators") return args @@ -465,7 +573,7 @@ def write_dataframe(df, directory, file_prefix, file_type, param_dict={}): elif file_type == "txt": df.to_csv(out_path, **outFormSpec) - print("Information: Data has been written to", out_path) + default_print('Info', "Data has been written to " + out_path) class DataError(Exception): diff --git a/pycode/memilio-epidata/memilio/epidata/getHospitalizationData.py b/pycode/memilio-epidata/memilio/epidata/getHospitalizationData.py index 29643e7f0f..b7c45d8916 100644 --- a/pycode/memilio-epidata/memilio/epidata/getHospitalizationData.py +++ b/pycode/memilio-epidata/memilio/epidata/getHospitalizationData.py @@ -56,7 +56,7 @@ def hospit_sanity_checks(df): actual_strings_list = df.columns.tolist() # check number of data categories if len(actual_strings_list) != 6: - print("Warning: Number of data categories changed.") + gd.default_print("Warning", "Number of data categories changed.") # these strings need to be in the header test_strings = { @@ -119,7 +119,8 @@ def get_hospitailzations_per_day(seven_days_values): # break after 5 runs to prevent endless loop if run == 5: - print("Can't get hospitalizations per day from incidence.") + gd.default_print( + "Error", "Can't get hospitalizations per day from incidence.") if len(daily_values[daily_values < 0]) > 0: raise gd.DataError('Negative hospitalizations found.') # check that daily values are 
calculated correctly @@ -136,12 +137,11 @@ def get_hospitailzations_per_day(seven_days_values): def get_hospitalization_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot'] + **kwargs ): """! Downloads or reads the RKI hospitalization data and writes them in different files. @@ -160,16 +160,17 @@ def get_hospitalization_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default defined in defaultDict. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. Here Dates are always imputed so False changes nothing. @param moving_average [Currently not used] Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out weekend effects. Default defined in defaultDict. - @param make_plot [currently not used] True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. 
""" impute_dates = True + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw directory = os.path.join(out_folder, 'Germany/') gd.check_dir(directory) @@ -177,9 +178,14 @@ def get_hospitalization_data(read_data=dd.defaultDict['read_data'], filename = "RKIHospitFull" url = "https://raw.githubusercontent.com/robert-koch-institut/COVID-19-Hospitalisierungen_in_Deutschland/master/Aktuell_Deutschland_COVID-19-Hospitalisierungen.csv" path = os.path.join(directory + filename + ".json") - df_raw = gd.get_file(path, url, read_data, param_dict={}, interactive=True) - - hospit_sanity_checks(df_raw) + df_raw = gd.get_file(path, url, read_data, param_dict={}, + interactive=conf.interactive) + + if conf.checks == True: + hospit_sanity_checks(df_raw) + else: + gd.default_print( + 'Warning', "Sanity checks for hospitalization data have not been executed.") if not no_raw: gd.write_dataframe(df_raw, directory, filename, file_format) diff --git a/pycode/memilio-epidata/memilio/epidata/getJHData.py b/pycode/memilio-epidata/memilio/epidata/getJHData.py index c4ad55e3ff..6ae48edb23 100644 --- a/pycode/memilio-epidata/memilio/epidata/getJHData.py +++ b/pycode/memilio-epidata/memilio/epidata/getJHData.py @@ -39,12 +39,10 @@ def get_jh_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=date(2020, 1, 22), end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], - moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot']): + **kwargs): """! Download data from John Hopkins University Data is either downloaded and afterwards stored or loaded from a stored filed. @@ -61,31 +59,33 @@ def get_jh_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. 
@param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default defined in defaultDict. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates [Currently not used] True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. @param moving_average [Currently not used] Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot [Currently not used] True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw if start_date < date(2020, 1, 22): - print("Warning: First data available on 2020-01-22. " - "You asked for " + start_date.strftime("%Y-%m-%d") + - ". Changed it to 2020-01-22.") + gd.default_print("Warning", "First data available on 2020-01-22. " + "You asked for " + start_date.strftime("%Y-%m-%d") + + ". 
Changed it to 2020-01-22.") start_date = date(2020, 1, 22) filename = "FullData_JohnHopkins" url = "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv" path = os.path.join(out_folder, filename + ".json") - df = gd.get_file(path, url, read_data, param_dict={}, interactive=True) + df = gd.get_file(path, url, read_data, param_dict={}, + interactive=conf.interactive) if not no_raw: gd.write_dataframe(df, out_folder, filename, "json") df.rename({'Country/Region': 'CountryRegion', 'Province/State': 'ProvinceState'}, axis=1, inplace=True) - print("Available columns:", df.columns) + gd.default_print("Debug", "Available columns: " + df.columns) # extract subframe of dates df = mdfs.extract_subframe_based_on_dates(df, start_date, end_date) @@ -143,9 +143,6 @@ def get_jh_data(read_data=dd.defaultDict['read_data'], gd.write_dataframe(gb.reset_index(), out_folder, "all_provincestate_jh", file_format) - # print(dfD[dfD.ProvinceState=="Saskatchewan"]) - # print(gb.reset_index()[gb.reset_index().ProvinceState=="Saskatchewan"]) - # TODO: How to handle empty values which become NaN in the beginning but after working on the data its just 0.0 # One solution is to preserve them with : df['b'] = df['b'].astype(str) # However, what to do with the cases where after some times values occur? Do those cases exist? diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 91e2676eea..02887c57e1 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -118,13 +118,13 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, def print_manual_download(filename, url): """! Print download message to ask the user manually download a file. """ - print( 'This script needs manual downloading of files. Please register' ' at corona-datenplatform.com and download ' + filename + ' from ' + url + '. 
Then move it to a folder named raw_data in this directory.') + gd.default_print("Error", + 'This script needs manual downloading of files. Please register' + ' at corona-datenplatform.com and download ' + filename + ' from ' + url + + '. Then move it to a folder named raw_data in this directory.') -def read_files(directory, fine_resolution): +def read_files(directory, fine_resolution, run_checks): """! Reads files from local directory and returns data in dataframes @param directory Directory where data is loaded from. @@ -217,7 +217,7 @@ def read_files(directory, fine_resolution): for tcode in test_codes: for i in [''] + ["_" + str(i) for i in range(1, 6)]: if (df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): - print(tcode+i + " used.") + gd.default_print("Debug", tcode+i + " used.") # end check else: # read aggregated NPIs @@ -252,15 +252,19 @@ def read_files(directory, fine_resolution): # download combinations of npis try: + fname = 'combination_npis_incl_ranking.xlsx' if fine_resolution > 0: df_npis_combinations_pre = pd.read_excel( os.path.join( - directory, 'combination_npis_incl_ranking.xlsx'), engine='openpyxl') + directory, fname), engine='openpyxl') except FileNotFoundError: - print('File not found.') - raise FileNotFoundError + raise FileNotFoundError('File ' + fname + ' not found.') - npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre) + if run_checks: + npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre) + else: + gd.default_print( + 'Warning', "Sanity checks for NPI data have not been executed.") return df_npis_old, df_npis_desc, df_npis_combinations_pre @@ -476,7 +480,8 @@ def get_npi_data(fine_resolution=2, end_date=dd.defaultDict['end_date'], counties_considered=geoger.get_county_ids(), npi_activation_days_threshold=3, - npi_lifting_days_threshold=5 + npi_lifting_days_threshold=5, + **kwargs ): """! 
Loads a certain resolution of recorded NPI data from the Corona Datenplattform and extracts the counties asked for and @@ -521,6 +526,8 @@ def get_npi_data(fine_resolution=2, @param npi_alifting_days_threshold [Default: 5]. Defines necessary number of days below case incidence threshold threshold to lift NPIs. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use # Depending on the federal state and time period, there are # huge differences for number of days before the lifting and activation. @@ -544,9 +551,9 @@ def get_npi_data(fine_resolution=2, # read manual downloaded files from directory df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files( - directory, fine_resolution) + directory, fine_resolution, conf.checks) - print('Download completed.') + gd.default_print('Debug', 'Download completed.') # Compute column index of NPI start (columns with NPIs start with days # which are provided in format dYYYYMMDD). @@ -650,7 +657,8 @@ def get_npi_data(fine_resolution=2, df_npis_combinations[npic_uniq][1] = df_npis_combinations_pre.iloc[np.array(npi_groups_idx[i]), start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values if (df_npis_combinations[npic_uniq][1]-np.transpose(df_npis_combinations[npic_uniq][1])).max() > 0: - print('Error in input file: Please correct combination matrix input.') + gd.default_print( + 'Error', 'Input file does not match with data. 
Please correct combination matrix input.') # make it a dataframe to allow easy removal of code lines and rows # if they are not used later on df_npis_combinations[npic_uniq][1] = pd.DataFrame( @@ -687,7 +695,7 @@ def get_npi_data(fine_resolution=2, directory, 'combinations_npis_cleanoutput.xlsx'), sheet_name=i, engine='openpyxl') if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): - print('Error in combination matrix.') + gd.default_print('Error', 'Error in combination matrix.') del df_in_valid else: df_out.to_excel( @@ -720,7 +728,8 @@ def get_npi_data(fine_resolution=2, if not dummy_a[i] == dummy_c[i]: errors.append(i) if not errors == [0, 1, 2, 3, 4, 5]: - print("Additional errors in consistent naming.") + gd.default_print( + "Error", "Additional errors in consistent naming.") # End of check # correct for consistent naming (mainly done for plotting reasons, @@ -861,11 +870,11 @@ def get_npi_data(fine_resolution=2, for i in range(len(dates_new) - 1)] date_diff_idx = np.where(np.array(date_diff) > 1)[0] if max(date_diff) > 1: - print("Error. Dates missing in data frame:") + gd.default_print("Error", "Dates missing in data frame:") for i in date_diff_idx: - print( - "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + - str(dates_new[i] + timedelta(date_diff[i] - 1))) + gd.default_print("Debug", + "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + + str(dates_new[i] + timedelta(date_diff[i] - 1))) raise gd.DataError('Exiting. Dates missing in data frame.') min_date = [] @@ -1004,8 +1013,6 @@ def get_npi_data(fine_resolution=2, # In order to avoid contradictions, only retain the strictest mentioned # implementation. Incidence-independent is always stricter than any # incidence-dependent implementation. 
- # define if details are printed (probably to be deactivated) - print_details = True for i in range(int(len(df_local_old)/inc_codes)): # check if access is correct @@ -1018,10 +1025,10 @@ def get_npi_data(fine_resolution=2, sum_npi_inc = np.where( df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col:].sum() > 1) - if (len(sum_npi_inc[0]) > 0) and print_details: - print( - 'Reduce multiple prescription in county ' + str(countyID) + - ' for NPI ' + str(npis.loc[inc_codes*i, 'Description'])) + if (len(sum_npi_inc[0]) > 0): + gd.default_print("Trace", + 'Reduce multiple prescription in county ' + str(countyID) + + ' for NPI ' + str(npis.loc[inc_codes*i, 'Description'])) for j in sum_npi_inc[0]: # get lowest index (i.e., strictest implementation of NPI). idx_start = np.where( @@ -1174,11 +1181,9 @@ def get_npi_data(fine_resolution=2, days_deact = np.where( df_local_new_merged.loc[subcode_active, nocombi_code] > 0)[0] if len(days_deact) > 0: - print('Deactivating for ' + - 'County ' + str(countyID)) - print('\t' + str(nocombi_code) + ' due to ' + - str(subcode) + ' on ' + str(len(days_deact)) + ' days.') - print('\n') + gd.default_print("Trace", 'Deactivating for ' + + 'County ' + str(countyID)+'\t' + str(nocombi_code) + ' due to ' + + str(subcode) + ' on ' + str(len(days_deact)) + ' days.\n') # take subcode_active rows as days_deact is # numbering inside subcode_active rows only, # not numbering on the whole df_local_new_merged @@ -1225,22 +1230,22 @@ def get_npi_data(fine_resolution=2, # print progress if countyidx == 1 or countyidx % int( len(counties_considered) / 10) == 0: - print('Progress ' + str(countyidx) + ' / ' + - str(len(counties_considered)) + - '. Estimated time remaining: ' + - str(int(time_remain / 60)) + ' min.') + gd.default_print('Debug', 'Progress ' + str(countyidx) + ' / ' + + str(len(counties_considered)) + + '. 
Estimated time remaining: ' + + str(int(time_remain / 60)) + ' min.') save_interaction_matrix(df_count_deactivation, 'count_deactivation', directory) plot_interaction_matrix('count_deactivation', directory) if counter_cases_start >= len(counties_considered)*0.05: - print('WARNING: DataFrame starts with reported cases > 0 ' - 'for more than 5 percent of the counties to be considered. ' - 'In this case, incidence computation and activation of ' - 'incidence-dependent NPIs cannot be ensured to work correctly. ' - 'Please consider a start date of some weeks ahead of the ' - 'time window to be analyzed for NPI\'s effects.') + gd.default_print('Warning', 'DataFrame starts with reported cases > 0 ' + 'for more than 5 percent of the counties to be considered. ' + 'In this case, incidence computation and activation of ' + 'incidence-dependent NPIs cannot be ensured to work correctly. ' + 'Please consider a start date of some weeks ahead of the ' + 'time window to be analyzed for NPI\'s effects.') save_interaction_matrix(df_count_incid_depend, 'joint_codes_incid_depend', directory) @@ -1250,8 +1255,7 @@ def get_npi_data(fine_resolution=2, plot_interaction_matrix('joint_codes_active', directory) # print sub counters - print('Sub task counters are: ') - print(counters) + gd.default_print('Debug', 'Sub task counters are: '+str(counters)) # reset index and drop old index column df_npis.reset_index(inplace=True) @@ -1278,8 +1282,8 @@ def get_npi_data(fine_resolution=2, start_date_new, end_date_new, fine_resolution) if (a != b): - print('Error in NPI activation computation') - print(a, b, a - b) + gd.default_print('Error', 'Error in NPI activation computation' + + str(a) + str(b) + str(a - b)) #### end validation #### diff --git a/pycode/memilio-epidata/memilio/epidata/getPopulationData.py b/pycode/memilio-epidata/memilio/epidata/getPopulationData.py index 4547083689..b5c13f2470 100644 --- a/pycode/memilio-epidata/memilio/epidata/getPopulationData.py +++ 
b/pycode/memilio-epidata/memilio/epidata/getPopulationData.py @@ -96,8 +96,18 @@ def read_population_data(username, password, read_data, directory): return df_pop_raw +# This function is needed for unittests +# Fakefilesystem has problems with os.path -def manage_credentials(): + +def path_to_credential_file(): + '''Returns path to .ini file where credentials are stored. + The Path can be changed if necessary. + ''' + return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'CredentialsRegio.ini') + + +def manage_credentials(interactive): '''! Manages credentials for regionalstatistik.de (needed for download). A config file inside the epidata folder is either written (if not existent yet) @@ -109,25 +119,32 @@ def manage_credentials(): @return Username and password to sign in at regionalstatistik.de. ''' # path where ini file is found - path = os.path.join(os.path.dirname( - os.path.abspath(__file__)), 'CredentialsRegio.ini') + path = path_to_credential_file() + + gd.default_print( + 'Info', 'No passwaord and/or username for regionalstatistik.de provided. Try to read from .ini file.') # check if .ini file exists if not os.path.exists(path): - print('.ini file not found. Writing CredentialsRegio.ini...') - username = input( "Please enter username for https://www.regionalstatistik.de/genesis/online\n") - password = getpass.getpass( "Please enter password for https://www.regionalstatistik.de/genesis/online\n") - # create file - write_ini = gd.user_choice( message='Do you want the credentials to be stored in an unencrypted .ini file?\n' + 'The next time this function is called, the credentials can be read from that file.') - if write_ini: - string = '[CREDENTIALS]\nUsername = ' + \ username+'\nPassword = '+password - with open(path, 'w+') as file: - file.write(string) + if interactive: + gd.default_print( + 'Info', '.ini file not found. 
Writing CredentialsRegio.ini...') + username = input( + "Please enter username for https://www.regionalstatistik.de/genesis/online\n") + password = getpass.getpass( + "Please enter password for https://www.regionalstatistik.de/genesis/online\n") + # create file + write_ini = gd.user_choice( + message='Do you want the credentials to be stored in an unencrypted .ini file?\n' + + 'The next time this function is called, the credentials can be read from that file.') + if write_ini: + string = '[CREDENTIALS]\nUsername = ' + \ + username+'\nPassword = '+password + with open(path, 'w+') as file: + file.write(string) + else: + raise gd.DataError( + 'No .ini file found. Cannot access regionalstatistik.de for downloading population data.') else: parser = configparser.ConfigParser() @@ -275,10 +292,10 @@ def assign_population_data(df_pop_raw, counties, age_cols, idCounty_idx): elif len(county_id) < 5: pass else: - print('no data for ' + df_pop_raw.loc - [start_idx, dd.EngEng['idCounty']]) raise gd.DataError( - 'Error. County ID in input population data ' + 'No data for ' + df_pop_raw.loc + [start_idx, dd.EngEng['idCounty']] + + 'County ID in input population data ' 'found which could not be assigned.') return df_pop @@ -307,10 +324,10 @@ def test_total_population(df_pop, age_cols): def get_population_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], merge_eisenach=True, username='', - password=''): + password='', + **kwargs): """! Download age-stratified population data for the German counties. The data we use is: @@ -338,8 +355,6 @@ def get_population_data(read_data=dd.defaultDict['read_data'], Default defined in defaultDict. @param out_folder Path to folder where data is written in folder out_folder/Germany. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is written or - not. Default defined in defaultDict. 
Currently not used. @param merge_eisenach [Default: True] or False. Defines whether the counties 'Wartburgkreis' and 'Eisenach' are listed separately or combined as one entity 'Wartburgkreis'. @@ -347,10 +362,13 @@ def get_population_data(read_data=dd.defaultDict['read_data'], @param password Password to sign in at regionalstatistik.de. @return DataFrame with adjusted population data for all ages to current level. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + # If no username or password is provided, the credentials are either read from an .ini file or, # if the file does not exist they have to be given as user input. if (username is None) or (password is None): - username, password = manage_credentials() + username, password = manage_credentials(conf.interactive) directory = os.path.join(out_folder, 'Germany') gd.check_dir(directory) diff --git a/pycode/memilio-epidata/memilio/epidata/getSimulationData.py b/pycode/memilio-epidata/memilio/epidata/getSimulationData.py index b3a49b8a2c..cb22dbe27f 100644 --- a/pycode/memilio-epidata/memilio/epidata/getSimulationData.py +++ b/pycode/memilio-epidata/memilio/epidata/getSimulationData.py @@ -37,24 +37,23 @@ def print_error(text): - print('Error: Something went wrong while getting ' + text + - ' data. This was likely caused by a changed file format' - ' of the source material. Please report this as an issue. ' + text + - ' data could not be stored correctly.') + gd.default_print('Error', 'Something went wrong while getting ' + text + + ' data. This was likely caused by a changed file format' + ' of the source material. Please report this as an issue. 
' + text + + ' data could not be stored correctly.') def get_simulation_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot'], split_berlin=dd.defaultDict['split_berlin'], rep_date=dd.defaultDict['rep_date'], - sanitize_data=dd.defaultDict['sanitize_data'] + sanitize_data=dd.defaultDict['sanitize_data'], + **kwargs ): """! Downloads all data from external sources @@ -68,17 +67,19 @@ def get_simulation_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. Default defined in defaultDict. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default 2020-01-01. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. @param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. @param split_berlin True or False. Defines if Berlin's disctricts are kept separated or get merged. Default defined in defaultDict. @param rep_date True or False. Defines if reporting date or reference date is taken into dataframe. 
Default defined in defaultDict. @param sanitize_data Value in {0,1,2,3}. Redistributes cases of every county either based on regions' ratios or on thresholds and population. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + make_plot = conf.plot arg_dict_all = { "read_data": read_data, "file_format": file_format, @@ -99,25 +100,25 @@ def get_simulation_data(read_data=dd.defaultDict['read_data'], try: getCaseData.get_case_data(**arg_dict_cases) except Exception as exp: - print(str(type(exp).__name__) + ": " + str(exp)) + gd.default_print('Error', str(type(exp).__name__) + ": " + str(exp)) print_error('case') try: getPopulationData.get_population_data(**arg_dict_all) except Exception as exp: - print(str(type(exp).__name__) + ": " + str(exp)) + gd.default_print('Error', str(type(exp).__name__) + ": " + str(exp)) print_error('population') try: getDIVIData.get_divi_data(**arg_dict_divi) except Exception as exp: - print(str(type(exp).__name__) + ": " + str(exp)) + gd.default_print('Error', str(type(exp).__name__) + ": " + str(exp)) print_error('DIVI') try: getVaccinationData.get_vaccination_data(**arg_dict_vacc) except Exception as exp: - print(str(type(exp).__name__) + ": " + str(exp)) + gd.default_print('Error', str(type(exp).__name__) + ": " + str(exp)) print_error('vaccination') diff --git a/pycode/memilio-epidata/memilio/epidata/getTestingData.py b/pycode/memilio-epidata/memilio/epidata/getTestingData.py index 1eb4909175..d17067d889 100644 --- a/pycode/memilio-epidata/memilio/epidata/getTestingData.py +++ b/pycode/memilio-epidata/memilio/epidata/getTestingData.py @@ -117,12 +117,11 @@ def transform_weeks_to_dates(df_test): def get_testing_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], 
impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot']): + **kwargs): """! Downloads the RKI testing data and provides positive rates of testing data in different ways. Since positive rates also implicitly provide information on testing numbers while the opposite is @@ -156,15 +155,17 @@ def get_testing_data(read_data=dd.defaultDict['read_data'], @param read_data True or False. Defines if data is read from file or downloaded. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default defined in defaultDict. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. Defines if values for dates without new information are imputed. Default defined in defaultDict. At the moment they are always imputed. @param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. 
""" + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + # data for all dates is automatically added impute_dates = True @@ -265,7 +266,7 @@ def get_testing_data(read_data=dd.defaultDict['read_data'], gd.write_dataframe(df_test[0], directory, filename, file_format) # plot country-wide positive rates - if make_plot: + if conf.plot: # make plot customPlot.plot_multiple_series( df_test[0][dd.EngEng['date']], @@ -287,7 +288,7 @@ def get_testing_data(read_data=dd.defaultDict['read_data'], gd.write_dataframe(df_test[1], directory, filename, file_format) # plot positive rates of federal states - if make_plot: + if conf.plot: # make plot customPlot.plot_multiple_series( df_test[0][dd.EngEng['date']], diff --git a/pycode/memilio-epidata/memilio/epidata/getVaccinationData.py b/pycode/memilio-epidata/memilio/epidata/getVaccinationData.py index f622114de5..8299194ebe 100644 --- a/pycode/memilio-epidata/memilio/epidata/getVaccinationData.py +++ b/pycode/memilio-epidata/memilio/epidata/getVaccinationData.py @@ -37,12 +37,12 @@ pd.options.mode.copy_on_write = True -def download_vaccination_data(read_data, filename, directory): +def download_vaccination_data(read_data, filename, directory, interactive): url = "https://raw.githubusercontent.com/robert-koch-institut/COVID-19-Impfungen_in_Deutschland/master/Deutschland_Landkreise_COVID-19-Impfungen.csv" path = os.path.join(directory + filename + ".json") df_data = gd.get_file(path, url, read_data, param_dict={'dtype': { - 'LandkreisId_Impfort': "string", 'Altersgruppe': "string", 'Impfschutz': int, 'Anzahl': int}}, interactive=True) + 'LandkreisId_Impfort': "string", 'Altersgruppe': "string", 'Impfschutz': int, 'Anzahl': int}}, interactive=interactive) return df_data @@ -367,7 +367,8 @@ def sanitizing_extrapolation_mobility( b = df[(df.ID_County == id) & ( df.Age_RKI == age)].loc[:, column_names].iloc[-1] if sum(a-b) > 1e-8: - print("Error in: " + str(id) + " " + str(age)) + 
gd.default_print( + "Error", "Cumulative sum error in: " + str(id) + " " + str(age)) ### end of to be removed ### return df @@ -449,8 +450,8 @@ def extrapolate_age_groups_vaccinations( # test if number of vaccinations in current county are equal in old and new dataframe for random chosen date for vacc in column_names: if total_county_df[total_county_df[dd.EngEng['date']] == '2022-05-10'][vacc].sum() - vacc_df[vacc_df[dd.EngEng['date']] == '2022-05-10'][vacc].sum() > 1e-5: - print( - "Error in transformation...") + gd.default_print("Error", + "Error in transformation...") # merge all county specific dataframes df_data_ageinf_county_cs = pd.concat(df_data_ageinf_county_cs) @@ -464,13 +465,12 @@ def extrapolate_age_groups_vaccinations( def get_vaccination_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], - no_raw=dd.defaultDict['no_raw'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], impute_dates=True, moving_average=dd.defaultDict['moving_average'], - make_plot=dd.defaultDict['make_plot'], - sanitize_data=dd.defaultDict['sanitize_data'] + sanitize_data=dd.defaultDict['sanitize_data'], + **kwargs ): """! Downloads the RKI vaccination data and provides different kind of structured data. @@ -498,14 +498,12 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], Here Data is always downloaded from the internet. @param file_format File format which is used for writing the data. Default defined in defaultDict. @param out_folder Folder where data is written to. Default defined in defaultDict. - @param no_raw True or False. Defines if unchanged raw data is saved or not. Default defined in defaultDict. @param start_date Date of first date in dataframe. Default defined in defaultDict. @param end_date Date of last date in dataframe. Default defined in defaultDict. @param impute_dates True or False. 
Defines if values for dates without new information are imputed. Here Dates are always imputed so False changes nothing. @param moving_average Integers >=0. Applies an 'moving_average'-days moving average on all time series to smooth out effects of irregular reporting. Default defined in defaultDict. - @param make_plot True or False. Defines if plots are generated with matplotlib. Default defined in defaultDict. @param sanitize_data Value in {0,1,2,3}; Default: 1. For many counties, vaccination data is not correctly attributed to home locations of vaccinated persons. If 'sanitize_data' is set to larger 0, this is @@ -521,9 +519,14 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], average on the corresponding vaccination ratios on county and federal state level. """ + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use + no_raw = conf.no_raw + # data for all dates is automatically added if impute_dates == False: - print('Setting impute_dates = True as data for all dates is automatically added.') + gd.default_print( + 'Warning', 'Setting impute_dates = True as data for all dates is automatically added.') impute_dates = True directory = os.path.join(out_folder, 'Germany/') @@ -531,7 +534,11 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], filename = "RKIVaccFull" - df_data = download_vaccination_data(read_data, filename, directory) + df_data = download_vaccination_data( + read_data, filename, directory, conf.interactive) + + if conf.checks: + sanity_checks(df_data) if not no_raw: gd.write_dataframe(df_data, directory, filename, "json") @@ -575,9 +582,8 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], df_data[dd.EngEng['idCounty'] ] = df_data[dd.EngEng['idCounty']].astype(int) except ValueError: - print('Data items in ID_County could not be converted to integer. 
' - 'Imputation and/or moving_average computation will FAIL.') - raise + gd.default_print("Error", 'Data items in ID_County could not be converted to integer. ' + 'Imputation and/or moving_average computation will FAIL.') # NOTE: the RKI vaccination table contains about # 180k 'complete' vaccinations in id 17000 Bundesressorts, which @@ -604,8 +610,8 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], min_age_old.append(int(age.split('+')[0])) else: extrapolate_agegroups = False - print("Error in provided age groups from vaccination data; " - "can not extrapolate to infection number age groups.") + gd.default_print( + "Error", "can not extrapolate provided age groups from vaccination data to infection number age groups.") min_age_old.append(max_age_all) # get population data for all countys (TODO: better to provide a corresponding method for the following lines in getPopulationData itself) @@ -614,7 +620,8 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], directory + "county_current_population.json") # pandas>1.5 raise FileNotFoundError instead of ValueError except (ValueError, FileNotFoundError): - print("Population data was not found. Download it from the internet.") + gd.default_print( + "Info", "Population data was not found. 
Download it from the internet.")
         population = gpd.get_population_data(
             read_data=False, file_format=file_format,
             out_folder=out_folder, no_raw=no_raw,
             merge_eisenach=True)
@@ -632,8 +639,8 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'],
             min_age_pop.append(0)
         else:
             extrapolate_agegroups = False
-            print("Error in provided age groups from population data;"
-                  " can not extrapolate to infection number age groups.")
+            gd.default_print(
+                "Error", "can not extrapolate provided age groups from population data to infection number age groups.")
             min_age_pop.append(max_age_all)
 
     # new age groups, here taken from definition of RKI infection data
@@ -710,7 +717,7 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'],
             population[unique_age_groups_pop].sum(axis=1) -
             population_all_ages[[str(i) for i in min_all_ages]].sum(
                 axis=1))) > 1e-8:
-        print("ERROR")
+        gd.default_print("Error", "Population does not match expectations")
 
     population_old_ages = pd.DataFrame(population[dd.EngEng['idCounty']])
     for i in range(len(age_old_to_all_ages_indices)):
@@ -813,11 +820,13 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'],
 
     if sanitize_data == 1 or sanitize_data == 2:
         if sanitize_data == 1:
-            print('Sanitizing activated: Using federal state average values.')
+            gd.default_print(
+                'Info', 'Sanitizing activated: Using federal state average values.')
             to_county_map = geoger.get_stateid_to_countyids_map(
                 merge_eisenach=True)
         elif sanitize_data == 2:
-            print('Sanitizing activated: Using intermediate region average values.')
+            gd.default_print(
+                'Info', 'Sanitizing activated: Using intermediate region average values.')
             to_county_map = geoger.get_intermediateregionid_to_countyids_map(
                 merge_eisenach=True)
@@ -826,8 +835,8 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'],
                 unique_age_groups_old, vacc_column_names,
                 population_old_ages)
     elif sanitize_data == 3:
-        print(
-            'Sanitizing activated: Using mobility-based vaccination redistribution 
approach.') + gd.default_print('Info', + 'Sanitizing activated: Using mobility-based vaccination redistribution approach.') # get neighbors based on mobility pattern and store # commuter inflow from other counties as first weight to distribute # vaccinations from vaccination county to extrapolated home counties @@ -846,9 +855,9 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], min_date=start_date, max_date=end_date) else: spinner.stop() - print('Sanitizing deactivated.') + gd.default_print('Info', 'Sanitizing deactivated.') - if make_plot: + if conf.plot: # have a look extrapolated vaccination ratios (TODO: create plotting for production) # aggregate total number of vaccinations per county and age group latest_date = df_data_agevacc_county_cs[dd.EngEng["date"]][len( @@ -880,7 +889,7 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], directory, filename, file_format) # make plot of absolute numbers original age resolution - if make_plot: + if conf.plot: # extract (dummy) date column to plt date_vals = df_data_agevacc_county_cs.loc[ (df_data_agevacc_county_cs[dd.EngEng['ageRKI']] == @@ -971,7 +980,7 @@ def get_vaccination_data(read_data=dd.defaultDict['read_data'], directory, filename, file_format) # make plot of relative numbers of original and extrapolated age resolution - if make_plot: + if conf.plot: # merge Eisenach... 
population_new_ages = geoger.merge_df_counties_all(
             population_new_ages, sorting=[dd.EngEng["idCounty"]],
diff --git a/pycode/memilio-epidata/memilio/epidata/modifyDataframeSeries.py b/pycode/memilio-epidata/memilio/epidata/modifyDataframeSeries.py
index d84fc3ddf3..c34564fec4 100644
--- a/pycode/memilio-epidata/memilio/epidata/modifyDataframeSeries.py
+++ b/pycode/memilio-epidata/memilio/epidata/modifyDataframeSeries.py
@@ -29,6 +29,7 @@
 import pandas as pd
 
 from memilio.epidata import defaultDict as dd
+from memilio.epidata import getDataIntoPandasDataFrame as gd
 
 # activate CoW for more predictable behaviour of pandas DataFrames
 pd.options.mode.copy_on_write = True
@@ -183,7 +184,7 @@ def impute_and_reduce_df(
                     # at all (e.g., many counties do not have had any kind of
                     # refreshing vaccinations so far.) Then, the following warning
                     # is misleading.
-                    # print('Warning: Tuple ' + str(ids) + ' not found in local data frame. Imputing zeros.')
+                    # gd.default_print('Warning', 'Tuple ' + str(ids) + ' not found in local data frame. Imputing zeros.')
                     # create zero values for non-existent time series
                     values = {}
                     counter = 0
@@ -330,8 +331,8 @@ def create_intervals_mapping(from_lower_bounds, to_lower_bounds):
     """
     if (from_lower_bounds[0] != to_lower_bounds[0]
             or from_lower_bounds[-1] != to_lower_bounds[-1]):
-        print("Range of intervals mapped from is different than range of " +
-              "intervals mapped to. Therefore, empty entries are possible.")
+        gd.default_print("Warning", "Range of intervals mapped from is different than range of " +
+                         "intervals mapped to. Therefore, empty entries are possible.")
 
     extended_begin = False
     extended_end = False
@@ -451,8 +452,8 @@ def fit_age_group_intervals(
         else:
             raise ValueError("Undefined entry for one age group in age_out")
     if min_entry_out < min_entry_in or max_entry_out > max_entry_in:
-        print(
-            "Data from input data frame does not fit to desired output.
Required data that is missing is interpreted as zero.") + gd.default_print("Warning", + "Data from input data frame does not fit to desired output. Required data that is missing is interpreted as zero.") if max_entry_in not in age_out_min: age_out_min.append(max_entry_in) @@ -497,8 +498,8 @@ def fit_age_group_intervals( raise ValueError( "Undefined entry for one age group in population data") if min_entry_out < min_entry_in or max_entry_out > max_entry_in: - print( - "Data from input data frame does not fit to population data. Required data that is missing is interpreted as zero.") + gd.default_print("Warning", + "Data from input data frame does not fit to population data. Required data that is missing is interpreted as zero.") if max_entry_in not in age_pop_min: age_pop_min.append(max_entry_in) diff --git a/pycode/memilio-epidata/memilio/epidata/transformWeatherData.py b/pycode/memilio-epidata/memilio/epidata/transformWeatherData.py index 0fe5962382..1478ce1e0a 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformWeatherData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformWeatherData.py @@ -38,10 +38,10 @@ def transformWeatherData(read_data=dd.defaultDict['read_data'], out_folder=dd.defaultDict['out_folder'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], - make_plot=dd.defaultDict['make_plot'], moving_average=dd.defaultDict['moving_average'], merge_berlin=True, - merge_eisenach=False + merge_eisenach=False, + **kwargs ): """! ... @param file_format File format which is used for writing the data. @@ -52,11 +52,11 @@ def transformWeatherData(read_data=dd.defaultDict['read_data'], of stored data frames. @param end_date [Default = '', taken from read data] End date of stored data frames. - @param make_plot False [Default] or True. Defines if plots are - generated with matplotlib. @param moving_average 0 [Default] or Number > 0. Defines the number of days for which a centered moving average is computed. 
""" + conf = gd.Conf(out_folder, **kwargs) + out_folder = conf.path_to_use directory = out_folder directory = os.path.join(directory, 'Germany/') @@ -180,8 +180,8 @@ def transformWeatherData(read_data=dd.defaultDict['read_data'], except KeyError: pass - print( - "Time needed: " + str(time.perf_counter()-start_time) + " sec") + gd.default_print("Info", + "Time needed: " + str(time.perf_counter()-start_time) + " sec") #### start validation #### @@ -198,7 +198,7 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - transformWeatherData(read_data=False, make_plot=True, moving_average=30) + transformWeatherData(read_data=False, moving_average=30) if __name__ == "__main__": diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_cleandata.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_cleandata.py index 9027c13e21..5cdf3f697e 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_cleandata.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_cleandata.py @@ -911,8 +911,6 @@ def test_cli_default(self): [all_data, cases, jh, popul, divi, vacc, commuter, testing, hospitalization, json, hdf5, txt, out_path] = cd.cli() - print([all_data, cases, jh, popul, hdf5, out_path]) - self.assertEqual(all_data, False) self.assertEqual(cases, False) self.assertEqual(jh, False) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_customPlot.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_customPlot.py index 3df996358f..e9de23fc1d 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_customPlot.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_customPlot.py @@ -17,7 +17,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################# -import os import unittest from unittest.mock import MagicMock, patch diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_geoModificationGermany.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_geoModificationGermany.py index ab58cfd75b..93dc4c2210 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_geoModificationGermany.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_geoModificationGermany.py @@ -24,11 +24,14 @@ from pyfakefs import fake_filesystem_unittest from memilio.epidata import geoModificationGermany as geoger -from memilio.epidata import progress_indicator +from memilio.epidata import getDataIntoPandasDataFrame as gd class Test_geoModificationGermany(fake_filesystem_unittest.TestCase): + # set verbosity level to Debug to check prints + gd.Conf.v_level = 'Debug' + list_int_state_ids = [i+1 for i in range(16)] list_str_state_ids = [str(i+1).zfill(2) for i in range(16)] list_str_states = [ @@ -181,7 +184,6 @@ class Test_geoModificationGermany(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) def test_get_state_IDs(self): # zfill is false @@ -313,7 +315,7 @@ def test_check_for_all_counties(self, mock_print): self.assertFalse(geoger.check_for_all_counties( unique_county_list, False, False)) mock_print.assert_called_with( - 'Downloaded data is not complete. Missing 412 counties.') + 'Error: Downloaded data is not complete. 
Missing 412 counties.') # check with more counties unique_county_list = geoger.get_county_ids(False, False, False) @@ -322,24 +324,26 @@ def test_check_for_all_counties(self, mock_print): self.assertTrue(geoger.check_for_all_counties( unique_county_list, False, False)) mock_print.assert_called_with( - 'Source data frame contains more counties than official ' + 'Warning: Source data frame contains ' + + str(len(testlist)) + ' more counties than official ' 'county list. This could be OK, please verify yourself.') - # check without some counries + # check without some countries unique_county_list = geoger.get_county_ids(False, False, False) testlist = (1001, 3456, 10041) for i in range(0, len(testlist)): unique_county_list.remove(testlist[i]) self.assertFalse(geoger.check_for_all_counties( unique_county_list, False, False)) - mock_print.assert_called_with('Missing counties: [3456, 10041, 1001]') + mock_print.assert_called_with( + 'Info: Missing counties: [3456, 10041, 1001]') # check without merged counties unique_county_list = geoger.get_county_ids(True, True, False) self.assertFalse(geoger.check_for_all_counties( unique_county_list, False, False)) mock_print.assert_called_with( - 'Downloaded data is not complete. Missing 12 counties.') + 'Error: Downloaded data is not complete. 
Missing 12 counties.') def test_get_countyid_to_stateid_map(self): diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseData.py index 36d3c47e4c..27a23e1253 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseData.py @@ -29,7 +29,6 @@ from memilio.epidata import defaultDict as dd from memilio.epidata import getCaseData as gcd from memilio.epidata import getDataIntoPandasDataFrame as gd -from memilio.epidata import progress_indicator class TestGetCaseData(fake_filesystem_unittest.TestCase): @@ -138,7 +137,6 @@ class TestGetCaseData(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) def write_case_data(self, out_folder): # write dataset for reading data @@ -713,7 +711,7 @@ def test_no_raw(self, mock_file): def test_check_for_completeness(self): empty_df = pd.DataFrame() - self.assertEqual(gcd.check_for_completeness(empty_df), False) + self.assertEqual(gcd.check_for_completeness(empty_df, True), False) @patch('memilio.epidata.getDataIntoPandasDataFrame.get_file') def test_rep_date(self, mock_file): diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseDatawithEstimations.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseDatawithEstimations.py index 5514ca1e91..d9d50fff7f 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseDatawithEstimations.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCaseDatawithEstimations.py @@ -33,6 +33,7 @@ class TestGetCaseDatawithEstimations(fake_filesystem_unittest.TestCase): path = '/home/CaseEstimationData' + gd.Conf.v_level = 'Debug' # Notice data is not realistic str_whole_country_Germany_jh = \ @@ -563,7 +564,7 @@ def test_except_non_existing_file(self, mock_print): 
read_data=read_data, file_format=file_format, out_folder=out_folder, no_raw=no_raw, impute_dates=impute_dates, moving_average=moving_average, make_plot=make_plot, - split_berlin=split_berlin, rep_date=rep_date) + split_berlin=split_berlin, rep_date=rep_date, verbosity_level='Debug') # print is called 9 times, because no file exists self.assertEqual(len(mock_print.mock_calls), 9) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCommuterMobility.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCommuterMobility.py index 95ea3124f5..89cc67392a 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCommuterMobility.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getCommuterMobility.py @@ -28,7 +28,6 @@ from memilio.epidata import geoModificationGermany as geoger from memilio.epidata import getCommuterMobility as gcm -from memilio.epidata import progress_indicator class TestCommuterMigration(fake_filesystem_unittest.TestCase): @@ -71,13 +70,12 @@ class TestCommuterMigration(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) @patch('builtins.print') def test_verify_sorted(self, mock_print): self.assertEqual(True, gcm.verify_sorted(self.test_countykey_list)) self.assertEqual(False, gcm.verify_sorted(self.test_countykey_list2)) - Errorcall = ('Error. 
Input list not sorted.') + Errorcall = ('Error: Input list not sorted.') mock_print.assert_called_with(Errorcall) @patch('builtins.print') @@ -85,7 +83,7 @@ def test_assign_geographical_entities(self, mock_print): ( countykey2govkey, countykey2localnumlist, gov_county_table, state_gov_table) = gcm.assign_geographical_entities( - self.countykey_list, self.govkey_list) + self.countykey_list, self.govkey_list, True) for item in self.test_countykey2govkey.keys(): self.assertEqual( self.test_countykey2govkey.get(item), @@ -106,7 +104,7 @@ def test_assign_geographical_entities(self, mock_print): # test case with not matching countykey and govkey lists (countykey2govkey, countykey2localnumlist, gov_county_table, state_gov_table) = gcm.assign_geographical_entities( - self.test_countykey_list, self.test_govkey_list) + self.test_countykey_list, self.test_govkey_list, True) self.assertEqual(countykey2govkey, collections.OrderedDict()) self.assertEqual(countykey2localnumlist, collections.OrderedDict()) self.assertEqual(gov_county_table, [ @@ -115,18 +113,18 @@ def test_assign_geographical_entities(self, mock_print): # test case with different number of data gcm.assign_geographical_entities( - self.test_countykey_list, self.govkey_list) - Errorcall = ('Error. Number of government regions wrong.') + self.test_countykey_list, self.govkey_list, True) + Errorcall = ('Error: Number of government regions wrong.') mock_print.assert_called_with(Errorcall) @patch('memilio.epidata.getPopulationData.get_population_data', return_value=df_pop) - @patch('builtins.input', return_value='y') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) def test_commuter_data(self, mock_input, mock_popul): """! Tests migration data by some randomly chosen tests. 
""" df_commuter_migration = gcm.get_commuter_data( - out_folder=self.path, ref_year=2022) + out_folder=self.path, ref_year=2022, interactive=True) # just do some tests on randomly chosen migrations @@ -153,7 +151,7 @@ def test_commuter_data(self, mock_input, mock_popul): self.assertEqual(df_commuter_migration.loc[city_from, city_to], 29) @patch('memilio.epidata.getPopulationData.get_population_data', return_value=df_pop) - @patch('builtins.input', return_value='y') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) @patch('builtins.print') def test_get_neighbors_mobility(self, mock_print, mock_input, mock_popul): @@ -161,7 +159,7 @@ def test_get_neighbors_mobility(self, mock_print, mock_input, mock_popul): # direction = both (countykey_list, commuter_all) = gcm.get_neighbors_mobility( testcountyid, direction='both', abs_tol=0, rel_tol=0, - tol_comb='or', out_folder=self.path) + tol_comb='or', out_folder=self.path, interactive=True) self.assertEqual(len(countykey_list), 398) self.assertEqual(271, commuter_all[0]) self.assertEqual(2234, commuter_all[9]) @@ -171,7 +169,7 @@ def test_get_neighbors_mobility(self, mock_print, mock_input, mock_popul): # direction = in (countykey_list, commuter_all) = gcm.get_neighbors_mobility( testcountyid, direction='in', abs_tol=0, rel_tol=0, - tol_comb='or', out_folder=self.path) + tol_comb='or', out_folder=self.path, interactive=True) self.assertEqual(len(countykey_list), 393) self.assertEqual(70, commuter_all[0]) self.assertEqual(892, commuter_all[9]) @@ -180,7 +178,7 @@ def test_get_neighbors_mobility(self, mock_print, mock_input, mock_popul): # direction = out (countykey_list, commuter_all) = gcm.get_neighbors_mobility( testcountyid, direction='out', abs_tol=0, rel_tol=0, - tol_comb='or', out_folder=self.path) + tol_comb='or', out_folder=self.path, interactive=True) self.assertEqual(len(countykey_list), 378) self.assertEqual(201, commuter_all[0]) self.assertEqual(1342, commuter_all[9]) diff --git 
a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py index 52d119f20b..532809b382 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py @@ -78,68 +78,54 @@ def test_cli_correct_default(self): read_data = arg_dict["read_data"] file_format = arg_dict["file_format"] out_folder = arg_dict["out_folder"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert no_raw == dd.defaultDict['no_raw'] arg_dict = gd.cli("jh") read_data = arg_dict["read_data"] file_format = arg_dict["file_format"] out_folder = arg_dict["out_folder"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert no_raw == dd.defaultDict['no_raw'] arg_dict = gd.cli("cases") read_data = arg_dict["read_data"] file_format = arg_dict["file_format"] out_folder = arg_dict["out_folder"] impute_dates = arg_dict["impute_dates"] - make_plot = arg_dict["make_plot"] moving_average = arg_dict["moving_average"] split_berlin = arg_dict["split_berlin"] - no_raw = arg_dict["no_raw"] rep_date = arg_dict["rep_date"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert make_plot == dd.defaultDict['make_plot'] assert split_berlin == dd.defaultDict['split_berlin'] assert moving_average == dd.defaultDict['moving_average'] assert impute_dates == dd.defaultDict['impute_dates'] - assert no_raw == dd.defaultDict['no_raw'] assert rep_date == dd.defaultDict['rep_date'] arg_dict = gd.cli("cases_est") read_data = arg_dict["read_data"] file_format = 
arg_dict["file_format"] - make_plot = arg_dict["make_plot"] out_folder = arg_dict["out_folder"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert no_raw == dd.defaultDict['no_raw'] - assert make_plot == dd.defaultDict['make_plot'] arg_dict = gd.cli("commuter_official") read_data = arg_dict["read_data"] file_format = arg_dict["file_format"] out_folder = arg_dict["out_folder"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert no_raw == dd.defaultDict['no_raw'] arg_dict = gd.cli("divi") read_data = arg_dict["read_data"] @@ -149,7 +135,6 @@ def test_cli_correct_default(self): start_date = arg_dict["start_date"] impute_dates = arg_dict["impute_dates"] moving_average = arg_dict["moving_average"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] @@ -158,29 +143,22 @@ def test_cli_correct_default(self): assert start_date == date(2020, 4, 24) assert impute_dates == dd.defaultDict['impute_dates'] assert moving_average == dd.defaultDict['moving_average'] - assert no_raw == dd.defaultDict['no_raw'] arg_dict = gd.cli("sim") - [read_data, file_format, out_folder, end_date, make_plot, impute_dates, - moving_average, split_berlin, start_date, no_raw] read_data = arg_dict["read_data"] file_format = arg_dict["file_format"] out_folder = arg_dict["out_folder"] end_date = arg_dict["end_date"] - make_plot = arg_dict["make_plot"] start_date = arg_dict["start_date"] impute_dates = arg_dict["impute_dates"] moving_average = arg_dict["moving_average"] split_berlin = arg_dict["split_berlin"] - no_raw = arg_dict["no_raw"] assert read_data == dd.defaultDict['read_data'] assert file_format == dd.defaultDict['file_format'] assert out_folder == out_path_default - assert 
no_raw == dd.defaultDict['no_raw'] assert end_date == dd.defaultDict['end_date'] assert impute_dates == dd.defaultDict['impute_dates'] - assert make_plot == dd.defaultDict['make_plot'] assert moving_average == dd.defaultDict['moving_average'] assert split_berlin == dd.defaultDict['split_berlin'] assert start_date == dd.defaultDict['start_date'] @@ -270,21 +248,19 @@ def test_cli_set_different_values(self): assert no_raw == True test_args = ["prog", '--read-data', '--out-folder', folder, - '--file-format', 'hdf5', '--make-plot', '--split-berlin', - '--moving-average', 0, '--no-raw', '--impute-dates'] + '--file-format', 'hdf5', '--split-berlin', + '--moving-average', 0, '--impute-dates'] with patch.object(sys, 'argv', test_args): arg_dict = gd.cli("cases") - [read_data, file_format, out_folder, impute_dates, make_plot, - moving_average, split_berlin, no_raw, rep_date] = [ + [read_data, file_format, out_folder, impute_dates, + moving_average, split_berlin, rep_date] = [ arg_dict["read_data"], arg_dict["file_format"], arg_dict["out_folder"], arg_dict["impute_dates"], - arg_dict["make_plot"], arg_dict["moving_average"], arg_dict["split_berlin"], - arg_dict["no_raw"], arg_dict["rep_date"]] assert read_data == True @@ -293,65 +269,55 @@ def test_cli_set_different_values(self): assert impute_dates == True assert split_berlin == True assert moving_average == 0 - assert make_plot == True - assert no_raw == True assert rep_date == False test_args = ["prog", '--read-data', '--out-folder', - folder, '--file-format', 'json', '--make-plot'] + folder, '--file-format', 'json'] with patch.object(sys, 'argv', test_args): arg_dict = gd.cli("cases_est") - [read_data, file_format, out_folder, no_raw, make_plot] = [ + [read_data, file_format, out_folder] = [ arg_dict["read_data"], arg_dict["file_format"], - arg_dict["out_folder"], - arg_dict["no_raw"], - arg_dict["make_plot"]] + arg_dict["out_folder"]] assert read_data == True assert file_format == 'json' assert out_folder == 
"some_folder" - assert make_plot == True - assert no_raw == False test_args = [ "prog", '--out-folder', folder, '--file-format', 'json', - '--start-date', '2020-11-24', '--end-date', '2020-11-26', '-n'] + '--start-date', '2020-11-24', '--end-date', '2020-11-26'] with patch.object(sys, 'argv', test_args): arg_dict = gd.cli("divi") - [read_data, file_format, out_folder, end_date, start_date, - no_raw] = [arg_dict["read_data"], - arg_dict["file_format"], - arg_dict["out_folder"], - arg_dict["end_date"], - arg_dict["start_date"], - arg_dict["no_raw"]] + [read_data, file_format, out_folder, end_date, start_date] = [ + arg_dict["read_data"], + arg_dict["file_format"], + arg_dict["out_folder"], + arg_dict["end_date"], + arg_dict["start_date"]] assert read_data == dd.defaultDict['read_data'] assert file_format == 'json' assert out_folder == "some_folder" assert end_date == date(2020, 11, 26) assert start_date == date(2020, 11, 24) - assert no_raw == True test_args = [ "prog", '--out-folder', folder, '--file-format', 'json', - '--make-plot', '--start-date', '2020-11-24', '--end-date', + '--start-date', '2020-11-24', '--end-date', '2020-11-26'] with patch.object(sys, 'argv', test_args): arg_dict = gd.cli("sim") - [read_data, file_format, out_folder, no_raw, end_date, - impute_dates, make_plot, moving_average, split_berlin, + [read_data, file_format, out_folder, end_date, + impute_dates, moving_average, split_berlin, start_date] = [arg_dict["read_data"], arg_dict["file_format"], arg_dict["out_folder"], - arg_dict["no_raw"], arg_dict["end_date"], arg_dict["impute_dates"], - arg_dict["make_plot"], arg_dict["moving_average"], arg_dict["split_berlin"], arg_dict["start_date"]] @@ -361,11 +327,9 @@ def test_cli_set_different_values(self): assert out_folder == "some_folder" assert end_date == date(2020, 11, 26) assert start_date == date(2020, 11, 24) - assert make_plot == True assert split_berlin == dd.defaultDict['split_berlin'] assert moving_average == 
dd.defaultDict['moving_average'] assert impute_dates == dd.defaultDict['impute_dates'] - assert no_raw == False def test_append_filename(self): test_moving_average = 2 @@ -490,15 +454,13 @@ def test_call_functions( arg_dict_all = { "read_data": dd.defaultDict['read_data'], "file_format": dd.defaultDict['file_format'], - "out_folder": os.path.join(dd.defaultDict['out_folder']), - 'no_raw': dd.defaultDict["no_raw"]} + "out_folder": os.path.join(dd.defaultDict['out_folder'])} arg_dict_data_download = { "start_date": dd.defaultDict['start_date'], "end_date": dd.defaultDict['end_date'], "impute_dates": dd.defaultDict['impute_dates'], - "moving_average": dd.defaultDict['moving_average'], - "make_plot": dd.defaultDict['make_plot']} + "moving_average": dd.defaultDict['moving_average']} arg_dict_cases = { **arg_dict_all, **arg_dict_data_download, diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getHospitalizationData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getHospitalizationData.py index 8ec7ebb6fb..0bf01bafa7 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getHospitalizationData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getHospitalizationData.py @@ -28,7 +28,6 @@ from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import getHospitalizationData as ghd -from memilio.epidata import progress_indicator class TestGetHospitalizationData(fake_filesystem_unittest.TestCase): @@ -74,7 +73,6 @@ class TestGetHospitalizationData(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) @patch('builtins.print') def test_divi_data_hospit_sanity_checks(self, mock_print): @@ -96,6 +94,7 @@ def test_divi_data_hospit_sanity_checks(self, mock_print): 'a': [1, 2, 3], '7T_Hospitalisierung_Inzidenz': [4, 5, 6], '7T_Hospitalisierung_Faelle': [100, 1001, 100]}) + gd.Conf.v_level = 'Warning' 
ghd.hospit_sanity_checks(df) expected_print = [call("Warning: Number of data categories changed.")] mock_print.assert_has_calls(expected_print) @@ -114,12 +113,12 @@ def test_divi_data_hospit_sanity_checks(self, mock_print): error_message = "Error: Data categories have changed." self.assertEqual(str(error.exception), error_message) - @patch('builtins.input', return_value='Y') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) @patch('memilio.epidata.getHospitalizationData.pd.read_csv', return_value=df_test) def test_get_hospitalization_data(self, mock_file, mock_in): # this should not raise any errors - ghd.get_hospitalization_data(out_folder=self.path) + ghd.get_hospitalization_data(out_folder=self.path, interactive=True) # check if all files are written self.assertEqual( diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index ef6d785d8d..b6ea20bed2 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -24,7 +24,6 @@ import os import pandas as pd import numpy as np -import matplotlib.pyplot as plt from datetime import date diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_divi_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_divi_data.py index 3738918a62..1574fb4107 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_divi_data.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_divi_data.py @@ -52,15 +52,15 @@ def setUp(self): def gdd_calls(self, text=''): directory = os.path.join(self.path, 'Germany/') gdd_calls = [ - call('Information: Data has been written to', + call('Info: Data has been written to ' + os.path.join(directory, 'FullData_DIVI.json')), - call('Information: Data has been written to', + call('Info: Data has been 
written to ' + os.path.join(directory, 'county_divi'+text+'.json')), call( - 'Information: Data has been written to', + 'Info: Data has been written to ' + os.path.join(directory, 'state_divi'+text+'.json')), call( - 'Information: Data has been written to', + 'Info: Data has been written to ' + os.path.join(directory, 'germany_divi'+text+'.json'))] return gdd_calls @@ -70,7 +70,8 @@ def gdd_calls(self, text=''): def test_get_divi_data_prints(self, mock_print, mock_file, mock_san): mock_file.return_value = self.df_test # case with start_date before 2020-04-24 - gdd.get_divi_data(out_folder=self.path, start_date=date(2020, 1, 1)) + gdd.get_divi_data(out_folder=self.path, start_date=date( + 2020, 1, 1), verbosity_level='Info') expected_call = [ call( 'Warning: First data available on 2020-04-24. You asked for 2020-01-01. Changed it to 2020-04-24.')] diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_jh_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_jh_data.py index cd9c360f74..a7c386df99 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_jh_data.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_jh_data.py @@ -85,7 +85,7 @@ def write_jh_data(self, out_folder): with open(file_jh_with_path, 'w') as f: f.write(self.str_FullData_JohnHopkins) - @patch('builtins.input', return_value='n') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=False) def test_get_JH_Data(self, mockin): # Test without downloading data [read_data, file_format, out_folder, no_raw] \ diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py index 38e6526dce..1e080b530a 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py @@ -27,7 +27,6 @@ from 
pyfakefs import fake_filesystem_unittest from memilio.epidata import getPopulationData as gpd -from memilio.epidata import progress_indicator class Test_getPopulationData(fake_filesystem_unittest.TestCase): @@ -51,7 +50,6 @@ class Test_getPopulationData(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) def test_export_population_data(self): @@ -89,14 +87,14 @@ def test_get_population_data_full(self, mock_test, mock_export, mock_download): @patch('builtins.input', return_value=test_username) @patch('getpass.getpass', return_value=test_password) @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) - @patch('os.path.abspath', return_value='') + @patch('memilio.epidata.getPopulationData.path_to_credential_file', return_value='./CredentialsRegio.ini') @patch('memilio.epidata.getPopulationData.read_population_data', return_value=df_pop_raw) @patch('memilio.epidata.getPopulationData.assign_population_data', return_value=df_pop) @patch('memilio.epidata.getPopulationData.test_total_population') def test_config_write(self, mock_test, mock_export, mock_raw, mock_path, mock_choice, mock_pw, mock_un): # username and password should be written into the config file. # The download and assigning to counties of the population data is mocked. - gpd.get_population_data(username=None, password=None) + gpd.get_population_data(username=None, password=None, interactive=True) # Check if the file is written. self.assertTrue(self.config_file_name in os.listdir(os.getcwd())) # Check content of the file. 
@@ -107,7 +105,7 @@ def test_config_write(self, mock_test, mock_export, mock_raw, mock_path, mock_ch self.assertEqual(parser['CREDENTIALS']['Username'], self.test_username) self.assertEqual(parser['CREDENTIALS']['Password'], self.test_password) - @patch('os.path.abspath', return_value='') + @patch('memilio.epidata.getPopulationData.path_to_credential_file', return_value='./CredentialsRegio.ini') @patch('memilio.epidata.getPopulationData.read_population_data', return_value=df_pop_raw) @patch('memilio.epidata.getPopulationData.assign_population_data', return_value=df_pop) @patch('memilio.epidata.getPopulationData.test_total_population') @@ -124,7 +122,7 @@ def test_config_read(self, mock_test, mock_export, mock_read, mock_path): self.assertTrue(self.config_file_name in os.listdir(os.getcwd())) # The download and assigning to counties of the population data is mocked. gpd.get_population_data( - username=None, password=None, read_data=False, out_folder=self.path) + username=None, password=None, read_data=False, out_folder=self.path, interactive=False) # The file exist in the directory (mocked) and the credentials should be read. 
mock_read.assert_called_with( self.test_username, self.test_password, False, os.path.join(self.path, 'Germany')) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_simulation_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_simulation_data.py index 2f28b2754d..be62d9071e 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_simulation_data.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_simulation_data.py @@ -24,11 +24,13 @@ from memilio.epidata import defaultDict as dd from memilio.epidata import getSimulationData as gsd +from memilio.epidata import getDataIntoPandasDataFrame as gd class TestGetSimulationData(fake_filesystem_unittest.TestCase): # construct fake directory for testing maxDiff = None + gd.Conf.v_level = 'Debug' path = '/home/SumlationData' @@ -131,7 +133,7 @@ def test_errors( ' of the source material. Please report this as an issue. ' + 'vaccination' + ' data could not be stored correctly.') - exceptionprint = call('Exception: ') + exceptionprint = call('Error: Exception: ') expected_calls = [ exceptionprint, casesprint, exceptionprint, populprint, exceptionprint, diviprint, exceptionprint, vaccprint] diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_vaccination_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_vaccination_data.py index c4aa7b9b63..8d2fb86fe9 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_vaccination_data.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_vaccination_data.py @@ -28,7 +28,6 @@ from memilio.epidata import geoModificationGermany as geoger from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import getVaccinationData as gvd -from memilio.epidata import progress_indicator class TestGetVaccinationData(fake_filesystem_unittest.TestCase): @@ -106,12 +105,11 @@ class 
TestGetVaccinationData(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - progress_indicator.ProgressIndicator.disable_indicators(True) @patch('memilio.epidata.getVaccinationData.download_vaccination_data', return_value=df_vacc_data_altern) @patch('memilio.epidata.getPopulationData.get_population_data', return_value=df_pop) - @patch('builtins.input', return_value='y') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) def test_get_vaccination_data_alternative_ages(self, mockin, mockp, mockv): gvd.get_vaccination_data(out_folder=self.path, read_data=True) @@ -124,7 +122,7 @@ def test_get_vaccination_data_alternative_ages(self, mockin, mockp, mockv): @patch('memilio.epidata.getVaccinationData.download_vaccination_data', return_value=df_vacc_data) @patch('memilio.epidata.getPopulationData.get_population_data', return_value=df_pop) - @patch('builtins.input', return_value='y') + @patch('memilio.epidata.getDataIntoPandasDataFrame.user_choice', return_value=True) def test_get_standard_vaccination_sanitize_3(self, mockin, mockp, mockv): gvd.get_vaccination_data(out_folder=self.path, sanitize_data=3, read_data=True) diff --git a/pycode/memilio-epidata/requirements-dev.txt b/pycode/memilio-epidata/requirements-dev.txt index f0b677480a..c57c565136 100644 --- a/pycode/memilio-epidata/requirements-dev.txt +++ b/pycode/memilio-epidata/requirements-dev.txt @@ -1,6 +1,8 @@ # dev dependencies -# first support of python 3.11 -pyfakefs>=4.6 +# first support of python 3.11 is 4.6 +# 5.3.4 has conflicts with openpyxl +# 5.3.3 broken +pyfakefs>=4.6,<5.3.3 coverage>=7.0.1 # pylint 2.16 creates problem with wrapt package version pylint>=2.13.0,<2.16 diff --git a/pycode/memilio-epidata/setup.py b/pycode/memilio-epidata/setup.py index 23a3313d6f..ccf2598f0b 100644 --- a/pycode/memilio-epidata/setup.py +++ b/pycode/memilio-epidata/setup.py @@ -79,7 +79,10 @@ def run(self): install_requires=[ # smaller pandas versions contain a bug 
that sometimes prevents reading # some excel files (e.g. population or twitter data) - 'pandas>=2.0.0', + # Has to use less than 2.2.0, see Issue #910 + 'pandas>=2.0.0,<2.2.0', + # FutureWarning of pandas that pyarrow will be required in a future release + 'pyarrow', 'matplotlib', 'tables', # smaller numpy versions cause a security issue, 1.25 breaks testing with pyfakefs @@ -95,8 +98,10 @@ def run(self): ], extras_require={ 'dev': [ - # first support of python 3.11 - 'pyfakefs>=4.6', + # first support of python 3.11 4.6 + # 5.3.4 has conflicts with openpyxl + # 5.3.3 broken + 'pyfakefs>=4.6,<5.3.3', # coverage 7.0.0 can't find .whl files and breaks CI 'coverage>=7.0.1', # pylint 2.16 creates problem with wrapt package version