
Commit 70fd64b

fix(sinan): use home dir to download sinan data (#209)
* fix(sinan): use home dir to download sinan data
* removing tables split by years
* Delete poetry.lock
* Resetting poetry lock
* Fix tests
* linter
* Linter (unrelated)
* SINAN_DATA_PATH to PYSUS_DATA_PATH
1 parent ec69133 commit 70fd64b

8 files changed: +68 additions, -68 deletions


docs/tutorials/forecast_switzerland/forecast_swiss.py

Lines changed: 29 additions & 31 deletions
@@ -33,14 +33,14 @@ def get_clusters_swiss(t=0.3, end_date=None):
     """
     Params to get the list of clusters computed by the compute_cluster
     function.
-
+
     Parameters
     ----------
     t : float
         Thereshold used in the clusterization.
     end_date : str
         Indicates the last day used to compute the cluster.
-
+
     Returns
     -------
     Array
@@ -76,12 +76,12 @@ def get_clusters_swiss(t=0.3, end_date=None):
 def get_cluster_by_canton(canton):
     """
     Function to return the cluster that contains a specific canton.
-
+
     Parameters
     ----------
     canton : str
         Name (two letters code) of the canton.
-
+
     Returns
     -------
     List
@@ -99,11 +99,11 @@ def remove_zeros(tgt):
     """
     Function to remove the zeros of the target curve. It needs to be
     done to us be able to use the LogNormal dist.
-
+
     Parameters
     ----------
     tgt : array
-
+
     """

     tgt[tgt == 0] = 0.01
@@ -129,10 +129,10 @@ def train_eval_single_canton(
 ):
     """
     Function to train and evaluate the model for one georegion.
-
+
     Important: * By default the function is using the clustering cantons
     and the
-    data since 2020.
+    data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.

@@ -177,9 +177,9 @@ def train_eval_single_canton(
     -------
     pd.DataFrame
         The return is a pandas DataFrame.
-
+
     """
-
+
     cluster_canton = [canton]  # get_cluster_by_canton(canton)

     target_name = f"{target_curve_name}_{canton}"
@@ -242,13 +242,13 @@ def train_eval_all_cantons(

     """
     Function to make prediction for all the cantons.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -277,7 +277,7 @@ def train_eval_all_cantons(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
@@ -357,13 +357,13 @@ def train_single_canton(

     """
     Function to train and evaluate the model for one georegion.
-
+
     Important: * By default the function is using the clustering cantons
     and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     canton : str
@@ -378,7 +378,7 @@ def train_single_canton(
         Determines the beggining of the train dataset
     path : str
         Determines where the model trained will be saved.
-    update_data : bool
+    update_data : bool
         Determines if the data from the Geneva hospital will be used.
         This params only is used when canton = GE and target_curve_name
         = hosp.
@@ -390,7 +390,7 @@ def train_single_canton(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     None
@@ -449,17 +449,17 @@ def train_all_cantons(
     look_back=14,
     path=None,
 ):
-
+
     """
     Function to train and evaluate the model for all the cantons in
     switzerland.
-
-    Important:
+
+    Important:
     * By default the function is using the clustering cantons and the
-    data since 2020.
+    data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -480,14 +480,13 @@ def train_all_cantons(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
         Dataframe with the forecast for all the cantons.
     """

-
     clusters = get_clusters_swiss(t=0.6)

     for cluster in clusters:
@@ -541,13 +540,13 @@ def forecast_single_canton(
 ):
     """
     Function to make the forecast for one canton.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -569,14 +568,13 @@ def forecast_single_canton(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
         Dataframe with the forecast for one canton.
     """

-
     cluster_canton = [canton]  # get_cluster_by_canton(canton)

     df = get_cluster_data(
@@ -609,13 +607,13 @@ def forecast_all_cantons(
 ):
     """
     Function to make the forecast for all the cantons.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -630,7 +628,7 @@ def forecast_all_cantons(
         Determines from what day the forecast will be computed.
     path : str
         Indicates where the models trained are saved.
-
+
     Returns
     -------
     pd.DataFrame
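
The remove_zeros hunk above is whitespace-only, but the surrounding docstring explains the helper's purpose: the target series cannot contain zeros if it is to be modelled with a LogNormal distribution. A minimal standalone sketch of that idea (the project mutates the array in place; the return here is only for illustration):

    import numpy as np

    def remove_zeros(tgt: np.ndarray) -> np.ndarray:
        # Replace exact zeros with a small positive value so the series
        # stays inside the LogNormal support (x > 0).
        tgt[tgt == 0] = 0.01
        return tgt

    cases = np.array([0.0, 3.0, 0.0, 7.0])
    print(remove_zeros(cases))  # [0.01 3.   0.01 7.  ]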

epigraphhub/analysis/forecast_models/metrics.py

Lines changed: 8 additions & 8 deletions
@@ -13,15 +13,15 @@ def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame:
     method in the train and test sample. The predictions must be saved
     in a dataset with the following columns: 'median', 'target' and
     'train_size'.
-
-    This function uses the following metrics:

-    - explained variance score;
-    - mean absolute error;
-    - mean squared error;
-    - root mean squared error;
-    - mean squared log error;
-    - mean absolute percentage error.
+    This function uses the following metrics:
+
+    - explained variance score;
+    - mean absolute error;
+    - mean squared error;
+    - root mean squared error;
+    - mean squared log error;
+    - mean absolute percentage error.
     To compute this metrics we use the implementations of the
     sklearn.metrics package.
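
The docstring above lists the six metrics compute_metrics reports and says they are taken from sklearn.metrics. A short sketch of computing them for one pair of observed/predicted series (hypothetical helper, not the project's compute_metrics):

    import numpy as np
    from sklearn import metrics

    def summarize(y_true, y_pred):
        # The six metrics named in the docstring, all from sklearn.metrics;
        # RMSE is derived as the square root of the MSE.
        mse = metrics.mean_squared_error(y_true, y_pred)
        return {
            "explained_variance": metrics.explained_variance_score(y_true, y_pred),
            "mae": metrics.mean_absolute_error(y_true, y_pred),
            "mse": mse,
            "rmse": np.sqrt(mse),
            "msle": metrics.mean_squared_log_error(y_true, y_pred),
            "mape": metrics.mean_absolute_percentage_error(y_true, y_pred),
        }

    print(summarize([10.0, 12.0, 9.0], [10.4, 11.8, 9.5]))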

epigraphhub/analysis/forecast_models/ngboost_models.py

Lines changed: 4 additions & 4 deletions
@@ -136,12 +136,12 @@ def train_eval(
     Returns
     -------
     pd.DataFrame
-        A DataFrame with four columns (and a date index):
+        A DataFrame with four columns (and a date index):

-        - target: The target values.
+        - target: The target values.
         - lower: The lower value of the confidence interval of 95%.
         - median: The median value of the confidence interval of
-          95%.
+          95%.
         - upper: The upper value of the confidence interval of 95%.
         - train_size: The number of rows of data using as training
           data.
@@ -361,7 +361,7 @@ def forecast(

         - lower: The lower value of the confidence interval of 95%.
         - median: The median value of the confidence interval of
-          95%.
+          95%.
         - upper: The upper value of the confidence interval of 95%.
     """
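
For orientation, this is the shape of the prediction DataFrame the train_eval and forecast docstrings describe, mocked up with made-up numbers (the real values come from the quantiles of the fitted NGBoost predictive distribution):

    import pandas as pd

    dates = pd.date_range("2022-01-01", periods=3, freq="D")
    pred = pd.DataFrame(
        {
            "target": [10.0, 12.0, 9.0],    # observed values (train_eval only)
            "lower": [7.5, 9.1, 6.8],       # lower bound of the 95% interval
            "median": [10.2, 11.7, 9.3],    # median of the predictive distribution
            "upper": [13.0, 14.6, 12.1],    # upper bound of the 95% interval
            "train_size": [365, 365, 365],  # rows used as training data
        },
        index=dates,
    )
    print(pred)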

epigraphhub/analysis/preprocessing.py

Lines changed: 4 additions & 4 deletions
@@ -187,9 +187,9 @@ def lstm_split_data(
     Returns
     -------
     Tuple[np.array,np.array,np.array,np.array]
-        X_train: array of features to train the model.
-        y_train: array of targets to train the model.
-        X_test: array of features to test the model.
+        X_train: array of features to train the model.
+        y_train: array of targets to train the model.
+        X_test: array of features to test the model.
         y_test: array of targets to test the model.
     """

@@ -233,7 +233,7 @@ def normalize_data(
     Returns
     -------
     Tuple[pd.DataFrame, pd.Series]
-        pd.DataFrame: normalized DataFrame.
+        pd.DataFrame: normalized DataFrame.
         pd.Series: Series of the max
         values used in the normalization.
     """

epigraphhub/data/_config.py

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,8 @@
 commonly used in data collection modules
 """

+from pathlib import Path
+
 # Colombia COVID data config:
 from sodapy import Socrata

@@ -27,3 +29,6 @@

 # SINAN data config:
 SINAN_LOG_PATH = "/tmp/sinan_fetch.log"
+_sinan_data = Path().home() / "pysus"
+_sinan_data.mkdir(exist_ok=True)
+PYSUS_DATA_PATH = str(_sinan_data)
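
Those three added lines are the heart of the fix: the SINAN download directory moves from a hard-coded /tmp location to a pysus folder under the user's home, created as soon as the config module is imported. A quick illustrative check of what the new constant resolves to:

    from pathlib import Path

    from epigraphhub.data._config import PYSUS_DATA_PATH

    # PYSUS_DATA_PATH is a plain string such as "/home/<user>/pysus";
    # importing the module already created the directory if it was missing.
    assert Path(PYSUS_DATA_PATH) == Path.home() / "pysus"
    assert Path(PYSUS_DATA_PATH).is_dir()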

epigraphhub/data/brasil/sinan/extract.py

Lines changed: 3 additions & 3 deletions
@@ -3,7 +3,7 @@
 from loguru import logger
 from pysus.online_data import SINAN

-from epigraphhub.data._config import SINAN_LOG_PATH
+from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH

 logger.add(SINAN_LOG_PATH, retention="7 days")

@@ -24,6 +24,6 @@ def download(disease: str):
         parquets_paths_list list(PosixPath) : A list with all parquets dirs.
     """

-    SINAN.download_all_years_in_chunks(disease)
+    SINAN.download_all_years_in_chunks(disease, data_dir=PYSUS_DATA_PATH)

-    logger.info(f"All years for {disease} downloaded at /tmp/pysus")
+    logger.info(f"All years for {disease} downloaded at {PYSUS_DATA_PATH}")
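
With the new data_dir argument, everything the extract module downloads lands under the home-based directory instead of /tmp/pysus. A hedged usage sketch ('Zika' is only an example disease name; check the names pysus accepts):

    from pathlib import Path

    from epigraphhub.data._config import PYSUS_DATA_PATH
    from epigraphhub.data.brasil.sinan import extract

    extract.download("Zika")  # downloads every available year in chunks

    # loading.py expects the chunk directories to carry a .parquet suffix.
    for parquet_dir in sorted(Path(PYSUS_DATA_PATH).glob("*.parquet")):
        print(parquet_dir.name)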

epigraphhub/data/brasil/sinan/loading.py

Lines changed: 7 additions & 7 deletions
@@ -6,7 +6,7 @@
 from pysus.online_data import parquets_to_dataframe as to_df

 from epigraphhub.connection import get_engine
-from epigraphhub.data._config import SINAN_LOG_PATH
+from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH
 from epigraphhub.settings import env

 logger.add(SINAN_LOG_PATH, retention="7 days")
@@ -17,22 +17,21 @@
 def upload():
     """
     Connects to the EGH SQL server and load all the chunks for all
-    diseases found at `/tmp/pysus` into database. This method cleans
+    diseases found at `$PYSUS_DATA_PATH` into database. This method cleans
     the chunks left.

     """
-    diseases_dir = Path("/tmp/pysus").glob("*")
+    diseases_dir = Path(PYSUS_DATA_PATH).glob("*")
     di_years_dir = [x for x in diseases_dir if x.is_dir()]

     for dir in di_years_dir:
-        if "parquet" in Path(dir).suffix:
-            df = to_df(str(dir), clean_after_read=True)
+        if "parquet" in Path(dir).suffix and any(os.listdir(dir)):
+            df = to_df(str(dir), clean_after_read=False)
             df.columns = df.columns.str.lower()
             df.index.name = "index"

             table_i = str(dir).split("/")[-1].split(".parquet")[0]
-            st, yr = table_i[:-4].lower(), table_i[-2:]
-            table = "".join([st, yr])
+            table = table_i[:-4].lower()
             schema = "brasil"

             with engine.connect() as conn:
@@ -53,3 +52,4 @@ def upload():

                except Exception as e:
                    logger.error(f"Not able to upsert {table} \n{e}")
+                   raise e
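
The renaming above is what removes the per-year table split mentioned in the commit message: the trailing characters of each parquet directory name are cut off, so all years of a disease are upserted into a single table. A standalone sketch of that string logic, using a hypothetical directory name:

    # Hypothetical chunk directory produced by pysus, e.g. ~/pysus/ZIKABR19.parquet
    dir_name = "/home/user/pysus/ZIKABR19.parquet"

    table_i = dir_name.split("/")[-1].split(".parquet")[0]  # "ZIKABR19"
    table = table_i[:-4].lower()                            # "zika": trailing "BR19" dropped
    schema = "brasil"

    print(f"{schema}.{table}")  # brasil.zika  (previously: brasil.zika19)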

0 commit comments
