
Commit 70fd64b

fix(sinan): use home dir to download sinan data (#209)
* fix(sinan): use home dir to download sinan data
* removing tables split by years
* Delete poetry.lock
* Resetting poetry lock
* Fix tests
* linter
* Linter (unrelated)
* SINAN_DATA_PATH to PYSUS_DATA_PATH
1 parent ec69133 commit 70fd64b

8 files changed: +68 additions, -68 deletions


docs/tutorials/forecast_switzerland/forecast_swiss.py

Lines changed: 29 additions & 31 deletions
@@ -33,14 +33,14 @@ def get_clusters_swiss(t=0.3, end_date=None):
     """
     Params to get the list of clusters computed by the compute_cluster
     function.
-
+
     Parameters
     ----------
     t : float
         Thereshold used in the clusterization.
     end_date : str
         Indicates the last day used to compute the cluster.
-
+
     Returns
     -------
     Array
@@ -76,12 +76,12 @@ def get_clusters_swiss(t=0.3, end_date=None):
 def get_cluster_by_canton(canton):
     """
     Function to return the cluster that contains a specific canton.
-
+
     Parameters
     ----------
     canton : str
         Name (two letters code) of the canton.
-
+
     Returns
     -------
     List
@@ -99,11 +99,11 @@ def remove_zeros(tgt):
     """
     Function to remove the zeros of the target curve. It needs to be
     done to us be able to use the LogNormal dist.
-
+
     Parameters
     ----------
     tgt : array
-
+
     """

     tgt[tgt == 0] = 0.01
@@ -129,10 +129,10 @@ def train_eval_single_canton(
 ):
     """
     Function to train and evaluate the model for one georegion.
-
+
     Important: * By default the function is using the clustering cantons
     and the
-    data since 2020.
+    data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.

@@ -177,9 +177,9 @@ def train_eval_single_canton(
     -------
     pd.DataFrame
         The return is a pandas DataFrame.
-
+
     """
-
+
     cluster_canton = [canton]  # get_cluster_by_canton(canton)

     target_name = f"{target_curve_name}_{canton}"
@@ -242,13 +242,13 @@ def train_eval_all_cantons(

     """
     Function to make prediction for all the cantons.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -277,7 +277,7 @@ def train_eval_all_cantons(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
@@ -357,13 +357,13 @@ def train_single_canton(

     """
     Function to train and evaluate the model for one georegion.
-
+
     Important: * By default the function is using the clustering cantons
     and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     canton : str
@@ -378,7 +378,7 @@ def train_single_canton(
         Determines the beggining of the train dataset
     path : str
         Determines where the model trained will be saved.
-    update_data : bool
+    update_data : bool
         Determines if the data from the Geneva hospital will be used.
         This params only is used when canton = GE and target_curve_name
         = hosp.
@@ -390,7 +390,7 @@ def train_single_canton(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     None
@@ -449,17 +449,17 @@ def train_all_cantons(
     look_back=14,
     path=None,
 ):
-
+
     """
     Function to train and evaluate the model for all the cantons in
     switzerland.
-
-    Important:
+
+    Important:
     * By default the function is using the clustering cantons and the
-    data since 2020.
+    data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -480,14 +480,13 @@ def train_all_cantons(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
         Dataframe with the forecast for all the cantons.
     """

-
     clusters = get_clusters_swiss(t=0.6)

     for cluster in clusters:
@@ -541,13 +540,13 @@ def forecast_single_canton(
 ):
     """
     Function to make the forecast for one canton.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -569,14 +568,13 @@ def forecast_single_canton(
     look_back : int
         Number of the last days that will be used to forecast the next
         days.
-
+
     Returns
     -------
     pd.DataFrame
         Dataframe with the forecast for one canton.
     """

-
     cluster_canton = [canton]  # get_cluster_by_canton(canton)

     df = get_cluster_data(
@@ -609,13 +607,13 @@ def forecast_all_cantons(
 ):
     """
     Function to make the forecast for all the cantons.
-
+
     Important:
     * By default the function is using the clustering cantons and the
     data since 2020.
     * For the predictor hospCapacity is used as predictor the column
     ICU_Covid19Patients.
-
+
     Parameters
     ----------
     target_curve_name : str
@@ -630,7 +628,7 @@ def forecast_all_cantons(
         Determines from what day the forecast will be computed.
     path : str
         Indicates where the models trained are saved.
-
+
     Returns
     -------
     pd.DataFrame
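
The remove_zeros hunk above is whitespace-only, but the surrounding docstring explains the helper's purpose: the target series cannot contain zeros if it is to be modelled with a LogNormal distribution. A minimal standalone sketch of that idea (the project mutates the array in place; the return here is only for illustration):

    import numpy as np

    def remove_zeros(tgt: np.ndarray) -> np.ndarray:
        # Replace exact zeros with a small positive value so the series
        # stays inside the LogNormal support (x > 0).
        tgt[tgt == 0] = 0.01
        return tgt

    cases = np.array([0.0, 3.0, 0.0, 7.0])
    print(remove_zeros(cases))  # [0.01 3.   0.01 7.  ]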

epigraphhub/analysis/forecast_models/metrics.py

Lines changed: 8 additions & 8 deletions
@@ -13,15 +13,15 @@ def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame:
     method in the train and test sample. The predictions must be saved
     in a dataset with the following columns: 'median', 'target' and
     'train_size'.
-
-    This function uses the following metrics:

-    - explained variance score;
-    - mean absolute error;
-    - mean squared error;
-    - root mean squared error;
-    - mean squared log error;
-    - mean absolute percentage error.
+    This function uses the following metrics:
+
+    - explained variance score;
+    - mean absolute error;
+    - mean squared error;
+    - root mean squared error;
+    - mean squared log error;
+    - mean absolute percentage error.
     To compute this metrics we use the implementations of the
     sklearn.metrics package.
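
The docstring above lists the six metrics compute_metrics reports and says they are taken from sklearn.metrics. A short sketch of computing them for one pair of observed/predicted series (hypothetical helper, not the project's compute_metrics):

    import numpy as np
    from sklearn import metrics

    def summarize(y_true, y_pred):
        # The six metrics named in the docstring, all from sklearn.metrics;
        # RMSE is derived as the square root of the MSE.
        mse = metrics.mean_squared_error(y_true, y_pred)
        return {
            "explained_variance": metrics.explained_variance_score(y_true, y_pred),
            "mae": metrics.mean_absolute_error(y_true, y_pred),
            "mse": mse,
            "rmse": np.sqrt(mse),
            "msle": metrics.mean_squared_log_error(y_true, y_pred),
            "mape": metrics.mean_absolute_percentage_error(y_true, y_pred),
        }

    print(summarize([10.0, 12.0, 9.0], [10.4, 11.8, 9.5]))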

epigraphhub/analysis/forecast_models/ngboost_models.py

Lines changed: 4 additions & 4 deletions
@@ -136,12 +136,12 @@ def train_eval(
     Returns
     -------
     pd.DataFrame
-        A DataFrame with four columns (and a date index):
+        A DataFrame with four columns (and a date index):

-        - target: The target values.
+        - target: The target values.
         - lower: The lower value of the confidence interval of 95%.
         - median: The median value of the confidence interval of
-          95%.
+          95%.
         - upper: The upper value of the confidence interval of 95%.
         - train_size: The number of rows of data using as training
           data.
@@ -361,7 +361,7 @@ def forecast(

         - lower: The lower value of the confidence interval of 95%.
         - median: The median value of the confidence interval of
-          95%.
+          95%.
         - upper: The upper value of the confidence interval of 95%.
     """
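
For orientation, this is the shape of the prediction DataFrame the train_eval and forecast docstrings describe, mocked up with made-up numbers (the real values come from the quantiles of the fitted NGBoost predictive distribution):

    import pandas as pd

    dates = pd.date_range("2022-01-01", periods=3, freq="D")
    pred = pd.DataFrame(
        {
            "target": [10.0, 12.0, 9.0],    # observed values (train_eval only)
            "lower": [7.5, 9.1, 6.8],       # lower bound of the 95% interval
            "median": [10.2, 11.7, 9.3],    # median of the predictive distribution
            "upper": [13.0, 14.6, 12.1],    # upper bound of the 95% interval
            "train_size": [365, 365, 365],  # rows used as training data
        },
        index=dates,
    )
    print(pred)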

epigraphhub/analysis/preprocessing.py

Lines changed: 4 additions & 4 deletions
@@ -187,9 +187,9 @@ def lstm_split_data(
     Returns
     -------
     Tuple[np.array,np.array,np.array,np.array]
-        X_train: array of features to train the model.
-        y_train: array of targets to train the model.
-        X_test: array of features to test the model.
+        X_train: array of features to train the model.
+        y_train: array of targets to train the model.
+        X_test: array of features to test the model.
         y_test: array of targets to test the model.
     """

@@ -233,7 +233,7 @@ def normalize_data(
     Returns
     -------
     Tuple[pd.DataFrame, pd.Series]
-        pd.DataFrame: normalized DataFrame.
+        pd.DataFrame: normalized DataFrame.
         pd.Series: Series of the max
         values used in the normalization.
     """

epigraphhub/data/_config.py

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,8 @@
 commonly used in data collection modules
 """

+from pathlib import Path
+
 # Colombia COVID data config:
 from sodapy import Socrata

@@ -27,3 +29,6 @@

 # SINAN data config:
 SINAN_LOG_PATH = "/tmp/sinan_fetch.log"
+_sinan_data = Path().home() / "pysus"
+_sinan_data.mkdir(exist_ok=True)
+PYSUS_DATA_PATH = str(_sinan_data)
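
Those three added lines are the heart of the fix: the SINAN download directory moves from a hard-coded /tmp location to a pysus folder under the user's home, created as soon as the config module is imported. A quick illustrative check of what the new constant resolves to:

    from pathlib import Path

    from epigraphhub.data._config import PYSUS_DATA_PATH

    # PYSUS_DATA_PATH is a plain string such as "/home/<user>/pysus";
    # importing the module already created the directory if it was missing.
    assert Path(PYSUS_DATA_PATH) == Path.home() / "pysus"
    assert Path(PYSUS_DATA_PATH).is_dir()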

epigraphhub/data/brasil/sinan/extract.py

Lines changed: 3 additions & 3 deletions
@@ -3,7 +3,7 @@
 from loguru import logger
 from pysus.online_data import SINAN

-from epigraphhub.data._config import SINAN_LOG_PATH
+from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH

 logger.add(SINAN_LOG_PATH, retention="7 days")

@@ -24,6 +24,6 @@ def download(disease: str):
         parquets_paths_list list(PosixPath) : A list with all parquets dirs.
     """

-    SINAN.download_all_years_in_chunks(disease)
+    SINAN.download_all_years_in_chunks(disease, data_dir=PYSUS_DATA_PATH)

-    logger.info(f"All years for {disease} downloaded at /tmp/pysus")
+    logger.info(f"All years for {disease} downloaded at {PYSUS_DATA_PATH}")
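
With the new data_dir argument, everything the extract module downloads lands under the home-based directory instead of /tmp/pysus. A hedged usage sketch ('Zika' is only an example disease name; check the names pysus accepts):

    from pathlib import Path

    from epigraphhub.data._config import PYSUS_DATA_PATH
    from epigraphhub.data.brasil.sinan import extract

    extract.download("Zika")  # downloads every available year in chunks

    # loading.py expects the chunk directories to carry a .parquet suffix.
    for parquet_dir in sorted(Path(PYSUS_DATA_PATH).glob("*.parquet")):
        print(parquet_dir.name)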

epigraphhub/data/brasil/sinan/loading.py

Lines changed: 7 additions & 7 deletions
@@ -6,7 +6,7 @@
 from pysus.online_data import parquets_to_dataframe as to_df

 from epigraphhub.connection import get_engine
-from epigraphhub.data._config import SINAN_LOG_PATH
+from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH
 from epigraphhub.settings import env

 logger.add(SINAN_LOG_PATH, retention="7 days")
@@ -17,22 +17,21 @@
 def upload():
     """
     Connects to the EGH SQL server and load all the chunks for all
-    diseases found at `/tmp/pysus` into database. This method cleans
+    diseases found at `$PYSUS_DATA_PATH` into database. This method cleans
     the chunks left.

     """
-    diseases_dir = Path("/tmp/pysus").glob("*")
+    diseases_dir = Path(PYSUS_DATA_PATH).glob("*")
     di_years_dir = [x for x in diseases_dir if x.is_dir()]

     for dir in di_years_dir:
-        if "parquet" in Path(dir).suffix:
-            df = to_df(str(dir), clean_after_read=True)
+        if "parquet" in Path(dir).suffix and any(os.listdir(dir)):
+            df = to_df(str(dir), clean_after_read=False)
             df.columns = df.columns.str.lower()
             df.index.name = "index"

             table_i = str(dir).split("/")[-1].split(".parquet")[0]
-            st, yr = table_i[:-4].lower(), table_i[-2:]
-            table = "".join([st, yr])
+            table = table_i[:-4].lower()
             schema = "brasil"

             with engine.connect() as conn:
@@ -53,3 +52,4 @@ def upload():

                except Exception as e:
                    logger.error(f"Not able to upsert {table} \n{e}")
+                   raise e
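
The renaming above is what removes the per-year table split mentioned in the commit message: the trailing characters of each parquet directory name are cut off, so all years of a disease are upserted into a single table. A standalone sketch of that string logic, using a hypothetical directory name:

    # Hypothetical chunk directory produced by pysus, e.g. ~/pysus/ZIKABR19.parquet
    dir_name = "/home/user/pysus/ZIKABR19.parquet"

    table_i = dir_name.split("/")[-1].split(".parquet")[0]  # "ZIKABR19"
    table = table_i[:-4].lower()                            # "zika": trailing "BR19" dropped
    schema = "brasil"

    print(f"{schema}.{table}")  # brasil.zika  (previously: brasil.zika19)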

0 commit comments
