1 | 1 | # -*- coding: utf-8 -*-
2 | 2 | """Functions for pulling NHSN data."""
| 3 | +import copy |
3 | 4 | import logging |
| 5 | +import random |
| 6 | +import time |
| 7 | +from datetime import datetime, timedelta |
4 | 8 | from pathlib import Path |
5 | 9 | from typing import Optional |
| 10 | +from urllib.error import HTTPError |
6 | 11 |
7 | 12 | import pandas as pd |
8 | 13 | from delphi_utils import create_backup_csv
9 | 14 | from sodapy import Socrata
10 | 15 |
11 | 16 | from .constants import MAIN_DATASET_ID, PRELIM_DATASET_ID, PRELIM_SIGNALS_MAP, PRELIM_TYPE_DICT, SIGNALS_MAP, TYPE_DICT |
12 | 17 |
13 | 18 |
14 | | -def pull_data(socrata_token: str, dataset_id: str): |
| 19 | +def check_last_updated(socrata_token: str, dataset_id: str, logger) -> bool:
| 20 | +    """
| 21 | +    Check the last-updated timestamp to determine whether data should be pulled.
| 22 | +
| 23 | +    Note -- if the call to the API fails, the data is treated as recently updated,
| 24 | +    since possibly having duplicates is preferable to missing data.
| 25 | +
| 26 | +    Parameters
| 27 | +    ----------
| 28 | +    socrata_token: str
| 29 | +    dataset_id: str
| 30 | +    logger
| 31 | +
| 32 | +    Returns
| 33 | +    -------
| 34 | +    bool
| 35 | +    """
| 36 | + recently_updated_source = True |
| 37 | + try: |
| 38 | + client = Socrata("data.cdc.gov", socrata_token) |
| 39 | + response = client.get_metadata(dataset_id) |
| 40 | + |
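| | +        # "rowsUpdatedAt" in the Socrata metadata is a Unix timestamp in seconds (UTC)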
| 41 | + updated_timestamp = datetime.utcfromtimestamp(int(response["rowsUpdatedAt"])) |
| 42 | + now = datetime.utcnow() |
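| | +        # consider the source fresh only if it was updated within the past day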
| 43 | + recently_updated_source = (now - updated_timestamp) < timedelta(days=1) |
| 44 | + |
| 45 | + prelim_prefix = "Preliminary " if dataset_id == PRELIM_DATASET_ID else "" |
| 46 | + if recently_updated_source: |
| 47 | + logger.info( |
| 48 | + f"{prelim_prefix}NHSN data was recently updated; Pulling data", updated_timestamp=updated_timestamp |
| 49 | + ) |
| 50 | + else: |
| 51 | + logger.info(f"{prelim_prefix}NHSN data is stale; Skipping", updated_timestamp=updated_timestamp) |
| 52 | + # pylint: disable=W0703 |
| 53 | + except Exception as e: |
| 54 | +        logger.info("error while processing socrata metadata; treating data as recently updated", error=str(e))
| 55 | + return recently_updated_source |
| 56 | + |
| 57 | + |
| 58 | +def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger): |
15 | 59 | """Pull data from Socrata API.""" |
16 | 60 | client = Socrata("data.cdc.gov", socrata_token) |
| 61 | + logger.info("Pulling data from Socrata API") |
17 | 62 | results = [] |
18 | 63 | offset = 0 |
19 | 64 | limit = 50000 # maximum limit allowed by SODA 2.0 |
20 | | - while True: |
| 65 | +    # retry once if the API returns a 503 (service unavailable)
| 66 | + try: |
21 | 67 | page = client.get(dataset_id, limit=limit, offset=offset) |
22 | | - if not page: |
23 | | - break # exit the loop if no more results |
| 68 | + except HTTPError as err: |
| 69 | + if err.code == 503: |
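| | +            # back off 2-3 seconds with jitter before the single retry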
| 70 | + time.sleep(2 + random.randint(0, 1000) / 1000.0) |
| 71 | + page = client.get(dataset_id, limit=limit, offset=offset) |
| 72 | + else: |
| 73 | + logger.info("Error pulling data from Socrata API", error=str(err)) |
| 74 | +            raise
| 75 | + |
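| | +    # page through the dataset until an empty page is returned; note that only
| | +    # the initial request above is covered by the 503 retry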
| 76 | + while len(page) > 0: |
24 | 77 | results.extend(page) |
25 | 78 | offset += limit |
| 79 | + page = client.get(dataset_id, limit=limit, offset=offset) |
26 | 80 |
27 | | - df = pd.DataFrame.from_records(results) |
| 81 | + if results: |
| 82 | + df = pd.DataFrame.from_records(results) |
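| | +        # back up the raw pull so custom runs can later replay it from disk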
| 83 | + create_backup_csv(df, backup_dir, False, logger=logger) |
| 84 | + else: |
| 85 | + df = pd.DataFrame() |
28 | 86 | return df |
29 | 87 |
30 | 88 |
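| | +# A minimal usage sketch for the helpers above (hedged: the socrata_token and
| | +# backup_dir values are assumptions here, as is wiring up the delphi_utils
| | +# structured logger; none of this is part of the diff itself):
| | +#
| | +#     from delphi_utils import get_structured_logger
| | +#     logger = get_structured_logger(__name__)
| | +#     if check_last_updated(socrata_token, MAIN_DATASET_ID, logger):
| | +#         df = pull_data(socrata_token, MAIN_DATASET_ID, backup_dir, logger)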
@@ -89,25 +147,33 @@ def pull_nhsn_data( |
89 | 147 | """ |
90 | 148 | # Pull data from Socrata API |
91 | 149 | df = ( |
92 | | - pull_data(socrata_token, dataset_id=MAIN_DATASET_ID) |
| 150 | + pull_data(socrata_token, MAIN_DATASET_ID, backup_dir, logger) |
93 | 151 | if not custom_run |
94 | 152 | else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False) |
95 | 153 | ) |
96 | 154 |
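| | +    # custom runs replay backed-up data, so the staleness check only gates live pulls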
97 | | - keep_columns = list(TYPE_DICT.keys()) |
| 155 | + recently_updated = True if custom_run else check_last_updated(socrata_token, MAIN_DATASET_ID, logger) |
98 | 156 |
99 | | - if not df.empty: |
100 | | - create_backup_csv(df, backup_dir, custom_run, logger=logger) |
| 157 | + keep_columns = list(TYPE_DICT.keys()) |
101 | 158 |
| 159 | + if not df.empty and recently_updated: |
102 | 160 | df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"}) |
| 161 | + filtered_type_dict = copy.deepcopy(TYPE_DICT) |
103 | 162 |
104 | 163 | for signal, col_name in SIGNALS_MAP.items(): |
105 | | - df[signal] = df[col_name] |
| 164 | + # older backups don't have certain columns |
| 165 | + try: |
| 166 | + df[signal] = df[col_name] |
| 167 | + except KeyError: |
| 168 | +                logger.info("column not available in data", col_name=col_name, signal=signal)
| 169 | + keep_columns.remove(signal) |
| 170 | + del filtered_type_dict[signal] |
106 | 171 |
107 | 172 | df = df[keep_columns] |
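| | +        # lowercase geo codes and map the national aggregate "usa" to "us"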
108 | 173 | df["geo_id"] = df["geo_id"].str.lower() |
109 | 174 | df.loc[df["geo_id"] == "usa", "geo_id"] = "us" |
110 | | - df = df.astype(TYPE_DICT) |
| 175 | + |
| 176 | + df = df.astype(filtered_type_dict) |
111 | 177 | else: |
112 | 178 | df = pd.DataFrame(columns=keep_columns) |
113 | 179 |
@@ -144,24 +210,31 @@ def pull_preliminary_nhsn_data( |
144 | 210 | pd.DataFrame |
145 | 211 | Dataframe as described above. |
146 | 212 | """ |
| 213 | + # Pull data from Socrata API |
147 | 214 | df = ( |
148 | | - pull_data(socrata_token, dataset_id=PRELIM_DATASET_ID) |
| 215 | + pull_data(socrata_token, PRELIM_DATASET_ID, backup_dir, logger) |
149 | 216 | if not custom_run |
150 | 217 | else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True) |
151 | 218 | ) |
152 | 219 |
153 | 220 | keep_columns = list(PRELIM_TYPE_DICT.keys()) |
| 221 | + recently_updated = True if custom_run else check_last_updated(socrata_token, PRELIM_DATASET_ID, logger) |
154 | 222 |
155 | | - if not df.empty: |
156 | | - create_backup_csv(df, backup_dir, custom_run, sensor="prelim", logger=logger) |
157 | | - |
| 223 | + if not df.empty and recently_updated: |
158 | 224 | df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"}) |
| 225 | + filtered_type_dict = copy.deepcopy(PRELIM_TYPE_DICT) |
159 | 226 |
160 | 227 | for signal, col_name in PRELIM_SIGNALS_MAP.items(): |
161 | | - df[signal] = df[col_name] |
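| | +            # older backups don't have certain columns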
| 228 | + try: |
| 229 | + df[signal] = df[col_name] |
| 230 | + except KeyError: |
| 231 | + logger.info("column not available in data", col_name=col_name, signal=signal) |
| 232 | + keep_columns.remove(signal) |
| 233 | + del filtered_type_dict[signal] |
162 | 234 |
163 | 235 | df = df[keep_columns] |
164 | | - df = df.astype(PRELIM_TYPE_DICT) |
| 236 | + df = df.astype(filtered_type_dict) |
| 237 | + |
165 | 238 | df["geo_id"] = df["geo_id"].str.lower() |
166 | 239 | df.loc[df["geo_id"] == "usa", "geo_id"] = "us" |
167 | 240 | else: |