cmu-delphi
diff --git a/‎_delphi_utils_python/data_proc/geomap/chng_county_groups.csv‎
Lines changed: 402 additions & 0 deletions b/‎_delphi_utils_python/data_proc/geomap/chng_county_groups.csv‎
Lines changed: 402 additions & 0 deletions
diff --git a/‎_delphi_utils_python/data_proc/geomap/geo_data_proc.py‎
Lines changed: 175 additions & 0 deletions b/‎_delphi_utils_python/data_proc/geomap/geo_data_proc.py‎
Lines changed: 175 additions & 0 deletions
@@ -33,15 +33,18 @@
 FIPS_PUERTO_RICO_POPULATION_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?"
 STATE_HHS_FILE = "hhs.txt"
 ZIP_POP_MISSING_FILE = "zip_pop_filling.csv"
+CHNG_COUNTY_GROUPS_FILE = "chng_county_groups.csv"
 
 # Out files
 FIPS_STATE_OUT_FILENAME = "fips_state_table.csv"
 FIPS_MSA_OUT_FILENAME = "fips_msa_table.csv"
 FIPS_HRR_OUT_FILENAME = "fips_hrr_table.csv"
 FIPS_ZIP_OUT_FILENAME = "fips_zip_table.csv"
 FIPS_HHS_FILENAME = "fips_hhs_table.csv"
+FIPS_CHNGFIPS_OUT_FILENAME = "fips_chng-fips_table.csv"
 FIPS_POPULATION_OUT_FILENAME = "fips_pop.csv"
 
+CHNGFIPS_STATE_OUT_FILENAME = "chng-fips_state_table.csv"
 ZIP_HSA_OUT_FILENAME = "zip_hsa_table.csv"
 ZIP_HRR_OUT_FILENAME = "zip_hrr_table.csv"
 ZIP_FIPS_OUT_FILENAME = "zip_fips_table.csv"
@@ -475,6 +478,176 @@ def derive_zip_hhs_crosswalk():
     zip_state.sort_values(["zip", "hhs"]).to_csv(join(OUTPUT_DIR, ZIP_HHS_FILENAME), index=False)
 
 
+def derive_fips_chngfips_crosswalk():
+    """Build a crosswalk table for FIPS to CHNG FIPS."""
+    if not isfile(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)):
+        derive_fips_state_crosswalk()
+
+    assign_county_groups()
+    county_groups = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)
+    # Split list of county FIPS codes into separate columns.
+    county_groups = pd.concat(
+            [county_groups, county_groups.fips_list.str.split("|", expand=True)],
+            axis=1
+        ).drop(
+            columns = "fips_list"
+        )
+
+    # Change to long format.
+    county_groups = pd.melt(
+            county_groups,
+            id_vars = ["state_fips", "group"],
+            var_name = "county_num",
+            value_name = "fips"
+        ).drop(
+            columns="county_num"
+        ).dropna()
+
+    county_groups["state_fips"] = county_groups["state_fips"].str.zfill(2)
+    county_groups["group"] = county_groups["group"].str.zfill(2)
+    county_groups["fips"] = county_groups["fips"].str.zfill(5).astype("string")
+    # Combine state codes and group ids into a single FIPS code.
+    county_groups["chng-fips"] = county_groups["state_fips"] + "g" + county_groups["group"]
+
+    county_groups = county_groups[["fips", "chng-fips"]]
+    fips_to_state = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype="string", index_col=False)
+
+    # Get all the fips that aren't included in the chng groupings.
+    extra_fips_list = list(set(fips_to_state.fips) - set(county_groups.fips))
+    # Normal fips codes and CHNG fips codes are the same for ungrouped counties.
+    extra_fips_df = pd.DataFrame({"fips" : extra_fips_list, "chng-fips" : extra_fips_list}, dtype="string")
+
+    # Combine grouped and ungrouped counties.
+    pd.concat(
+        [county_groups, extra_fips_df]
+    ).sort_values(
+        ["fips", "chng-fips"]
+    ).to_csv(
+        join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME), index=False
+    )
+
+
+def derive_chngfips_state_crosswalk():
+    """Build a crosswalk table for FIPS to CHNG FIPS."""
+    if not isfile(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)):
+        derive_fips_state_crosswalk()
+
+    if not isfile(join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME)):
+        derive_fips_chngfips_crosswalk()
+
+    fips_to_group = pd.read_csv(join(OUTPUT_DIR, FIPS_CHNGFIPS_OUT_FILENAME), dtype="string", index_col=False)
+    fips_to_state = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype="string", index_col=False)
+
+    group_to_state = fips_to_group.join(
+            fips_to_state.set_index("fips"), on="fips", how="left"
+        ).drop(
+            columns = "fips"
+        ).drop_duplicates(
+        ).sort_values(
+            ["chng-fips", "state_code"]
+        )
+    group_to_state.to_csv(join(OUTPUT_DIR, CHNGFIPS_STATE_OUT_FILENAME), index=False)
+
+
+def fetch_county_groups_spreadsheet():
+    # County mapping file is derived from
+    # https://docs.google.com/spreadsheets/d/1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0/edit#gid=871427657
+    sheet_id = "1PEce4CjjHbRM1Z5xEMNI6Xsq_b2kkCh0"
+    sheet_name = "groupings"
+    # Request sheet in CSV format via tag in URL.
+    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
+
+    county_groups = pd.read_csv(
+            url, dtype="string", index_col=False
+        ).dropna(
+            how="all", axis=1
+        )
+    county_groups["state FIPS"] = county_groups["state FIPS"].astype(int)
+
+    # Counties belonging to each group are listed (as FIPS codes) in the "county
+    # FIPS grouping" column, concatenated and separated by the pipe "|".
+    new_names = {
+        "state FIPS": "state_fips",
+        "county FIPS grouping": "fips_list"
+    }
+
+    county_groups = county_groups.rename(
+            columns=new_names
+        )[new_names.values()]
+
+    return county_groups
+
+
+def assign_county_groups():
+    county_groups = fetch_county_groups_spreadsheet()
+
+    # If a county groups mapping file already exists in `data_proc/geomap`, we
+    # have to be careful to not reassign a group number to a different group.
+    # Group numbers must remain fixed, even if a given county group is no longer
+    # being used.
+    if isfile(CHNG_COUNTY_GROUPS_FILE):
+        old_county_groups = pd.read_csv(CHNG_COUNTY_GROUPS_FILE, dtype="string", index_col=False)
+        old_county_groups.group = old_county_groups.group.astype(int)
+        old_county_groups.state_fips = old_county_groups.state_fips.astype(int)
+
+        # Remove rows from county_groups if that `fips_list` value already
+        # exists in old_county_groups.
+        county_groups = county_groups[
+            ~county_groups.fips_list.isin(old_county_groups.fips_list)
+        ]
+
+        # If grouping file has no new rows, no need to process again.
+        if county_groups.empty:
+            return
+        # Grouping spreadsheet contains rows not seen in old, on-disk county
+        # groupings file. Combining the two is delicate. While the code below
+        # appears to work, it has not been formally tested and could be
+        # invalid for even small changes to the format of the input county
+        # groupings file.
+        else:
+            raise NotImplementedError(
+                "Can't combine old and new county groupings automatically, "
+                "code below is not tested or robust to changes in input format."
+                "We recommend manually working with the code below and the new"
+                "data in a REPL."
+            )
+
+        # Assign an incrementing integer to be the group id of each remaining
+        # county grouping within a state using the given sort order.
+        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1
+
+        # Find max group number by state in old_county_groups, join on, and
+        # add max group number to group number.
+        max_group_by_state = old_county_groups.groupby(
+                "state_fips"
+            ).group.max(
+            ).reset_index(
+            ).rename(
+                columns = {"group": "max_group"}
+            )
+        county_groups = county_groups.join(
+                max_group_by_state.set_index("state_fips"),
+                how="left",
+                on="state_fips"
+            ).assign(
+                group = lambda x: x.group + x.max_group
+            ).drop(
+                ["max_group"], axis=1
+            )
+
+        # Combine old_county_groups and county_groups
+        county_groups = pd.concat([old_county_groups, county_groups])
+    else:
+        # Group numbers are 1-indexed.
+        county_groups["group"] = county_groups.groupby("state_fips").cumcount() + 1
+
+    county_groups.sort_values(
+            ["state_fips"], kind="stable"
+        ).to_csv(
+            CHNG_COUNTY_GROUPS_FILE, index=False
+        )
+
+
 def clear_dir(dir_path: str):
     for fname in listdir(dir_path):
         remove(join(dir_path, fname))
@@ -501,3 +674,5 @@ def clear_dir(dir_path: str):
     derive_zip_population_table()
     derive_fips_hhs_crosswalk()
     derive_zip_hhs_crosswalk()
+    derive_fips_chngfips_crosswalk()
+    derive_chngfips_state_crosswalk()