From 264d4dee6130666c458298e3333af572dc67c15f Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 15 Oct 2024 21:38:49 +0530 Subject: [PATCH] fixed repeat function --- src/utils/repeats.py | 80 ++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/utils/repeats.py b/src/utils/repeats.py index ebe1475..bd3ac9a 100644 --- a/src/utils/repeats.py +++ b/src/utils/repeats.py @@ -53,11 +53,13 @@ "Jacinto C. Nascimento", "Diogo Araújo"] + import os import logging import pandas as pd import warnings from urllib3.exceptions import NotOpenSSLWarning +import argparse # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -65,46 +67,50 @@ # Suppress warnings warnings.filterwarnings("ignore", category=NotOpenSSLWarning) -# Mapping file name -mapping_fn = "mamo_patients_mapping_data.csv" - -# Define paths -root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) -mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", mapping_fn) - -# Debugging output for paths -logging.info(f"Mapping CSV: {mapping_csv}") +def find_repeats(csv_file, columns): + """Find and print rows in the CSV where the specified columns have repeated values.""" + logging.info(f"Loading data from {csv_file}") + try: + df = pd.read_csv(csv_file) + + for column in columns: + logging.info(f"Counting occurrences of values in column: {column}") + value_counts = df[column].value_counts() + + logging.info(f"Filtering repeated values in column: {column}") + repeated_values = value_counts[value_counts > 1].index + + if repeated_values.empty: + logging.info(f"No repeated values found in column: {column}") + else: + logging.info(f"Printing rows with repeated values in column: {column}") + repeated_rows = df[df[column].isin(repeated_values)] + print(f"\nRepeated rows in column '{column}':\n") + print(repeated_rows) + + except Exception as e: + logging.error(f"An error occurred: {e}") + +def main(csv_filename, columns_to_check): + logging.info("Starting repeat detection...") + + # Define paths + root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) + mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", csv_filename) -# Define the column names you want to check for repeats -columns_to_check = ['anonymized_patient_id', 'real_patient_id'] # List of columns to check + # Debugging output for paths + logging.info(f"Mapping CSV: {mapping_csv}") -def find_repeats(csv_file, columns): - """Find and print rows in the CSV where the specified columns have repeated values.""" - logging.info(f"Loading data from {csv_file}") - try: - df = pd.read_csv(csv_file) + find_repeats(mapping_csv, columns_to_check) - for column in columns: - logging.info(f"Counting occurrences of values in column: {column}") - value_counts = df[column].value_counts() - - logging.info(f"Filtering repeated values in column: {column}") - repeated_values = value_counts[value_counts > 1].index - - if repeated_values.empty: - logging.info(f"No repeated values found in column: {column}") - else: - logging.info(f"Printing rows with repeated values in column: {column}") - repeated_rows = df[df[column].isin(repeated_values)] - print(f"\nRepeated rows in column '{column}':\n") - print(repeated_rows) - - except Exception as e: - logging.error(f"An error occurred: {e}") + logging.info("Repeat detection complete!") if __name__ == '__main__': - logging.info("Starting repeat detection...") - find_repeats(mapping_csv, columns_to_check) - logging.info("Repeat detection complete!") + parser = argparse.ArgumentParser(description="Detect repeated values in specified columns of a CSV file.") + parser.add_argument("csv_filename", help="Name of the CSV file to analyze") + parser.add_argument("columns", nargs='+', help="Columns to check for repeated values") + args = parser.parse_args() + + main(args.csv_filename, args.columns) -# End of file \ No newline at end of file +# End of file