From 264d4dee6130666c458298e3333af572dc67c15f Mon Sep 17 00:00:00 2001
From: Mohit Sharma <sharmamohit85627@gmail.com>
Date: Tue, 15 Oct 2024 21:38:49 +0530
Subject: [PATCH] repeats.py: add argparse CLI and main(); re-indent to 4 spaces

---
 src/utils/repeats.py | 80 ++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 37 deletions(-)

diff --git a/src/utils/repeats.py b/src/utils/repeats.py
index ebe1475..bd3ac9a 100644
--- a/src/utils/repeats.py
+++ b/src/utils/repeats.py
@@ -53,11 +53,13 @@
                "Jacinto C. Nascimento",
                "Diogo Araújo"]
 
+
 import os
 import logging
 import pandas as pd
 import warnings
 from urllib3.exceptions import NotOpenSSLWarning
+import argparse
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -65,46 +67,50 @@
 # Suppress warnings
 warnings.filterwarnings("ignore", category=NotOpenSSLWarning)
 
-# Mapping file name
-mapping_fn = "mamo_patients_mapping_data.csv"
-
-# Define paths
-root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
-mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", mapping_fn)
-
-# Debugging output for paths
-logging.info(f"Mapping CSV: {mapping_csv}")
+def find_repeats(csv_file, columns):
+    """Print the rows of csv_file whose value in any of the given columns occurs more than once."""
+    logging.info(f"Loading data from {csv_file}")
+    try:
+        df = pd.read_csv(csv_file)
+    except Exception as e:  # best-effort script: report the failed load and stop
+        logging.error(f"An error occurred: {e}")
+        return
+    for column in columns:
+        if column not in df.columns:
+            # Columns come from the command line; a typo must not abort the remaining checks.
+            logging.error(f"An error occurred: column {column!r} not found in {csv_file}")
+            continue
+        logging.info(f"Counting occurrences of values in column: {column}")
+        value_counts = df[column].value_counts()
+        logging.info(f"Filtering repeated values in column: {column}")
+        repeated_values = value_counts[value_counts > 1].index
+        if repeated_values.empty:
+            logging.info(f"No repeated values found in column: {column}")
+            continue
+        logging.info(f"Printing rows with repeated values in column: {column}")
+        print(f"\nRepeated rows in column '{column}':\n")
+        print(df[df[column].isin(repeated_values)])
+
+def main(csv_filename, columns_to_check):
+    logging.info("Starting repeat detection...")
+    # csv_filename is looked up in data-images-breast/data/mapping, two levels above this file's directory.
+    # Define paths
+    root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
+    mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", csv_filename)
 
-# Define the column names you want to check for repeats
-columns_to_check = ['anonymized_patient_id', 'real_patient_id']  # List of columns to check
+    # Debugging output for paths
+    logging.info(f"Mapping CSV: {mapping_csv}")
 
-def find_repeats(csv_file, columns):
-  """Find and print rows in the CSV where the specified columns have repeated values."""
-  logging.info(f"Loading data from {csv_file}")
-  try:
-    df = pd.read_csv(csv_file)
+    find_repeats(mapping_csv, columns_to_check)
     
-    for column in columns:
-      logging.info(f"Counting occurrences of values in column: {column}")
-      value_counts = df[column].value_counts()
-      
-      logging.info(f"Filtering repeated values in column: {column}")
-      repeated_values = value_counts[value_counts > 1].index
-      
-      if repeated_values.empty:
-        logging.info(f"No repeated values found in column: {column}")
-      else:
-        logging.info(f"Printing rows with repeated values in column: {column}")
-        repeated_rows = df[df[column].isin(repeated_values)]
-        print(f"\nRepeated rows in column '{column}':\n")
-        print(repeated_rows)
-        
-  except Exception as e:
-    logging.error(f"An error occurred: {e}")
+    logging.info("Repeat detection complete!")
 
 if __name__ == '__main__':
-  logging.info("Starting repeat detection...")
-  find_repeats(mapping_csv, columns_to_check)
-  logging.info("Repeat detection complete!")
+    parser = argparse.ArgumentParser(description="Detect repeated values in specified columns of a CSV file.")
+    parser.add_argument("csv_filename", help="Name of the CSV file to analyze")  # joined by main() onto the data-images-breast/data/mapping directory
+    parser.add_argument("columns", nargs='+', help="Columns to check for repeated values")  # nargs='+': one or more column names
+    args = parser.parse_args()
+    # e.g. repeats.py mamo_patients_mapping_data.csv anonymized_patient_id real_patient_id (the values that used to be hard-coded)
+    main(args.csv_filename, args.columns)
 
-# End of file
\ No newline at end of file
+# End of file