 from typing import Optional, Tuple, Dict, Any, List
 
 from ..logger import get_logger
-from ..config import MIN_FREE_MEMORY
 
 # Get logger
 logger = get_logger("locallab.utils.system")
 
+# System constants
+MIN_FREE_MEMORY = 2000  # Minimum required free memory in MB
+MIN_GPU_MEMORY = 4000  # Minimum required GPU memory in MB
 
 def get_system_memory() -> Tuple[int, int]:
     """Get system memory information in MB"""
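Note on the hunk above: `MIN_FREE_MEMORY` is no longer imported from the config module; it is now defined locally, alongside a new `MIN_GPU_MEMORY` floor. Call sites that imported the constant from its old location would need to follow it. A minimal sketch, assuming the package root is importable as `locallab` (consistent with the logger name):

```python
# Old import path, removed by this commit:
#   from locallab.config import MIN_FREE_MEMORY

# New location of the constants:
from locallab.utils.system import MIN_FREE_MEMORY, MIN_GPU_MEMORY

assert MIN_FREE_MEMORY == 2000  # MB, per this diff
assert MIN_GPU_MEMORY == 4000   # MB, per this diff
```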
@@ -28,84 +30,41 @@ def get_system_memory() -> Tuple[int, int]:
     free_memory = vm.available // (1024 * 1024)  # Convert to MB
     return total_memory, free_memory
 
-
 def get_gpu_memory() -> Optional[Tuple[int, int]]:
-    """Get GPU memory information in MB if available"""
+    """Get GPU memory information in MB"""
     if not TORCH_AVAILABLE or not torch.cuda.is_available():
         return None
-
-    # First try nvidia-ml-py3 (nvidia_smi)
-    try:
-        import nvidia_smi
-        nvidia_smi.nvmlInit()
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
 
-        total_memory = info.total // (1024 * 1024)  # Convert to MB
-        free_memory = info.free // (1024 * 1024)  # Convert to MB
-
-        nvidia_smi.nvmlShutdown()
+    try:
+        device = torch.cuda.current_device()
+        total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)  # Convert to MB
+        free_memory = total_memory - torch.cuda.memory_reserved(device) // (1024 * 1024)  # reserved already includes allocated, so use it alone to avoid double-counting
         return total_memory, free_memory
-    except ImportError:
-        # If nvidia_smi not available, log at debug level to avoid noise
-        logger.debug("nvidia-ml-py3 not installed, falling back to torch for GPU info")
-        # Fall back to torch for basic info
-        try:
-            # Get basic info from torch
-            device = torch.cuda.current_device()
-            total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)
-            # Note: torch doesn't provide free memory info easily, so we estimate
-            # by allocating a tensor and seeing what's available
-            torch.cuda.empty_cache()
-            free_memory = total_memory  # Optimistic starting point
-
-            # Rough estimate - we can't get exact free memory from torch easily
-            return total_memory, free_memory
-        except Exception as torch_error:
-            logger.debug(f"Torch GPU memory check also failed: {str(torch_error)}")
-            return None
     except Exception as e:
-        logger.debug(f"Failed to get detailed GPU memory info: {str(e)}")
-        # Fall back to torch for basic info (same as ImportError case)
-        try:
-            device = torch.cuda.current_device()
-            total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)
-            torch.cuda.empty_cache()
-            free_memory = total_memory  # Optimistic estimate
-            return total_memory, free_memory
-        except Exception:
-            return None
-
+        logger.warning(f"Failed to get GPU memory info: {e}")
+        return None
 
-def check_resource_availability(required_memory: int) -> bool:
-    """Check if system has enough resources for the requested operation"""
+def check_resource_availability(required_memory: int = MIN_FREE_MEMORY) -> bool:
+    """Check if system has enough resources"""
     _, free_memory = get_system_memory()
-
-    # Check system memory
-    if free_memory < MIN_FREE_MEMORY:
-        logger.warning(f"Low system memory: {free_memory}MB available")
+    if free_memory < required_memory:
         return False
-
-    # If GPU is available, check GPU memory
-    if TORCH_AVAILABLE and torch.cuda.is_available():
-        gpu_memory = get_gpu_memory()
-        if gpu_memory:
-            total_gpu, free_gpu = gpu_memory
-            if free_gpu < required_memory:
-                logger.warning(f"Insufficient GPU memory: {free_gpu}MB available, {required_memory}MB required")
-                return False
-
+
+    gpu_mem = get_gpu_memory()
+    if gpu_mem is not None:
+        _, free_gpu = gpu_mem
+        if free_gpu < MIN_GPU_MEMORY:
+            return False
+
     return True
 
-
 def get_device() -> str:
     """Get the device to use for computations."""
     if TORCH_AVAILABLE and torch.cuda.is_available():
         return "cuda"
     else:
         return "cpu"
 
-
 def format_model_size(size_in_bytes: int) -> str:
     """Format model size in human-readable format"""
     for unit in ['B', 'KB', 'MB', 'GB']:
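A caveat on the rewritten `get_gpu_memory()` in the hunk above: `torch.cuda.memory_reserved()` only tracks this process's caching allocator, so the free-memory estimate cannot see allocations made by other processes sharing the GPU. Newer PyTorch releases (1.10+) expose `torch.cuda.mem_get_info()`, which queries the driver via `cudaMemGetInfo` and reports device-wide numbers. A minimal sketch of that variant; the helper name is illustrative and not part of this commit:

```python
from typing import Optional, Tuple

import torch


def get_gpu_memory_from_driver() -> Optional[Tuple[int, int]]:
    """Illustrative variant: ask the CUDA driver for device-wide memory stats."""
    if not torch.cuda.is_available():
        return None
    # mem_get_info() returns (free_bytes, total_bytes) for the current device
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    return total_bytes // (1024 * 1024), free_bytes // (1024 * 1024)  # (total MB, free MB)
```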
@@ -114,7 +73,6 @@ def format_model_size(size_in_bytes: int) -> str:
         size_in_bytes /= 1024
     return f"{size_in_bytes:.2f}TB"
 
-
 def get_system_resources() -> Dict[str, Any]:
     """Get system resource information"""
     resources = {
@@ -146,7 +104,6 @@ def get_system_resources() -> Dict[str, Any]:
 
     return resources
 
-
 def get_cpu_info() -> Dict[str, Any]:
     """Get information about the CPU."""
     return {
@@ -155,7 +112,6 @@ def get_cpu_info() -> Dict[str, Any]:
         "usage": psutil.cpu_percent(interval=0.1)
     }
 
-
 def get_gpu_info() -> List[Dict[str, Any]]:
     """Get detailed information about all available GPUs.
 
@@ -231,7 +187,6 @@ def get_gpu_info() -> List[Dict[str, Any]]:
 
     return gpu_info
 
-
 def get_memory_info() -> Dict[str, Any]:
     """Get information about the system memory."""
     mem = psutil.virtual_memory()
@@ -242,8 +197,7 @@ def get_memory_info() -> Dict[str, Any]:
         "percent": mem.percent
    }
 
-
 # Add this function for backward compatibility
 def get_system_info() -> Dict[str, Any]:
     """Get system resource information (alias for get_system_resources)"""
-    return get_system_resources()
\ No newline at end of file
+    return get_system_resources()
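For reference, a short sketch of how the refactored helpers fit together at a call site, assuming the module is importable as `locallab.utils.system` (matching the logger name); the call site itself is hypothetical:

```python
from locallab.utils.system import (
    check_resource_availability,
    get_gpu_memory,
    get_system_memory,
)

total_mb, free_mb = get_system_memory()
print(f"System RAM: {free_mb} MB free of {total_mb} MB")

gpu = get_gpu_memory()
if gpu is not None:
    gpu_total_mb, gpu_free_mb = gpu
    print(f"GPU memory: {gpu_free_mb} MB free of {gpu_total_mb} MB")

# required_memory now defaults to MIN_FREE_MEMORY (2000 MB); the GPU check
# applies the MIN_GPU_MEMORY floor (4000 MB) internally.
if not check_resource_availability():
    raise RuntimeError("Insufficient free memory to load a model")
```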