@@ -124,6 +124,33 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)
124124
125125}
126126
#ifdef Py_GIL_DISABLED

// If we are deferring collection of more than this amount of memory for
// mimalloc pages, advance the write sequence.  Advancing allows these
// pages to be re-used in a different thread or for a different size class.
// Parenthesized so the macro expands safely inside larger expressions.
#define QSBR_PAGE_MEM_LIMIT (4096*20)

// Return true if the global write sequence should be advanced for a mimalloc
// page that is deferred from collection.
//
// Accumulates the page's memory footprint into the per-thread counter
// `qsbr->deferred_page_memory`; once the running total (or a single page by
// itself) exceeds QSBR_PAGE_MEM_LIMIT, the counter is reset and true is
// returned so the caller advances the shared write sequence.
static bool
should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
{
    size_t bsize = mi_page_block_size(page);
    size_t page_size = page->capacity * bsize;
    if (page_size > QSBR_PAGE_MEM_LIMIT) {
        // A single oversized page: advance immediately, don't accumulate.
        qsbr->deferred_page_memory = 0;
        return true;
    }
    qsbr->deferred_page_memory += page_size;
    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
        qsbr->deferred_page_memory = 0;
        return true;
    }
    return false;
}
#endif
127154static bool
128155_PyMem_mi_page_maybe_free (mi_page_t * page , mi_page_queue_t * pq , bool force )
129156{
@@ -139,7 +166,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
139166
140167 _PyMem_mi_page_clear_qsbr (page );
141168 page -> retire_expire = 0 ;
142- page -> qsbr_goal = _Py_qsbr_deferred_advance (tstate -> qsbr );
169+
170+ if (should_advance_qsbr_for_page (tstate -> qsbr , page )) {
171+ page -> qsbr_goal = _Py_qsbr_advance (tstate -> qsbr -> shared );
172+ }
173+ else {
174+ page -> qsbr_goal = _Py_qsbr_shared_next (tstate -> qsbr -> shared );
175+ }
176+
143177 llist_insert_tail (& tstate -> mimalloc .page_list , & page -> qsbr_node );
144178 return false;
145179 }
@@ -1141,8 +1175,44 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
11411175 }
11421176}
11431177
1178+
#ifdef Py_GIL_DISABLED

// For deferred advance on free: the number of deferred items before advancing
// the write sequence.  This is based on WORK_ITEMS_PER_CHUNK.  We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, advance the write sequence.  This
// helps limit memory usage due to QSBR delaying frees too long.
// Parenthesized so the macro expands safely inside larger expressions.
#define QSBR_FREE_MEM_LIMIT (1024*1024)

// Return true if the global write sequence should be advanced for a deferred
// memory free.
//
// Side effect: whenever this decides to advance, it also sets
// `qsbr->should_process` so the deferred work queue is processed soon
// (normally from the eval breaker), and resets both per-thread counters.
static bool
should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
{
    if (size > QSBR_FREE_MEM_LIMIT) {
        // A single huge free: advance right away rather than accumulate.
        qsbr->deferred_count = 0;
        qsbr->deferred_memory = 0;
        qsbr->should_process = true;
        return true;
    }
    qsbr->deferred_count++;
    qsbr->deferred_memory += size;
    if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
            qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
        qsbr->deferred_count = 0;
        qsbr->deferred_memory = 0;
        qsbr->should_process = true;
        return true;
    }
    return false;
}
#endif
1213+
11441214static void
1145- free_delayed (uintptr_t ptr )
1215+ free_delayed (uintptr_t ptr , size_t size )
11461216{
11471217#ifndef Py_GIL_DISABLED
11481218 free_work_item (ptr , NULL , NULL );
@@ -1200,23 +1270,32 @@ free_delayed(uintptr_t ptr)
12001270 }
12011271
12021272 assert (buf != NULL && buf -> wr_idx < WORK_ITEMS_PER_CHUNK );
1203- uint64_t seq = _Py_qsbr_deferred_advance (tstate -> qsbr );
1273+ uint64_t seq ;
1274+ if (should_advance_qsbr_for_free (tstate -> qsbr , size )) {
1275+ seq = _Py_qsbr_advance (tstate -> qsbr -> shared );
1276+ }
1277+ else {
1278+ seq = _Py_qsbr_shared_next (tstate -> qsbr -> shared );
1279+ }
12041280 buf -> array [buf -> wr_idx ].ptr = ptr ;
12051281 buf -> array [buf -> wr_idx ].qsbr_goal = seq ;
12061282 buf -> wr_idx ++ ;
12071283
12081284 if (buf -> wr_idx == WORK_ITEMS_PER_CHUNK ) {
1285+ // Normally the processing of delayed items is done from the eval
1286+ // breaker. Processing here is a safety measure to ensure too much
1287+ // work does not accumulate.
12091288 _PyMem_ProcessDelayed ((PyThreadState * )tstate );
12101289 }
12111290#endif
12121291}
12131292
// Queue `ptr` for a QSBR-deferred free, recording `size` bytes of deferred
// memory.  NULL is a no-op.  The low bit of the pointer must be clear: it is
// reserved as the tag distinguishing deferred decrefs from deferred frees.
void
_PyMem_FreeDelayed(void *ptr, size_t size)
{
    assert(((uintptr_t)ptr & 0x01) == 0);
    if (ptr == NULL) {
        return;
    }
    free_delayed((uintptr_t)ptr, size);
}
12221301
@@ -1226,7 +1305,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
12261305{
12271306 assert (!((uintptr_t )ptr & 0x01 ));
12281307 if (ptr != NULL ) {
1229- free_delayed (((uintptr_t )ptr )|0x01 );
1308+ // We use 0 as the size since we don't have an easy way to know the
1309+ // actual size. If we are freeing many objects, the write sequence
1310+ // will be advanced due to QSBR_DEFERRED_LIMIT.
1311+ free_delayed (((uintptr_t )ptr )|0x01 , 0 );
12301312 }
12311313}
12321314#endif
@@ -1302,6 +1384,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
13021384 PyInterpreterState * interp = tstate -> interp ;
13031385 _PyThreadStateImpl * tstate_impl = (_PyThreadStateImpl * )tstate ;
13041386
1387+ tstate_impl -> qsbr -> should_process = false;
1388+
13051389 // Process thread-local work
13061390 process_queue (& tstate_impl -> mem_free_queue , tstate_impl , true, NULL , NULL );
13071391
0 commit comments