
Commit 8c9db28

Merge branch 'main' into indexes-html-repr
2 parents 8b1bc7f + 89f7de8

12 files changed: +288, -114 lines

doc/whats-new.rst

Lines changed: 9 additions & 2 deletions
```diff
@@ -45,6 +45,10 @@ Deprecations
 Bug fixes
 ~~~~~~~~~
 
+- Explicitly opening a file multiple times (e.g., after modifying it on disk)
+  now reopens the file from scratch for h5netcdf and scipy netCDF backends,
+  rather than reusing a cached version (:issue:`4240`, :issue:`4862`).
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
 
 Documentation
 ~~~~~~~~~~~~~
@@ -59,8 +63,11 @@ Internal Changes
 ~~~~~~~~~~~~~~~~
 - Doctests fail on any warnings (:pull:`7166`)
   By `Maximilian Roos <https://github.com/max-sixty>`_.
-
-
+- Improve import time by lazy loading ``dask.distributed`` (:pull:`7172`).
+- Explicitly specify ``longdouble=False`` in :py:func:`cftime.date2num` when
+  encoding times to preserve existing behavior and prevent future errors when it
+  is eventually set to ``True`` by default in cftime (:pull:`7171`). By
+  `Spencer Clark <https://github.com/spencerkclark>`_.
 
 .. _whats-new.2022.10.0:
 
```
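To make the first bug-fix entry concrete, here is a minimal sketch of the behavior it describes, assuming the h5netcdf backend is installed; the file name `cache_demo.nc` is a throwaway chosen purely for illustration:

```python
import numpy as np
import xarray as xr

# Write an initial file, then replace it on disk with a larger one.
xr.Dataset({"x": ("t", np.arange(3))}).to_netcdf("cache_demo.nc", engine="h5netcdf")
ds = xr.open_dataset("cache_demo.nc", engine="h5netcdf")
ds.close()

xr.Dataset({"x": ("t", np.arange(6))}).to_netcdf("cache_demo.nc", engine="h5netcdf")

# With this fix, explicitly re-opening the file reads it from scratch, so the
# new size is visible instead of the contents of a stale cached handle.
ds = xr.open_dataset("cache_demo.nc", engine="h5netcdf")
print(ds.sizes)  # Frozen({'t': 6})
ds.close()
```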

xarray/backends/file_manager.py

Lines changed: 45 additions & 33 deletions
```diff
@@ -3,21 +3,21 @@
 import contextlib
 import io
 import threading
+import uuid
 import warnings
-from typing import Any
+from typing import Any, Hashable
 
 from ..core import utils
 from ..core.options import OPTIONS
 from .locks import acquire
 from .lru_cache import LRUCache
 
 # Global cache for storing open files.
-FILE_CACHE: LRUCache[str, io.IOBase] = LRUCache(
+FILE_CACHE: LRUCache[Any, io.IOBase] = LRUCache(
     maxsize=OPTIONS["file_cache_maxsize"], on_evict=lambda k, v: v.close()
 )
 assert FILE_CACHE.maxsize, "file cache must be at least size one"
 
-
 REF_COUNTS: dict[Any, int] = {}
 
 _DEFAULT_MODE = utils.ReprObject("<unused>")
@@ -85,12 +85,13 @@ def __init__(
         kwargs=None,
         lock=None,
         cache=None,
+        manager_id: Hashable | None = None,
         ref_counts=None,
     ):
-        """Initialize a FileManager.
+        """Initialize a CachingFileManager.
 
-        The cache and ref_counts arguments exist solely to facilitate
-        dependency injection, and should only be set for tests.
+        The cache, manager_id and ref_counts arguments exist solely to
+        facilitate dependency injection, and should only be set for tests.
 
         Parameters
         ----------
@@ -120,6 +121,8 @@ def __init__(
             global variable and contains non-picklable file objects, an
             unpickled FileManager objects will be restored with the default
             cache.
+        manager_id : hashable, optional
+            Identifier for this CachingFileManager.
         ref_counts : dict, optional
             Optional dict to use for keeping track the number of references to
             the same file.
@@ -129,13 +132,17 @@ def __init__(
         self._mode = mode
         self._kwargs = {} if kwargs is None else dict(kwargs)
 
-        self._default_lock = lock is None or lock is False
-        self._lock = threading.Lock() if self._default_lock else lock
+        self._use_default_lock = lock is None or lock is False
+        self._lock = threading.Lock() if self._use_default_lock else lock
 
         # cache[self._key] stores the file associated with this object.
         if cache is None:
             cache = FILE_CACHE
         self._cache = cache
+        if manager_id is None:
+            # Each call to CachingFileManager should separately open files.
+            manager_id = str(uuid.uuid4())
+        self._manager_id = manager_id
         self._key = self._make_key()
 
         # ref_counts[self._key] stores the number of CachingFileManager objects
@@ -153,6 +160,7 @@ def _make_key(self):
             self._args,
             "a" if self._mode == "w" else self._mode,
             tuple(sorted(self._kwargs.items())),
+            self._manager_id,
         )
         return _HashedSequence(value)
 
@@ -223,20 +231,14 @@ def close(self, needs_lock=True):
         if file is not None:
             file.close()
 
-    def __del__(self):
-        # If we're the only CachingFileManger referencing a unclosed file, we
-        # should remove it from the cache upon garbage collection.
+    def __del__(self) -> None:
+        # If we're the only CachingFileManager referencing an unclosed file,
+        # remove it from the cache upon garbage collection.
         #
-        # Keeping our own count of file references might seem like overkill,
-        # but it's actually pretty common to reopen files with the same
-        # variable name in a notebook or command line environment, e.g., to
-        # fix the parameters used when opening a file:
-        # >>> ds = xarray.open_dataset('myfile.nc')
-        # >>> ds = xarray.open_dataset('myfile.nc', decode_times=False)
-        # This second assignment to "ds" drops CPython's ref-count on the first
-        # "ds" argument to zero, which can trigger garbage collections. So if
-        # we didn't check whether another object is referencing 'myfile.nc',
-        # the newly opened file would actually be immediately closed!
+        # We keep track of our own reference count because we don't want to
+        # close files if another identical file manager needs it. This can
+        # happen if a CachingFileManager is pickled and unpickled without
+        # closing the original file.
         ref_count = self._ref_counter.decrement(self._key)
 
         if not ref_count and self._key in self._cache:
@@ -249,30 +251,40 @@ def __del__(self):
 
             if OPTIONS["warn_for_unclosed_files"]:
                 warnings.warn(
-                    "deallocating {}, but file is not already closed. "
-                    "This may indicate a bug.".format(self),
+                    f"deallocating {self}, but file is not already closed. "
+                    "This may indicate a bug.",
                     RuntimeWarning,
                     stacklevel=2,
                 )
 
     def __getstate__(self):
         """State for pickling."""
-        # cache and ref_counts are intentionally omitted: we don't want to try
-        # to serialize these global objects.
-        lock = None if self._default_lock else self._lock
-        return (self._opener, self._args, self._mode, self._kwargs, lock)
+        # cache is intentionally omitted: we don't want to try to serialize
+        # these global objects.
+        lock = None if self._use_default_lock else self._lock
+        return (
+            self._opener,
+            self._args,
+            self._mode,
+            self._kwargs,
+            lock,
+            self._manager_id,
+        )
 
-    def __setstate__(self, state):
+    def __setstate__(self, state) -> None:
         """Restore from a pickle."""
-        opener, args, mode, kwargs, lock = state
-        self.__init__(opener, *args, mode=mode, kwargs=kwargs, lock=lock)
+        opener, args, mode, kwargs, lock, manager_id = state
+        self.__init__(  # type: ignore
+            opener, *args, mode=mode, kwargs=kwargs, lock=lock, manager_id=manager_id
+        )
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         args_string = ", ".join(map(repr, self._args))
         if self._mode is not _DEFAULT_MODE:
             args_string += f", mode={self._mode!r}"
-        return "{}({!r}, {}, kwargs={})".format(
-            type(self).__name__, self._opener, args_string, self._kwargs
+        return (
+            f"{type(self).__name__}({self._opener!r}, {args_string}, "
+            f"kwargs={self._kwargs}, manager_id={self._manager_id!r})"
         )
 
 
```
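The key mechanics in this diff are the new `manager_id` (a `uuid4` string by default) that becomes part of the cache key, and its round-trip through `__getstate__`/`__setstate__`. A quick sketch of the resulting behavior, touching the private `_key` attribute purely for illustration (`example.txt` is never actually created, since files are opened lazily on first access):

```python
import pickle

from xarray.backends import CachingFileManager

# Two managers for the same file now get distinct cache keys, because each
# one receives its own uuid4-based manager_id by default.
m1 = CachingFileManager(open, "example.txt", mode="w")
m2 = CachingFileManager(open, "example.txt", mode="w")
assert m1._key != m2._key

# Pickling preserves manager_id, so the unpickled copy maps to the same cache
# entry (and reference count) instead of opening the file a second time.
m3 = pickle.loads(pickle.dumps(m1))
assert m3._key == m1._key
```

This is what makes explicit re-opens see fresh file contents while pickled copies of one manager keep sharing a single handle.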

xarray/backends/locks.py

Lines changed: 18 additions & 18 deletions
```diff
@@ -11,11 +11,6 @@
     # no need to worry about serializing the lock
     SerializableLock = threading.Lock  # type: ignore
 
-try:
-    from dask.distributed import Lock as DistributedLock
-except ImportError:
-    DistributedLock = None  # type: ignore
-
 
 # Locks used by multiple backends.
 # Neither HDF5 nor the netCDF-C library are thread-safe.
@@ -41,14 +36,6 @@ def _get_multiprocessing_lock(key):
     return multiprocessing.Lock()
 
 
-_LOCK_MAKERS = {
-    None: _get_threaded_lock,
-    "threaded": _get_threaded_lock,
-    "multiprocessing": _get_multiprocessing_lock,
-    "distributed": DistributedLock,
-}
-
-
 def _get_lock_maker(scheduler=None):
     """Returns an appropriate function for creating resource locks.
 
@@ -61,7 +48,23 @@ def _get_lock_maker(scheduler=None):
     --------
     dask.utils.get_scheduler_lock
     """
-    return _LOCK_MAKERS[scheduler]
+
+    if scheduler is None:
+        return _get_threaded_lock
+    elif scheduler == "threaded":
+        return _get_threaded_lock
+    elif scheduler == "multiprocessing":
+        return _get_multiprocessing_lock
+    elif scheduler == "distributed":
+        # Lazily import distributed, since it can add a significant
+        # amount of time to import
+        try:
+            from dask.distributed import Lock as DistributedLock
+        except ImportError:
+            DistributedLock = None  # type: ignore
+        return DistributedLock
+    else:
+        raise KeyError(scheduler)
 
 
 def _get_scheduler(get=None, collection=None) -> str | None:
@@ -128,15 +131,12 @@ def acquire(lock, blocking=True):
     if blocking:
         # no arguments needed
         return lock.acquire()
-    elif DistributedLock is not None and isinstance(lock, DistributedLock):
-        # distributed.Lock doesn't support the blocking argument yet:
-        # https://github.com/dask/distributed/pull/2412
-        return lock.acquire(timeout=0)
     else:
         # "blocking" keyword argument not supported for:
         # - threading.Lock on Python 2.
         # - dask.SerializableLock with dask v1.0.0 or earlier.
         # - multiprocessing.Lock calls the argument "block" instead.
+        # - dask.distributed.Lock uses the blocking argument as the first one
        return lock.acquire(blocking)
 
 
```
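The pattern at work in `_get_lock_maker` is a lazy optional import: the module-level `try: from dask.distributed import ...` is moved inside the one branch that needs it, so `import xarray` no longer pays for importing `dask.distributed`. A stripped-down sketch of the pattern (the function name here is a stand-in, not xarray API):

```python
def distributed_lock_maker():
    # Import inside the function so the import cost (and the optional
    # dependency) is only paid when a distributed lock is actually requested.
    try:
        from dask.distributed import Lock as DistributedLock
    except ImportError:
        DistributedLock = None
    return DistributedLock
```

The trade-off is that the `isinstance(lock, DistributedLock)` branch in `acquire` can no longer refer to a module-level name, which is why that special case is folded into the generic `lock.acquire(blocking)` call above.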

xarray/coding/times.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -581,7 +581,18 @@ def _encode_datetime_with_cftime(dates, units, calendar):
     dates = dates.astype("M8[us]").astype(datetime)
 
     def encode_datetime(d):
-        return np.nan if d is None else cftime.date2num(d, units, calendar)
+        # Since netCDF files do not support storing float128 values, we ensure
+        # that float64 values are used by setting longdouble=False in date2num.
+        # This try/except logic can be removed when xarray's minimum version of
+        # cftime is at least 1.6.2.
+        try:
+            return (
+                np.nan
+                if d is None
+                else cftime.date2num(d, units, calendar, longdouble=False)
+            )
+        except TypeError:
+            return np.nan if d is None else cftime.date2num(d, units, calendar)
 
     return np.array([encode_datetime(d) for d in dates.ravel()]).reshape(dates.shape)
 
```
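The `try`/`except TypeError` here is a common feature-detection idiom for spanning library versions: call with the new keyword first, and retry without it when an older release rejects the unknown argument. A generic sketch of the idiom, with `encode_value` and `to_num` as hypothetical stand-ins for the wrapper and `cftime.date2num`:

```python
import numpy as np

def encode_value(d, to_num):
    try:
        # Newer library versions accept longdouble=False (float64 output).
        return np.nan if d is None else to_num(d, longdouble=False)
    except TypeError:
        # Older versions raise TypeError for the unknown keyword; fall back.
        return np.nan if d is None else to_num(d)
```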

xarray/conventions.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -141,7 +141,7 @@ def maybe_encode_bools(var):
     ):
         dims, data, attrs, encoding = _var_as_tuple(var)
         attrs["dtype"] = "bool"
-        data = data.astype(dtype="i1", copy=True)
+        data = duck_array_ops.astype(data, dtype="i1", copy=True)
         var = Variable(dims, data, attrs, encoding)
     return var
 
```
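The motivation for routing through `duck_array_ops.astype` is that `data` may be a duck array (e.g. a dask or sparse array) rather than a `numpy.ndarray`, and xarray's helper dispatches to the appropriate implementation. A simplified stand-in for that dispatch, not the actual `duck_array_ops` implementation:

```python
import numpy as np

def astype(data, **kwargs):
    # Defer to the array's own astype, which duck array types (dask, sparse,
    # ...) implement with numpy-compatible semantics.
    return data.astype(**kwargs)

print(astype(np.array([True, False, True]), dtype="i1", copy=True))  # [1 0 1]
```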
